From b22f6e2c6d016a58b3dfbfc3ed4fed4b74533f1b Mon Sep 17 00:00:00 2001 From: Josh Rosenberg <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 10:22:10 -0400 Subject: [PATCH 1/9] Initial work on critical sections for fast sequences while developing test for regressions --- Include/internal/pycore_critical_section.h | 22 ++++++++++++++++++++++ Objects/unicodeobject.c | 8 +++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h index 573d09a09683ef..7e9790136db58d 100644 --- a/Include/internal/pycore_critical_section.h +++ b/Include/internal/pycore_critical_section.h @@ -7,6 +7,7 @@ #include "pycore_lock.h" // PyMutex #include "pycore_pystate.h" // _PyThreadState_GET() +#include "listobject.h" // PyList_CheckExact #include #ifdef __cplusplus @@ -108,6 +109,25 @@ extern "C" { _PyCriticalSection2_End(&_cs2); \ } +// Specialized version of critical section locking called to safely use +// PySequence_Fast APIs under nogil +// For performance, the argument *to* PySequence_Fast is provided to the +// macro, not the *result* of PySequence_Fast (which would require an extra +// test to determine if the lock must be held) +# define Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(original) \ + { \ + PyObject *_orig_seq = _PyObject_CAST(original); \ + const bool _should_lock_cs = PyList_CheckExact(_orig_seq); \ + _PyCriticalSection _cs; \ + if (_should_lock_cs) \ + _PyCriticalSection_Begin(&_cs, \ + &_orig_seq->ob_mutex) \ + +# define Py_END_CRITICAL_SECTION_SEQUENCE_FAST() \ + if (_should_lock_cs) \ + _PyCriticalSection_End(&_cs); \ + } + // Asserts that the mutex is locked. The mutex must be held by the // top-most critical section otherwise there's the possibility // that the mutex would be swalled out in some code paths. @@ -137,6 +157,8 @@ extern "C" { # define Py_END_CRITICAL_SECTION() # define Py_BEGIN_CRITICAL_SECTION2(a, b) # define Py_END_CRITICAL_SECTION2() +# define Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(original) +# define Py_END_CRITICAL_SECTION_SEQUENCE_FAST() # define _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(mutex) # define _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(op) #endif /* !Py_GIL_DISABLED */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 057b417074ebea..480b6713905c70 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -44,6 +44,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "pycore_bytesobject.h" // _PyBytes_Repeat() #include "pycore_ceval.h" // _PyEval_GetBuiltin() #include "pycore_codecs.h" // _PyCodec_Lookup() +#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST #include "pycore_format.h" // F_LJUST #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_interp.h" // PyInterpreterState.fs_codec @@ -9559,13 +9560,14 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) return NULL; } - /* NOTE: the following code can't call back into Python code, - * so we are sure that fseq won't be mutated. - */ + Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq); items = PySequence_Fast_ITEMS(fseq); seqlen = PySequence_Fast_GET_SIZE(fseq); res = _PyUnicode_JoinArray(separator, items, seqlen); + + Py_END_CRITICAL_SECTION_SEQUENCE_FAST(); + Py_DECREF(fseq); return res; } From 2b4fe199c77df4acad6525234142a9f6c27207c3 Mon Sep 17 00:00:00 2001 From: Josh Rosenberg <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 10:50:09 -0400 Subject: [PATCH 2/9] Test for efficacy of fast sequence locking macro on str.join, both tests segfault or die with assertion failures >95% of the time without locking macro, pass reliably with locking macro --- Include/internal/pycore_critical_section.h | 2 +- Lib/test/test_free_threading/test_str.py | 74 ++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 Lib/test/test_free_threading/test_str.py diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h index 7e9790136db58d..a77cba8b060eb1 100644 --- a/Include/internal/pycore_critical_section.h +++ b/Include/internal/pycore_critical_section.h @@ -114,7 +114,7 @@ extern "C" { // For performance, the argument *to* PySequence_Fast is provided to the // macro, not the *result* of PySequence_Fast (which would require an extra // test to determine if the lock must be held) -# define Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(original) \ +# define Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(original) \ { \ PyObject *_orig_seq = _PyObject_CAST(original); \ const bool _should_lock_cs = PyList_CheckExact(_orig_seq); \ diff --git a/Lib/test/test_free_threading/test_str.py b/Lib/test/test_free_threading/test_str.py new file mode 100644 index 00000000000000..3e965c75c0264c --- /dev/null +++ b/Lib/test/test_free_threading/test_str.py @@ -0,0 +1,74 @@ +import sys +import unittest + +from itertools import cycle +from threading import Thread +from unittest import TestCase + +from test.support import threading_helper + +@threading_helper.requires_working_threading() +class TestStr(TestCase): + def test_racing_join_extend(self): + '''Test joining a string being extended by another thread''' + l = [] + OBJECT_COUNT = 100_000 + + def writer_func(): + l.extend(map(str, range(OBJECT_COUNT, OBJECT_COUNT*2))) + + def reader_func(): + while True: + count = len(l) + ''.join(l) + if count == OBJECT_COUNT: + break + + writer = Thread(target=writer_func) + readers = [] + for x in range(30): + reader = Thread(target=reader_func) + readers.append(reader) + reader.start() + + writer.start() + writer.join() + for reader in readers: + reader.join() + + def test_racing_join_replace(self): + ''' + Test joining a string of characters being replaced with ephemeral + strings by another thread. + ''' + l = [*'abcdefg'] + MAX_ORDINAL = 10_000 + + def writer_func(): + for i, c in zip(cycle(range(len(l))), + map(chr, range(128, MAX_ORDINAL))): + l[i] = c + del l[:] # Empty list to tell readers to exit + + def reader_func(): + while True: + empty = not l + ''.join(l) + if empty: + break + + writer = Thread(target=writer_func) + readers = [] + for x in range(30): + reader = Thread(target=reader_func) + readers.append(reader) + reader.start() + + writer.start() + writer.join() + for reader in readers: + reader.join() + + +if __name__ == "__main__": + unittest.main() From 536defc57813d10078e5ee49f13e6836031a136d Mon Sep 17 00:00:00 2001 From: Josh Rosenberg <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 11:36:18 -0400 Subject: [PATCH 3/9] Blurb for change --- .../next/C API/2024-05-21-11-35-11.gh-issue-119247.U6n6mh.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/C API/2024-05-21-11-35-11.gh-issue-119247.U6n6mh.rst diff --git a/Misc/NEWS.d/next/C API/2024-05-21-11-35-11.gh-issue-119247.U6n6mh.rst b/Misc/NEWS.d/next/C API/2024-05-21-11-35-11.gh-issue-119247.U6n6mh.rst new file mode 100644 index 00000000000000..3b2cdc8cf2dc5c --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-05-21-11-35-11.gh-issue-119247.U6n6mh.rst @@ -0,0 +1,4 @@ +Added ``Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST`` and +``Py_END_CRITICAL_SECTION_SEQUENCE_FAST`` macros to make it possible to use +PySequence_Fast APIs safely when free-threaded, and update str.join to work +without the GIL using them. From 0475c8eb410257559caee1a15664bacffb14528b Mon Sep 17 00:00:00 2001 From: "Josh {*()} Rosenberg" <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 18:16:58 +0000 Subject: [PATCH 4/9] Update Include/internal/pycore_critical_section.h Use braces to restrict scope of conditional locking Co-authored-by: Sam Gross --- Include/internal/pycore_critical_section.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h index a77cba8b060eb1..a08f434e390f9b 100644 --- a/Include/internal/pycore_critical_section.h +++ b/Include/internal/pycore_critical_section.h @@ -124,8 +124,9 @@ extern "C" { &_orig_seq->ob_mutex) \ # define Py_END_CRITICAL_SECTION_SEQUENCE_FAST() \ - if (_should_lock_cs) \ + if (_should_lock_cs) { \ _PyCriticalSection_End(&_cs); \ + } \ } // Asserts that the mutex is locked. The mutex must be held by the From 1f3cec034c671148f30bc6a29e4b9430175a4c7e Mon Sep 17 00:00:00 2001 From: "Josh {*()} Rosenberg" <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 18:36:12 +0000 Subject: [PATCH 5/9] Tweak comments per review Co-authored-by: Sam Gross --- Include/internal/pycore_critical_section.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h index a08f434e390f9b..f9b32b6f0dfd97 100644 --- a/Include/internal/pycore_critical_section.h +++ b/Include/internal/pycore_critical_section.h @@ -109,11 +109,11 @@ extern "C" { _PyCriticalSection2_End(&_cs2); \ } -// Specialized version of critical section locking called to safely use -// PySequence_Fast APIs under nogil -// For performance, the argument *to* PySequence_Fast is provided to the -// macro, not the *result* of PySequence_Fast (which would require an extra -// test to determine if the lock must be held) +// Specialized version of critical section locking to safely use +// PySequence_Fast APIs without the GIL. For performance, the argument *to* +// PySequence_Fast() is provided to the macro, not the *result* of +// PySequence_Fast(), which would require an extra test to determine if the +// lock must be acquired. # define Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(original) \ { \ PyObject *_orig_seq = _PyObject_CAST(original); \ From 707852da197c4ac42778a78c7066dac2efc18372 Mon Sep 17 00:00:00 2001 From: "Josh {*()} Rosenberg" <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 18:38:36 +0000 Subject: [PATCH 6/9] Use braced if per review Co-authored-by: Sam Gross --- Include/internal/pycore_critical_section.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h index f9b32b6f0dfd97..d821665a69cb5f 100644 --- a/Include/internal/pycore_critical_section.h +++ b/Include/internal/pycore_critical_section.h @@ -119,9 +119,9 @@ extern "C" { PyObject *_orig_seq = _PyObject_CAST(original); \ const bool _should_lock_cs = PyList_CheckExact(_orig_seq); \ _PyCriticalSection _cs; \ - if (_should_lock_cs) \ - _PyCriticalSection_Begin(&_cs, \ - &_orig_seq->ob_mutex) \ + if (_should_lock_cs) { \ + _PyCriticalSection_Begin(&_cs, &_orig_seq->ob_mutex); \ + } # define Py_END_CRITICAL_SECTION_SEQUENCE_FAST() \ if (_should_lock_cs) { \ From 66f0e6ceb6033c20c5f6a33523eb038802923bac Mon Sep 17 00:00:00 2001 From: Josh Rosenberg <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 14:55:15 -0400 Subject: [PATCH 7/9] Per review, refactor extend test to extend multiple times, and reduce total time to run tests (both tests combine to less than 200 ms in debug build on my laptop) --- Lib/test/test_free_threading/test_str.py | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_free_threading/test_str.py b/Lib/test/test_free_threading/test_str.py index 3e965c75c0264c..b1bd9b43defb6c 100644 --- a/Lib/test/test_free_threading/test_str.py +++ b/Lib/test/test_free_threading/test_str.py @@ -2,7 +2,7 @@ import unittest from itertools import cycle -from threading import Thread +from threading import Event, Thread from unittest import TestCase from test.support import threading_helper @@ -12,21 +12,20 @@ class TestStr(TestCase): def test_racing_join_extend(self): '''Test joining a string being extended by another thread''' l = [] - OBJECT_COUNT = 100_000 - + ITERS = 100 + READERS = 10 + done_event = Event() def writer_func(): - l.extend(map(str, range(OBJECT_COUNT, OBJECT_COUNT*2))) - + for i in range(ITERS): + l.extend(map(str, range(i))) + l.clear() + done_event.set() def reader_func(): - while True: - count = len(l) + while not done_event.is_set(): ''.join(l) - if count == OBJECT_COUNT: - break - writer = Thread(target=writer_func) readers = [] - for x in range(30): + for x in range(READERS): reader = Thread(target=reader_func) readers.append(reader) reader.start() @@ -42,7 +41,8 @@ def test_racing_join_replace(self): strings by another thread. ''' l = [*'abcdefg'] - MAX_ORDINAL = 10_000 + MAX_ORDINAL = 1_000 + READERS = 20 def writer_func(): for i, c in zip(cycle(range(len(l))), @@ -59,7 +59,7 @@ def reader_func(): writer = Thread(target=writer_func) readers = [] - for x in range(30): + for x in range(READERS): reader = Thread(target=reader_func) readers.append(reader) reader.start() From 515ee56bd2bb4f4949f51a24bb4a456cd7dae03a Mon Sep 17 00:00:00 2001 From: Josh Rosenberg <26495692+MojoVampire@users.noreply.github.com> Date: Tue, 21 May 2024 14:56:53 -0400 Subject: [PATCH 8/9] Remove listobject.h include per review (all users of pycore_critical_section include Python.h which provides listobject.h) --- Include/internal/pycore_critical_section.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h index d821665a69cb5f..7bebcdb67c7169 100644 --- a/Include/internal/pycore_critical_section.h +++ b/Include/internal/pycore_critical_section.h @@ -7,7 +7,6 @@ #include "pycore_lock.h" // PyMutex #include "pycore_pystate.h" // _PyThreadState_GET() -#include "listobject.h" // PyList_CheckExact #include #ifdef __cplusplus From 254a5a51721e6ae55f85c5c4f412f7b4d2190783 Mon Sep 17 00:00:00 2001 From: Josh Rosenberg <26495692+MojoVampire@users.noreply.github.com> Date: Wed, 22 May 2024 10:30:36 -0400 Subject: [PATCH 9/9] Reduce readers in test_racing_join_replace to 10, use Event to synchronize, and perform multiple joins per loop to increase chance of repro without synchronization --- Lib/test/test_free_threading/test_str.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_free_threading/test_str.py b/Lib/test/test_free_threading/test_str.py index b1bd9b43defb6c..f5e93b7de0aa9b 100644 --- a/Lib/test/test_free_threading/test_str.py +++ b/Lib/test/test_free_threading/test_str.py @@ -42,20 +42,21 @@ def test_racing_join_replace(self): ''' l = [*'abcdefg'] MAX_ORDINAL = 1_000 - READERS = 20 + READERS = 10 + done_event = Event() def writer_func(): for i, c in zip(cycle(range(len(l))), map(chr, range(128, MAX_ORDINAL))): l[i] = c - del l[:] # Empty list to tell readers to exit + done_event.set() def reader_func(): - while True: - empty = not l + while not done_event.is_set(): + ''.join(l) + ''.join(l) + ''.join(l) ''.join(l) - if empty: - break writer = Thread(target=writer_func) readers = []