From 1dd4bb65fe57b3f42ec0e6d0a0873cf90f7ab9ae Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 14 Jan 2025 11:45:05 -0800 Subject: [PATCH 01/10] Use prefetch in GC mark alive phase. --- ...-01-22-14-22-34.gh-issue-129201.wiZzEb.rst | 4 + Python/gc_free_threading.c | 250 ++++++++++++++---- 2 files changed, 197 insertions(+), 57 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst new file mode 100644 index 00000000000000..d424ce10dbe840 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst @@ -0,0 +1,4 @@ +The free-threaded version of the cyclic garbage collector has been optimized +to use CPU prefetch instructions during the collection. On large object +graphs, this can reduce collection times by making it more likely that data +is in the CPU cache when its needed. diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index d1023d9351086f..bf1087be9254fe 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -21,6 +21,9 @@ // enable the "mark alive" pass of GC #define GC_ENABLE_MARK_ALIVE 1 +// if true, enable the use of "prefetch" CPU instructions +#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1 + // include additional roots in "mark alive" pass #define GC_MARK_ALIVE_EXTRA_ROOTS 1 @@ -464,29 +467,75 @@ gc_maybe_untrack(PyObject *op) } #ifdef GC_ENABLE_MARK_ALIVE + +// prefetch buffer and stack ////////////////////////////////// + +// The buffer is a circular FIFO queue of PyObject pointers. We take +// care to not dereference these pointers until they are taken out of +// the buffer. A prefetch CPU instruction is issued when a pointer is +// put into the buffer. If all is working as expected, there will be +// enough time between the enqueue and dequeue so that the needed memory +// for the object, most importantly ob_gc_bits and ob_type words, will +// already be in the CPU cache. +#define BUFFER_SIZE 256 +#define BUFFER_HI 16 +#define BUFFER_LO 8 + +#if !(defined(__GNUC__) || defined(__clang__)) +#undef GC_ENABLE_PREFETCH_INSTRUCTIONS +#endif + +#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS +#define prefetch(ptr) __builtin_prefetch(ptr, 1, 3) +#else +#define prefetch(ptr) +#endif + +struct gc_mark_args { + Py_ssize_t enqueued; + Py_ssize_t dequeued; + _PyObjectStack stack; + PyObject *buffer[BUFFER_SIZE]; +}; + +// Called when we run out of space in the buffer. The object will be added +// to gc_mark_args.stack instead. static int -mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) +gc_mark_stack_push(_PyObjectStack *ms, PyObject *op) { - if (op == NULL) { - return 0; + if (_PyObjectStack_Push(ms, op) < 0) { + return -1; } - if (!_PyObject_GC_IS_TRACKED(op)) { + return 0; +} + +// Called when there is space in the buffer for the object. Add it to the end +// of the buffer and issue the prefetch instruction. +static inline void +gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args) +{ +#if Py_DEBUG + Py_ssize_t buf_used = args->enqueued - args->dequeued; + assert(buf_used < BUFFER_SIZE); +#endif + args->buffer[args->enqueued % BUFFER_SIZE] = op; + args->enqueued++; + prefetch(op); +} + +// Called when we find an object that needs to be marked alive (either from a +// root or from calling tp_traverse). 
+static int +gc_mark_enqueue(PyObject *op, struct gc_mark_args *args) +{ + assert(op != NULL); + if (args->enqueued - args->dequeued < BUFFER_SIZE) { + gc_mark_buffer_push(op, args); return 0; } - if (gc_is_alive(op)) { - return 0; // already visited this object - } - if (gc_maybe_untrack(op)) { - return 0; // was untracked, don't visit it - } - - // Need to call tp_traverse on this object. Add to stack and mark it - // alive so we don't traverse it a second time. - gc_set_alive(op); - if (_PyObjectStack_Push(stack, op) < 0) { - return -1; + else { + return gc_mark_stack_push(&args->stack, op); } - return 0; } static bool @@ -503,28 +552,60 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area, return true; } +static int +gc_mark_traverse_list(PyObject *self, void *args) +{ + PyListObject *list = (PyListObject *)self; + if (list->ob_item == NULL) { + return 0; + } + for (Py_ssize_t i = 0; i < Py_SIZE(list); i++) { + if (gc_mark_enqueue(list->ob_item[i], args) < 0) { + return -1; + } + } + return 0; +} + +static int +gc_mark_traverse_tuple(PyObject *self, void *args) +{ + _PyTuple_MaybeUntrack(self); + if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) { + return 0; + } + PyTupleObject *tuple = _PyTuple_CAST(self); + for (Py_ssize_t i = Py_SIZE(tuple); --i >= 0; ) { + PyObject *item = tuple->ob_item[i]; + if (item == NULL) { + continue; + } + if (gc_mark_enqueue(tuple->ob_item[i], args) < 0) { + return -1; + } + } + return 0; +} + static void gc_abort_mark_alive(PyInterpreterState *interp, struct collection_state *state, - _PyObjectStack *stack) + struct gc_mark_args *args) { // We failed to allocate memory for "stack" while doing the "mark // alive" phase. In that case, free the object stack and make sure // that no objects have the alive bit set. - _PyObjectStack_Clear(stack); + _PyObjectStack_Clear(&args->stack); gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base); } #ifdef GC_MARK_ALIVE_STACKS static int -gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) +gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref) { - // Note: we MUST check that it is deferred before checking the rest. - // Otherwise we might read into invalid memory due to non-deferred references - // being dead already. 
- if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) { + if (!PyStackRef_IsNull(stackref)) { PyObject *op = PyStackRef_AsPyObjectBorrow(stackref); - if (mark_alive_stack_push(op, stack) < 0) { + if (gc_mark_enqueue(op, args) < 0) { return -1; } } @@ -532,7 +613,7 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) } static int -gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack) +gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, struct gc_mark_args *args) { _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) { @@ -542,12 +623,12 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st } PyCodeObject *co = (PyCodeObject *)executable; - int max_stack = co->co_nlocalsplus + co->co_stacksize; - if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) { + int max_stack = co->co_nlocals; + if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) { return -1; } for (int i = 0; i < max_stack; i++) { - if (gc_visit_stackref_mark_alive(stack, f->localsplus[i]) < 0) { + if (gc_visit_stackref_mark_alive(args, f->localsplus[i]) < 0) { return -1; } } @@ -880,22 +961,73 @@ static int move_legacy_finalizer_reachable(struct collection_state *state); #ifdef GC_ENABLE_MARK_ALIVE -static int -propagate_alive_bits(_PyObjectStack *stack) + +static void +gc_mark_buffer_prime(struct gc_mark_args *args) { for (;;) { - PyObject *op = _PyObjectStack_Pop(stack); + Py_ssize_t buf_used = args->enqueued - args->dequeued; + if (buf_used >= BUFFER_HI) { + // When priming, don't fill the buffer since that would + // likely cause the stack to be used shortly after when it + // fills. We want to use the buffer as much as possible and + // so we only fill to BUFFER_HI, not BUFFER_SIZE. + return; + } + PyObject *op = _PyObjectStack_Pop(&args->stack); if (op == NULL) { break; } - assert(_PyObject_GC_IS_TRACKED(op)); - assert(gc_is_alive(op)); + gc_mark_buffer_push(op, args); + } +} + +static int +gc_propagate_alive(struct gc_mark_args *args) +{ + for (;;) { + Py_ssize_t buf_used = args->enqueued - args->dequeued; + if (buf_used <= BUFFER_LO) { + // The mark buffer is getting empty. If it's too empty + // then there will not be enough delay between issuing + // the prefetch vs when the object is actually accessed. + // Prime the buffer with object pointers from the stack, + // if there are any available. + gc_mark_buffer_prime(args); + if (args->enqueued == args->dequeued) { + return 0; // stack and buffer are both empty + } + } + PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE]; + args->dequeued++; + + if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) { + continue; + } + + if (gc_is_alive(op)) { + continue; // already visited this object + } + + // Need to call tp_traverse on this object. Mark it alive so we + // don't traverse it a second time. 
+ gc_set_alive(op); + traverseproc traverse = Py_TYPE(op)->tp_traverse; - if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) { + if (traverse == PyList_Type.tp_traverse) { + if (gc_mark_traverse_list(op, args) < 0) { + return -1; + } + } + else if (traverse == PyTuple_Type.tp_traverse) { + if (gc_mark_traverse_tuple(op, args) < 0) { + return -1; + } + } + else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) { return -1; } } - return 0; } // Using tp_traverse, mark everything reachable from known root objects @@ -915,48 +1047,52 @@ propagate_alive_bits(_PyObjectStack *stack) // // Returns -1 on failure (out of memory). static int -mark_alive_from_roots(PyInterpreterState *interp, - struct collection_state *state) +gc_mark_alive_from_roots(PyInterpreterState *interp, + struct collection_state *state) { #ifdef GC_DEBUG // Check that all objects don't have alive bit set gc_visit_heaps(interp, &validate_alive_bits, &state->base); #endif - _PyObjectStack stack = { NULL }; - - #define STACK_PUSH(op) \ - if (mark_alive_stack_push(op, &stack) < 0) { \ - gc_abort_mark_alive(interp, state, &stack); \ - return -1; \ + struct gc_mark_args mark_args = { 0 }; + + #define MARK_ENQUEUE(op) \ + if (op != NULL ) { \ + if (gc_mark_enqueue(op, &mark_args) < 0) { \ + gc_abort_mark_alive(interp, state, &mark_args); \ + return -1; \ + } \ } - STACK_PUSH(interp->sysdict); + MARK_ENQUEUE(interp->sysdict); #ifdef GC_MARK_ALIVE_EXTRA_ROOTS - STACK_PUSH(interp->builtins); - STACK_PUSH(interp->dict); + MARK_ENQUEUE(interp->builtins); + MARK_ENQUEUE(interp->dict); struct types_state *types = &interp->types; for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) { - STACK_PUSH(types->builtins.initialized[i].tp_dict); - STACK_PUSH(types->builtins.initialized[i].tp_subclasses); + MARK_ENQUEUE(types->builtins.initialized[i].tp_dict); + MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses); } for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) { - STACK_PUSH(types->for_extensions.initialized[i].tp_dict); - STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses); + MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict); + MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses); } #endif #ifdef GC_MARK_ALIVE_STACKS - if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) { - gc_abort_mark_alive(interp, state, &stack); + if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) { + gc_abort_mark_alive(interp, state, &mark_args); return -1; } #endif - #undef STACK_PUSH + #undef MARK_ENQUEUE // Use tp_traverse to find everything reachable from roots. - if (propagate_alive_bits(&stack) < 0) { - gc_abort_mark_alive(interp, state, &stack); + if (gc_propagate_alive(&mark_args) < 0) { + gc_abort_mark_alive(interp, state, &mark_args); return -1; } + assert(mark_args.stack.head == NULL); + return 0; } #endif // GC_ENABLE_MARK_ALIVE @@ -1531,7 +1667,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, if (!state->gcstate->freeze_active) { // Mark objects reachable from known roots as "alive". These will // be ignored for rest of the GC pass. - int err = mark_alive_from_roots(interp, state); + int err = gc_mark_alive_from_roots(interp, state); if (err < 0) { _PyEval_StartTheWorld(interp); PyErr_NoMemory(); From 1b4e8c39e99ce39b39c749010615fc799dc19077 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 22 Jan 2025 15:04:39 -0800 Subject: [PATCH 02/10] Add prefetch() macro that might work on MSVC. 
--- Python/gc_free_threading.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index bf1087be9254fe..865a415bf3a065 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -481,12 +481,23 @@ gc_maybe_untrack(PyObject *op) #define BUFFER_HI 16 #define BUFFER_LO 8 -#if !(defined(__GNUC__) || defined(__clang__)) -#undef GC_ENABLE_PREFETCH_INSTRUCTIONS +#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS +#if (defined(__GNUC__) || defined(__clang__)) +#define USE_BUILTIN_PREFETCH 1 +#elif (defined(__cplusplus) && (__cplusplus >= 201103)) +#if defined(_MSC_VER) +#include +#else +#include +#endif +#define USE_MM_PREFETCH 1 #endif +#endif // GC_ENABLE_PREFETCH_INSTRUCTIONS -#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS +#if defined(USE_BUILTIN_PREFETCH) #define prefetch(ptr) __builtin_prefetch(ptr, 1, 3) +#elif defined(USE_MM_PREFETCH) +#define prefetch(ptr) __mm_prefetch(ptr, _MM_HINT_T0) #else #define prefetch(ptr) #endif @@ -514,7 +525,7 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op) static inline void gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args) { -#if Py_DEBUG +#ifdef Py_DEBUG Py_ssize_t buf_used = args->enqueued - args->dequeued; assert(buf_used < BUFFER_SIZE); #endif From 9fbfd43a724c49ea52600138ad2732ef6b120198 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 22 Jan 2025 20:12:28 -0800 Subject: [PATCH 03/10] Improve portability of prefetch macros. --- Python/gc_free_threading.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 865a415bf3a065..17172daae714ba 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -481,23 +481,23 @@ gc_maybe_untrack(PyObject *op) #define BUFFER_HI 16 #define BUFFER_LO 8 -#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS -#if (defined(__GNUC__) || defined(__clang__)) -#define USE_BUILTIN_PREFETCH 1 -#elif (defined(__cplusplus) && (__cplusplus >= 201103)) -#if defined(_MSC_VER) -#include +#if defined(__GNUC__) || defined(__clang__) + #define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 1, 3) + #define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 1, 2) +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC) + #include + #define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) + #define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) +#elif defined(__aarch64__) + #define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) + #define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) #else -#include -#endif -#define USE_MM_PREFETCH 1 + #define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ + #define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ #endif -#endif // GC_ENABLE_PREFETCH_INSTRUCTIONS -#if defined(USE_BUILTIN_PREFETCH) -#define prefetch(ptr) __builtin_prefetch(ptr, 1, 3) -#elif defined(USE_MM_PREFETCH) -#define prefetch(ptr) __mm_prefetch(ptr, _MM_HINT_T0) +#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS +#define prefetch(ptr) PREFETCH_L2(ptr) #else #define prefetch(ptr) #endif From 7f51104490216a377c749e29bfe125d837e49d3e Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 23 Jan 2025 20:33:07 -0800 Subject: [PATCH 04/10] Implement "spans" for marking phase. 
When traversing a list or tuple, use a "span" if the buffer can't hold all the items from the collection. This reduces the size of the object stack needed if large collections are encountered. It also helps keeps the buffer size optimal for prefetching. --- Python/gc_free_threading.c | 168 +++++++++++++++++++++++++++---------- 1 file changed, 123 insertions(+), 45 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 17172daae714ba..59e3c4e5c1387e 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -482,8 +482,8 @@ gc_maybe_untrack(PyObject *op) #define BUFFER_LO 8 #if defined(__GNUC__) || defined(__clang__) - #define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 1, 3) - #define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 1, 2) + #define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 0, 3) + #define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 0, 2) #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC) #include #define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) @@ -497,17 +497,30 @@ gc_maybe_untrack(PyObject *op) #endif #ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS -#define prefetch(ptr) PREFETCH_L2(ptr) +#define prefetch(ptr) PREFETCH_L1(ptr) #else #define prefetch(ptr) #endif -struct gc_mark_args { +// a contigous sequence of PyObject pointers +typedef struct { + PyObject **start; + PyObject **end; +} gc_span_t; + +typedef struct { + Py_ssize_t size; + Py_ssize_t capacity; + gc_span_t *stack; +} gc_span_stack_t; + +typedef struct { Py_ssize_t enqueued; Py_ssize_t dequeued; _PyObjectStack stack; + gc_span_stack_t spans; PyObject *buffer[BUFFER_SIZE]; -}; +} gc_mark_args_t; // Called when we run out of space in the buffer. The object will be added // to gc_mark_args.stack instead. @@ -520,24 +533,45 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op) return 0; } +static int +gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end) +{ + if (ss->size >= ss->capacity) { + if (ss->capacity == 0) { + ss->capacity = 256; + } + else { + ss->capacity *= 2; + } + ss->stack = (gc_span_t *)PyMem_Realloc(ss->stack, ss->capacity * sizeof(gc_span_t)); + if (ss->stack == NULL) { + return -1; + } + } + ss->stack[ss->size].start = start; + ss->stack[ss->size].end = end; + ss->size++; + return 0; +} + // Called when there is space in the buffer for the object. Add it to the end // of the buffer and issue the prefetch instruction. -static inline void -gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args) +static void +gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args) { #ifdef Py_DEBUG Py_ssize_t buf_used = args->enqueued - args->dequeued; assert(buf_used < BUFFER_SIZE); #endif + prefetch(op); args->buffer[args->enqueued % BUFFER_SIZE] = op; args->enqueued++; - prefetch(op); } // Called when we find an object that needs to be marked alive (either from a // root or from calling tp_traverse). 
static int -gc_mark_enqueue(PyObject *op, struct gc_mark_args *args) +gc_mark_enqueue(PyObject *op, gc_mark_args_t *args) { assert(op != NULL); if (args->enqueued - args->dequeued < BUFFER_SIZE) { @@ -549,6 +583,25 @@ gc_mark_enqueue(PyObject *op, struct gc_mark_args *args) } } +static int +gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args) +{ + Py_ssize_t used = args->enqueued - args->dequeued; + Py_ssize_t free = BUFFER_SIZE - used; + if (free > size) { + for (Py_ssize_t i = 0; i < size; i++) { + gc_mark_buffer_push(item[i], args); + } + } + else { + PyObject **end = &item[size]; + if (gc_mark_span_push(&args->spans, item, end) < 0) { + return -1; + } + } + return 0; +} + static bool gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area, void *block, size_t block_size, void *args) @@ -570,10 +623,8 @@ gc_mark_traverse_list(PyObject *self, void *args) if (list->ob_item == NULL) { return 0; } - for (Py_ssize_t i = 0; i < Py_SIZE(list); i++) { - if (gc_mark_enqueue(list->ob_item[i], args) < 0) { - return -1; - } + if (gc_mark_enqueue_span(list->ob_item, PyList_GET_SIZE(list), args) < 0) { + return -1; } return 0; } @@ -586,14 +637,8 @@ gc_mark_traverse_tuple(PyObject *self, void *args) return 0; } PyTupleObject *tuple = _PyTuple_CAST(self); - for (Py_ssize_t i = Py_SIZE(tuple); --i >= 0; ) { - PyObject *item = tuple->ob_item[i]; - if (item == NULL) { - continue; - } - if (gc_mark_enqueue(tuple->ob_item[i], args) < 0) { - return -1; - } + if (gc_mark_enqueue_span(tuple->ob_item, Py_SIZE(tuple), args) < 0) { + return -1; } return 0; } @@ -601,18 +646,21 @@ gc_mark_traverse_tuple(PyObject *self, void *args) static void gc_abort_mark_alive(PyInterpreterState *interp, struct collection_state *state, - struct gc_mark_args *args) + gc_mark_args_t *args) { // We failed to allocate memory for "stack" while doing the "mark // alive" phase. In that case, free the object stack and make sure // that no objects have the alive bit set. _PyObjectStack_Clear(&args->stack); + if (args->spans.stack != NULL) { + PyMem_Free(args->spans.stack); + } gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base); } #ifdef GC_MARK_ALIVE_STACKS static int -gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref) +gc_visit_stackref_mark_alive(gc_mark_args_t *args, _PyStackRef stackref) { if (!PyStackRef_IsNull(stackref)) { PyObject *op = PyStackRef_AsPyObjectBorrow(stackref); @@ -624,7 +672,7 @@ gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref) } static int -gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, struct gc_mark_args *args) +gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, gc_mark_args_t *args) { _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) { @@ -974,39 +1022,65 @@ move_legacy_finalizer_reachable(struct collection_state *state); #ifdef GC_ENABLE_MARK_ALIVE static void -gc_mark_buffer_prime(struct gc_mark_args *args) -{ - for (;;) { - Py_ssize_t buf_used = args->enqueued - args->dequeued; - if (buf_used >= BUFFER_HI) { - // When priming, don't fill the buffer since that would - // likely cause the stack to be used shortly after when it - // fills. We want to use the buffer as much as possible and - // so we only fill to BUFFER_HI, not BUFFER_SIZE. 
- return; +gc_prime_from_spans(gc_mark_args_t *args) +{ + Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued); + assert(space >= 1); // needed to make progress + gc_span_t entry = args->spans.stack[--args->spans.size]; + while (entry.start < entry.end) { + PyObject *op = *entry.start; + if (op != NULL) { + if (space > 0) { + gc_mark_buffer_push(op, args); + space--; + } + else { + // no more space in buffer, push remaining + gc_mark_span_push(&args->spans, entry.start, entry.end); + break; + } } - PyObject *op = _PyObjectStack_Pop(&args->stack); - if (op == NULL) { - break; + entry.start++; + } +} + +static void +gc_prime_buffer(gc_mark_args_t *args) +{ + if (args->spans.size > 0) { + gc_prime_from_spans(args); + } + else { + // When priming, don't fill the buffer too full since that would + // likely cause the stack to be used shortly after when it + // fills. We want to use the buffer as much as possible and so + // we only fill to BUFFER_HI, not BUFFER_SIZE. + Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued); + while (space > 0) { + PyObject *op = _PyObjectStack_Pop(&args->stack); + if (op == NULL) { + return; + } + gc_mark_buffer_push(op, args); + space--; } - gc_mark_buffer_push(op, args); } } static int -gc_propagate_alive(struct gc_mark_args *args) +gc_propagate_alive(gc_mark_args_t *args) { for (;;) { Py_ssize_t buf_used = args->enqueued - args->dequeued; if (buf_used <= BUFFER_LO) { // The mark buffer is getting empty. If it's too empty // then there will not be enough delay between issuing - // the prefetch vs when the object is actually accessed. - // Prime the buffer with object pointers from the stack, - // if there are any available. - gc_mark_buffer_prime(args); + // the prefetch and when the object is actually accessed. + // Prime the buffer with object pointers from the stack or + // from the spans, if there are any available. + gc_prime_buffer(args); if (args->enqueued == args->dequeued) { - return 0; // stack and buffer are both empty + return 0; // buffer empty, done } } PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE]; @@ -1065,7 +1139,7 @@ gc_mark_alive_from_roots(PyInterpreterState *interp, // Check that all objects don't have alive bit set gc_visit_heaps(interp, &validate_alive_bits, &state->base); #endif - struct gc_mark_args mark_args = { 0 }; + gc_mark_args_t mark_args = { 0 }; #define MARK_ENQUEUE(op) \ if (op != NULL ) { \ @@ -1102,6 +1176,10 @@ gc_mark_alive_from_roots(PyInterpreterState *interp, return -1; } + assert(mark_args.spans.size == 0); + if (mark_args.spans.stack != NULL) { + PyMem_Free(mark_args.spans.stack); + } assert(mark_args.stack.head == NULL); return 0; From 041b2e4e7afb387f1356f57fc234c76c8a93fd2d Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Fri, 24 Jan 2025 13:09:11 -0800 Subject: [PATCH 05/10] Fix bug in "span" enqueue logic. It's possible for lists or tuples to have a NULL item. Handle that in the case that all item elements fit into the buffer. 
--- Python/gc_free_threading.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 59e3c4e5c1387e..bc778a08cdcbdb 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -590,7 +590,11 @@ gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args) Py_ssize_t free = BUFFER_SIZE - used; if (free > size) { for (Py_ssize_t i = 0; i < size; i++) { - gc_mark_buffer_push(item[i], args); + PyObject *op = item[i]; + if (op == NULL) { + continue; + } + gc_mark_buffer_push(op, args); } } else { From 86a5c95e95a5b36c1cdd6a016ca647be1c23f1c8 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 27 Jan 2025 11:38:04 -0800 Subject: [PATCH 06/10] Add some asserts, minor code cleanup. --- Python/gc_free_threading.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index bc778a08cdcbdb..ad5816cb9b0c24 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -502,7 +502,7 @@ gc_maybe_untrack(PyObject *op) #define prefetch(ptr) #endif -// a contigous sequence of PyObject pointers +// a contigous sequence of PyObject pointers, can contain NULLs typedef struct { PyObject **start; PyObject **end; @@ -548,6 +548,7 @@ gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end) return -1; } } + assert(end > start); ss->stack[ss->size].start = start; ss->stack[ss->size].end = end; ss->size++; @@ -560,12 +561,12 @@ static void gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args) { #ifdef Py_DEBUG - Py_ssize_t buf_used = args->enqueued - args->dequeued; - assert(buf_used < BUFFER_SIZE); + Py_ssize_t buf_used = args->enqueued - args->dequeued; + assert(buf_used < BUFFER_SIZE); #endif - prefetch(op); - args->buffer[args->enqueued % BUFFER_SIZE] = op; - args->enqueued++; + prefetch(op); + args->buffer[args->enqueued % BUFFER_SIZE] = op; + args->enqueued++; } // Called when we find an object that needs to be marked alive (either from a @@ -598,6 +599,7 @@ gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args) } } else { + assert(size > 0); PyObject **end = &item[size]; if (gc_mark_span_push(&args->spans, item, end) < 0) { return -1; From e12dd2eec943c84b0693d7d87e97e8ec88ab7884 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 30 Jan 2025 20:43:15 -0800 Subject: [PATCH 07/10] Fix bug in tuple untrack logic. Need to clear the "alive" bit. --- Python/gc_free_threading.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 8775f7b38d16e6..89eb5754f3276d 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -648,6 +648,7 @@ gc_mark_traverse_tuple(PyObject *self, void *args) { _PyTuple_MaybeUntrack(self); if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) { + gc_clear_alive(self); return 0; } PyTupleObject *tuple = _PyTuple_CAST(self); From 15ada78c1663523238b4d624d3979822b668ff25 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Fri, 31 Jan 2025 20:40:08 -0800 Subject: [PATCH 08/10] Improve "prefetch" instruction macros. 
--- Python/gc_free_threading.c | 53 ++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 89eb5754f3276d..88193469cb3cff 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -489,25 +489,56 @@ gc_maybe_untrack(PyObject *op) #define BUFFER_HI 16 #define BUFFER_LO 8 +// Prefetch intructions will fetch the line of data from memory that +// contains the byte specified with the source operand to a location in +// the cache hierarchy specified by a locality hint. The instruction +// is only a hint and the CPU is free to ignore it. Instructions and +// behaviour are CPU specific but the definitions of locality hints +// below are mostly consistent. +// +// * T0 (temporal data) prefetch data into all levels of the cache hierarchy. +// +// * T1 (temporal data with respect to first level cache) prefetch data into +// level 2 cache and higher. +// +// * T2 (temporal data with respect to second level cache) prefetch data into +// level 3 cache and higher, or an implementation-specific choice. +// +// * NTA (non-temporal data with respect to all cache levels) prefetch data into +// non-temporal cache structure and into a location close to the processor, +// minimizing cache pollution. + #if defined(__GNUC__) || defined(__clang__) - #define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 0, 3) - #define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 0, 2) + #define PREFETCH_T0(ptr) __builtin_prefetch(ptr, 0, 3) + #define PREFETCH_T1(ptr) __builtin_prefetch(ptr, 0, 2) + #define PREFETCH_T2(ptr) __builtin_prefetch(ptr, 0, 1) + #define PREFETCH_NTA(ptr) __builtin_prefetch(ptr, 0, 0) #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC) #include - #define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) - #define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) -#elif defined(__aarch64__) - #define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) - #define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #define PREFETCH_T0(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) + #define PREFETCH_T1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) + #define PREFETCH_T2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T2) + #define PREFETCH_NTA(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_NTA) +#elif defined (__aarch64__) + #define PREFETCH_T0(ptr) \ + do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) + #define PREFETCH_T1(ptr) \ + do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #define PREFETCH_T2(ptr) \ + do { __asm__ __volatile__("prfm pldl3keep, %0" ::"Q"(*(ptr))); } while (0) + #define PREFETCH_NTA(ptr) \ + do { __asm__ __volatile__("prfm pldl1strm, %0" ::"Q"(*(ptr))); } while (0) #else - #define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ - #define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #define PREFETCH_T0(ptr) do { (void)(ptr); } while (0) /* disabled */ + #define PREFETCH_T1(ptr) do { (void)(ptr); } while (0) /* disabled */ + #define PREFETCH_T2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #define PREFETCH_NTA(ptr) do { (void)(ptr); } while (0) /* disabled */ #endif #ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS -#define prefetch(ptr) PREFETCH_L1(ptr) + #define prefetch(ptr) PREFETCH_T1(ptr) #else -#define prefetch(ptr) + #define prefetch(ptr) 
#endif // a contigous sequence of PyObject pointers, can contain NULLs From abfc49a9cc6c3cd8e9583b9a9c09c9200d25a8d2 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Fri, 31 Jan 2025 14:17:45 -0800 Subject: [PATCH 09/10] Use of prefetching conditionally. Using the prefetch buffer only helps if there are enough objects. Use the long-lived count to decide if it's worth enabling. If not, fallback to the previous method of marking objects alive (dereference object pointers as we encounter them). Improve code by adding some additional helper functions, adding comments and general tidying. The buffer logic has been changed to use a mask for size rather than the % operator. Some other small optimizations that only help a little. --- ...-01-22-14-22-34.gh-issue-129201.wiZzEb.rst | 9 +- Python/gc_free_threading.c | 207 ++++++++++++++---- 2 files changed, 165 insertions(+), 51 deletions(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst index d424ce10dbe840..26737330716181 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst @@ -1,4 +1,5 @@ -The free-threaded version of the cyclic garbage collector has been optimized -to use CPU prefetch instructions during the collection. On large object -graphs, this can reduce collection times by making it more likely that data -is in the CPU cache when its needed. +The free-threaded version of the cyclic garbage collector has been optimized to +conditionally use CPU prefetch instructions during the collection. This can +reduce collection times by making it more likely that data is in the CPU cache +when it is needed. The prefetch instructions are enabled if the number of +long-lived objects (objects surviving a full collection) exceeds a threshold. diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 88193469cb3cff..4e6b0bcb3a005a 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -485,9 +485,10 @@ gc_maybe_untrack(PyObject *op) // enough time between the enqueue and dequeue so that the needed memory // for the object, most importantly ob_gc_bits and ob_type words, will // already be in the CPU cache. -#define BUFFER_SIZE 256 +#define BUFFER_SIZE 256 // this must be a power of 2 #define BUFFER_HI 16 #define BUFFER_LO 8 +#define BUFFER_MASK (BUFFER_SIZE - 1) // Prefetch intructions will fetch the line of data from memory that // contains the byte specified with the source operand to a location in @@ -554,15 +555,63 @@ typedef struct { } gc_span_stack_t; typedef struct { - Py_ssize_t enqueued; - Py_ssize_t dequeued; + unsigned int in; + unsigned int out; _PyObjectStack stack; gc_span_stack_t spans; PyObject *buffer[BUFFER_SIZE]; + bool use_prefetch; } gc_mark_args_t; -// Called when we run out of space in the buffer. The object will be added -// to gc_mark_args.stack instead. 
+ +// Returns number of entries in buffer +static inline unsigned int +gc_mark_buffer_len(gc_mark_args_t *args) +{ + return args->in - args->out; +} + +// Returns number of free entry slots in buffer +static inline unsigned int +gc_mark_buffer_avail(gc_mark_args_t *args) +{ + return BUFFER_SIZE - gc_mark_buffer_len(args); +} + +static inline bool +gc_mark_buffer_is_empty(gc_mark_args_t *args) +{ + return args->in == args->out; +} + +static inline bool +gc_mark_buffer_is_full(gc_mark_args_t *args) +{ + return gc_mark_buffer_len(args) == BUFFER_SIZE; +} + +static inline PyObject * +gc_mark_buffer_pop(gc_mark_args_t *args) +{ + assert(!gc_mark_buffer_is_empty(args)); + PyObject *op = args->buffer[args->out & BUFFER_MASK]; + args->out++; + return op; +} + +// Called when there is space in the buffer for the object. Issue the +// prefetch instruction and add it to the end of the buffer. +static inline void +gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args) +{ + assert(!gc_mark_buffer_is_full(args)); + prefetch(op); + args->buffer[args->in & BUFFER_MASK] = op; + args->in++; +} + +// Called when we run out of space in the buffer or if the prefetching +// is disabled. The object will be pushed on the gc_mark_args.stack. static int gc_mark_stack_push(_PyObjectStack *ms, PyObject *op) { @@ -575,6 +624,9 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op) static int gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end) { + if (start == end) { + return 0; + } if (ss->size >= ss->capacity) { if (ss->capacity == 0) { ss->capacity = 256; @@ -594,27 +646,36 @@ gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end) return 0; } -// Called when there is space in the buffer for the object. Add it to the end -// of the buffer and issue the prefetch instruction. -static void -gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args) +static int +gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args) { -#ifdef Py_DEBUG - Py_ssize_t buf_used = args->enqueued - args->dequeued; - assert(buf_used < BUFFER_SIZE); -#endif - prefetch(op); - args->buffer[args->enqueued % BUFFER_SIZE] = op; - args->enqueued++; + if (op == NULL) { + return 0; + } + if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) { + return 0; + } + if (gc_is_alive(op)) { + return 0; // already visited this object + } + if (gc_maybe_untrack(op)) { + return 0; // was untracked, don't visit it + } + + // Need to call tp_traverse on this object. Add to stack and mark it + // alive so we don't traverse it a second time. + gc_set_alive(op); + if (_PyObjectStack_Push(&args->stack, op) < 0) { + return -1; + } + return 0; } -// Called when we find an object that needs to be marked alive (either from a -// root or from calling tp_traverse). static int -gc_mark_enqueue(PyObject *op, gc_mark_args_t *args) +gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args) { assert(op != NULL); - if (args->enqueued - args->dequeued < BUFFER_SIZE) { + if (!gc_mark_buffer_is_full(args)) { gc_mark_buffer_push(op, args); return 0; } @@ -623,12 +684,31 @@ gc_mark_enqueue(PyObject *op, gc_mark_args_t *args) } } +// Called when we find an object that needs to be marked alive (either from a +// root or from calling tp_traverse). +static int +gc_mark_enqueue(PyObject *op, gc_mark_args_t *args) +{ + if (args->use_prefetch) { + return gc_mark_enqueue_buffer(op, args); + } + else { + return gc_mark_enqueue_no_buffer(op, args); + } +} + +// Called when we have a contigous sequence of PyObject pointers, either +// a tuple or list object. 
This will add the items to the buffer if there +// is space for them all otherwise push a new "span" on the span stack. Using +// spans has the advantage of not creating a deep _PyObjectStack stack when +// dealing with long sequences. Those sequences will be processed in smaller +// chunks by the gc_prime_from_spans() function. static int gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args) { - Py_ssize_t used = args->enqueued - args->dequeued; + Py_ssize_t used = gc_mark_buffer_len(args); Py_ssize_t free = BUFFER_SIZE - used; - if (free > size) { + if (free >= size) { for (Py_ssize_t i = 0; i < size; i++) { PyObject *op = item[i]; if (op == NULL) { @@ -694,9 +774,9 @@ gc_abort_mark_alive(PyInterpreterState *interp, struct collection_state *state, gc_mark_args_t *args) { - // We failed to allocate memory for "stack" while doing the "mark - // alive" phase. In that case, free the object stack and make sure - // that no objects have the alive bit set. + // We failed to allocate memory while doing the "mark alive" phase. + // In that case, free the memory used for marking state and make + // sure that no objects have the alive bit set. _PyObjectStack_Clear(&args->stack); if (args->spans.stack != NULL) { PyMem_Free(args->spans.stack); @@ -1089,24 +1169,26 @@ move_legacy_finalizer_reachable(struct collection_state *state); static void gc_prime_from_spans(gc_mark_args_t *args) { - Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued); - assert(space >= 1); // needed to make progress + Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args); + // there should always be at least this amount of space + assert(space <= gc_mark_buffer_avail(args)); + assert(space > 0); gc_span_t entry = args->spans.stack[--args->spans.size]; - while (entry.start < entry.end) { + // spans on the stack should always have one or more elements + assert(entry.start < entry.end); + do { PyObject *op = *entry.start; + entry.start++; if (op != NULL) { - if (space > 0) { - gc_mark_buffer_push(op, args); - space--; - } - else { - // no more space in buffer, push remaining + gc_mark_buffer_push(op, args); + space--; + if (space == 0) { + // buffer is as full was we want and not done with span gc_mark_span_push(&args->spans, entry.start, entry.end); - break; + return; } } - entry.start++; - } + } while (entry.start < entry.end); } static void @@ -1120,23 +1202,24 @@ gc_prime_buffer(gc_mark_args_t *args) // likely cause the stack to be used shortly after when it // fills. We want to use the buffer as much as possible and so // we only fill to BUFFER_HI, not BUFFER_SIZE. - Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued); - while (space > 0) { + Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args); + assert(space > 0); + do { PyObject *op = _PyObjectStack_Pop(&args->stack); if (op == NULL) { return; } gc_mark_buffer_push(op, args); space--; - } + } while (space > 0); } } static int -gc_propagate_alive(gc_mark_args_t *args) +gc_propagate_alive_prefetch(gc_mark_args_t *args) { for (;;) { - Py_ssize_t buf_used = args->enqueued - args->dequeued; + Py_ssize_t buf_used = gc_mark_buffer_len(args); if (buf_used <= BUFFER_LO) { // The mark buffer is getting empty. If it's too empty // then there will not be enough delay between issuing @@ -1144,12 +1227,11 @@ gc_propagate_alive(gc_mark_args_t *args) // Prime the buffer with object pointers from the stack or // from the spans, if there are any available. 
gc_prime_buffer(args); - if (args->enqueued == args->dequeued) { - return 0; // buffer empty, done + if (gc_mark_buffer_is_empty(args)) { + return 0; } } - PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE]; - args->dequeued++; + PyObject *op = gc_mark_buffer_pop(args); if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) { continue; @@ -1174,12 +1256,35 @@ gc_propagate_alive(gc_mark_args_t *args) return -1; } } - else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) { + else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) { return -1; } } } +static int +gc_propagate_alive(gc_mark_args_t *args) +{ + if (args->use_prefetch) { + return gc_propagate_alive_prefetch(args); + } + else { + for (;;) { + PyObject *op = _PyObjectStack_Pop(&args->stack); + if (op == NULL) { + break; + } + assert(_PyObject_GC_IS_TRACKED(op)); + assert(gc_is_alive(op)); + traverseproc traverse = Py_TYPE(op)->tp_traverse; + if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) { + return -1; + } + } + return 0; + } +} + // Using tp_traverse, mark everything reachable from known root objects // (which must be non-garbage) as alive (_PyGC_BITS_ALIVE is set). In // most programs, this marks nearly all objects that are not actually @@ -1206,6 +1311,14 @@ gc_mark_alive_from_roots(PyInterpreterState *interp, #endif gc_mark_args_t mark_args = { 0 }; + // Using prefetch instructions is only a win if the set of objects being + // examined by the GC does not fit into CPU caches. Otherwise, using the + // buffer and prefetch instructions is just overhead. Using the long lived + // object count seems a good estimate of if things will fit in the cache. + // On 64-bit platforms, the minimum object size is 32 bytes. A 4MB L2 cache + // would hold about 130k objects. + mark_args.use_prefetch = interp->gc.long_lived_total > 200000; + #define MARK_ENQUEUE(op) \ if (op != NULL ) { \ if (gc_mark_enqueue(op, &mark_args) < 0) { \ From 652cf336b53e1678b4810aa4d22e7bfa810b2716 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 4 Feb 2025 17:18:35 -0800 Subject: [PATCH 10/10] Add assert for buffer size settings. --- Python/gc_free_threading.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 4e6b0bcb3a005a..7c994cc3f1d9fa 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -485,11 +485,20 @@ gc_maybe_untrack(PyObject *op) // enough time between the enqueue and dequeue so that the needed memory // for the object, most importantly ob_gc_bits and ob_type words, will // already be in the CPU cache. -#define BUFFER_SIZE 256 // this must be a power of 2 +#define BUFFER_SIZE 256 #define BUFFER_HI 16 #define BUFFER_LO 8 #define BUFFER_MASK (BUFFER_SIZE - 1) +// the buffer size must be an exact power of two +static_assert(BUFFER_SIZE > 0 && !(BUFFER_SIZE & BUFFER_MASK), + "Invalid BUFFER_SIZE, must be power of 2"); +// the code below assumes these relationships are true +static_assert(BUFFER_HI < BUFFER_SIZE && + BUFFER_LO < BUFFER_HI && + BUFFER_LO > 0, + "Invalid prefetch buffer level settings."); + // Prefetch intructions will fetch the line of data from memory that // contains the byte specified with the source operand to a location in // the cache hierarchy specified by a locality hint. 
The instruction @@ -1183,7 +1192,7 @@ gc_prime_from_spans(gc_mark_args_t *args) gc_mark_buffer_push(op, args); space--; if (space == 0) { - // buffer is as full was we want and not done with span + // buffer is as full as we want and not done with span gc_mark_span_push(&args->spans, entry.start, entry.end); return; }