From 3094372e5fc8725c7de13479e4df7a0e6479fc44 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Tue, 29 Apr 2025 16:01:23 -0700
Subject: [PATCH 01/13] gh-127266: avoid data races when updating type slots

In the free-threaded build, avoid data races caused by updating type slots
or type flags after the type was initially created.  For those (typically
rare) cases, use the stop-the-world mechanism.  Remove the use of atomics
when reading or writing type flags.  The use of atomics is not sufficient to
avoid races (since flags are sometimes read without a lock and without
atomics) and is no longer required.
---
 Include/internal/pycore_interp_structs.h      |   3 +
 Include/internal/pycore_object.h              |   2 +-
 Include/internal/pycore_typeobject.h          |   1 -
 Include/object.h                              |  12 +-
 Include/refcount.h                            |   3 -
 ...-03-14-13-08-20.gh-issue-127266._tyfBp.rst |   6 +
 Objects/typeobject.c                          | 291 ++++++++++++------
 Python/ceval.c                                |  14 +
 Tools/tsan/suppressions_free_threading.txt    |   8 +-
 9 files changed, 230 insertions(+), 110 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-03-14-13-08-20.gh-issue-127266._tyfBp.rst

diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index af6ee3ab48939f..f193fed1153f14 100644
--- a/Include/internal/pycore_interp_structs.h
+++ b/Include/internal/pycore_interp_structs.h
@@ -667,8 +667,11 @@ struct _Py_interp_cached_objects {
     /* object.__reduce__ */
     PyObject *objreduce;
 
+#ifndef Py_GIL_DISABLED
+    /* resolve_slotdups() */
     PyObject *type_slots_pname;
     pytype_slotdef *type_slots_ptrs[MAX_EQUIV];
+#endif
 
     /* TypeVar and related types */
     PyTypeObject *generic_type;
diff --git a/Include/internal/pycore_object.h b/Include/internal/pycore_object.h
index b7e162c8abcabf..986bcc9fd08b1c 100644
--- a/Include/internal/pycore_object.h
+++ b/Include/internal/pycore_object.h
@@ -313,7 +313,7 @@ extern int _PyDict_CheckConsistency(PyObject *mp, int check_content);
 // Fast inlined version of PyType_HasFeature()
 static inline int
 _PyType_HasFeature(PyTypeObject *type, unsigned long feature)
 {
-    return ((FT_ATOMIC_LOAD_ULONG_RELAXED(type->tp_flags) & feature) != 0);
+    return ((type->tp_flags) & feature) != 0;
 }
 
 extern void _PyType_InitCache(PyInterpreterState *interp);
diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h
index 1a4f89fd2449a0..0ee7d555c56cdd 100644
--- a/Include/internal/pycore_typeobject.h
+++ b/Include/internal/pycore_typeobject.h
@@ -134,7 +134,6 @@ extern int _PyType_AddMethod(PyTypeObject *, PyMethodDef *);
 extern void _PyType_SetFlagsRecursive(PyTypeObject *self, unsigned long mask,
                                       unsigned long flags);
 
-extern unsigned int _PyType_GetVersionForCurrentState(PyTypeObject *tp);
 PyAPI_FUNC(void) _PyType_SetVersion(PyTypeObject *tp, unsigned int version);
 PyTypeObject *_PyType_LookupByVersion(unsigned int version);
 
diff --git a/Include/object.h b/Include/object.h
index 8cc83abb8574e3..994cac1ad17501 100644
--- a/Include/object.h
+++ b/Include/object.h
@@ -620,6 +620,12 @@ given type object has a specified feature.
 #define Py_TPFLAGS_HAVE_FINALIZE (1UL << 0)
 #define Py_TPFLAGS_HAVE_VERSION_TAG  (1UL << 18)
 
+// Flag values for ob_flags (16 bits available, if SIZEOF_VOID_P > 4).
+#define _Py_IMMORTAL_FLAGS (1 << 0)
+#define _Py_STATICALLY_ALLOCATED_FLAG (1 << 2)
+#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG)
+#define _Py_TYPE_REVEALED_FLAG (1 << 3)
+#endif
 
 #define Py_CONSTANT_NONE 0
 #define Py_CONSTANT_FALSE 1
@@ -776,11 +782,7 @@ PyType_HasFeature(PyTypeObject *type, unsigned long feature)
     // PyTypeObject is opaque in the limited C API
     flags = PyType_GetFlags(type);
 #else
-#   ifdef Py_GIL_DISABLED
-    flags = _Py_atomic_load_ulong_relaxed(&type->tp_flags);
-#   else
-    flags = type->tp_flags;
-#   endif
+    flags = type->tp_flags;
 #endif
     return ((flags & feature) != 0);
 }
diff --git a/Include/refcount.h b/Include/refcount.h
index 177bbdaf0c5977..ebd1dba6d15e1a 100644
--- a/Include/refcount.h
+++ b/Include/refcount.h
@@ -19,9 +19,6 @@ immortal. The latter should be the only instances that require
 cleanup during runtime finalization.
 */
 
-#define _Py_STATICALLY_ALLOCATED_FLAG 4
-#define _Py_IMMORTAL_FLAGS 1
-
 #if SIZEOF_VOID_P > 4
 /*
 In 64+ bit systems, any object whose 32 bit reference count is >= 2**31
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-03-14-13-08-20.gh-issue-127266._tyfBp.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-03-14-13-08-20.gh-issue-127266._tyfBp.rst
new file mode 100644
index 00000000000000..b26977628de136
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-03-14-13-08-20.gh-issue-127266._tyfBp.rst
@@ -0,0 +1,6 @@
+In the free-threaded build, avoid data races caused by updating type slots
+or type flags after the type was initially created.  For those (typically
+rare) cases, use the stop-the-world mechanism.  Remove the use of atomics
+when reading or writing type flags.  The use of atomics is not sufficient to
+avoid races (since flags are sometimes read without a lock and without
+atomics) and is no longer required.
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 4e614daaa6955b..22628018cc2a8f 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -48,7 +48,7 @@ class object "PyObject *" "&PyBaseObject_Type"
                                  & ((1 << MCACHE_SIZE_EXP) - 1))
 
 #define MCACHE_HASH_METHOD(type, name)                                  \
-    MCACHE_HASH(FT_ATOMIC_LOAD_UINT32_RELAXED((type)->tp_version_tag),  \
+    MCACHE_HASH(FT_ATOMIC_LOAD_UINT_RELAXED((type)->tp_version_tag),    \
                 ((Py_ssize_t)(name)) >> 3)
 #define MCACHE_CACHEABLE_NAME(name)                                     \
         PyUnicode_CheckExact(name) &&                                   \
@@ -60,11 +60,19 @@ class object "PyObject *" "&PyBaseObject_Type"
 
 #ifdef Py_GIL_DISABLED
 
-// There's a global lock for mutation of types.  This avoids having to take
-// additional locks while doing various subclass processing which may result
-// in odd behaviors w.r.t. running with the GIL as the outer type lock could
-// be released and reacquired during a subclass update if there's contention
-// on the subclass lock.
+// There's a global lock for types that ensures that tp_version_tag and
+// _spec_cache are correctly updated if the type is modified.  It also protects
+// tp_mro, tp_bases, and tp_base.  This avoids having to take additional locks
+// while doing various subclass processing which may result in odd behaviors
+// w.r.t. running with the GIL as the outer type lock could be released and
+// reacquired during a subclass update if there's contention on the subclass
+// lock.
+//
+// Note that this lock does not protect updates of other type slots or the
+// tp_flags member.  Instead, we either ensure those updates are done before
+// the type has been revealed to other threads or we only do those updates
+// while the stop-the-world mechanism is active.
The slots and flags are read +// in many places without holding a lock and without atomics. #define TYPE_LOCK &PyInterpreterState_Get()->types.mutex #define BEGIN_TYPE_LOCK() Py_BEGIN_CRITICAL_SECTION_MUT(TYPE_LOCK) #define END_TYPE_LOCK() Py_END_CRITICAL_SECTION() @@ -74,8 +82,59 @@ class object "PyObject *" "&PyBaseObject_Type" #define END_TYPE_DICT_LOCK() Py_END_CRITICAL_SECTION2() +#ifdef Py_DEBUG +// Return true if the world is currently stopped. +static bool +types_world_is_stopped(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + return interp->stoptheworld.world_stopped; +} +#endif + +// Checks that the type has not yet been revealed (exposed) to other +// threads. The _Py_TYPE_REVEALED_FLAG flag is set by type_new() and +// PyType_FromMetaclass() to indicate that a newly initialized type might be +// revealed. We only have ob_flags on 64-bit platforms. +#if SIZEOF_VOID_P > 4 +#define TYPE_IS_REVEALED(tp) ((((PyObject *)(tp))->ob_flags & _Py_TYPE_REVEALED_FLAG) != 0) +#else +#define TYPE_IS_REVEALED(tp) 0 +#endif + +#ifdef Py_DEBUG #define ASSERT_TYPE_LOCK_HELD() \ - _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(TYPE_LOCK) + if (!types_world_is_stopped()) { _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(TYPE_LOCK); } + +// Checks if we can safely update type slots or tp_flags. +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) \ + assert(!TYPE_IS_REVEALED(tp) || types_world_is_stopped()) + +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) \ + if (TYPE_IS_REVEALED(tp)) { ASSERT_TYPE_LOCK_HELD(); } +#else +#define ASSERT_TYPE_LOCK_HELD() +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) +#endif + +static void +types_stop_world(void) +{ + assert(!types_world_is_stopped()); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyEval_StopTheWorld(interp); + assert(types_world_is_stopped()); +} + +static void +types_start_world(void) +{ + assert(types_world_is_stopped()); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyEval_StartTheWorld(interp); + assert(!types_world_is_stopped()); +} #else @@ -84,6 +143,12 @@ class object "PyObject *" "&PyBaseObject_Type" #define BEGIN_TYPE_DICT_LOCK(d) #define END_TYPE_DICT_LOCK() #define ASSERT_TYPE_LOCK_HELD() +#define TYPE_IS_REVEALED(tp) 0 +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) +#define types_world_is_stopped() 1 +#define types_stop_world() +#define types_start_world() #endif @@ -346,21 +411,14 @@ _PyStaticType_GetBuiltins(void) static void type_set_flags(PyTypeObject *tp, unsigned long flags) { - if (tp->tp_flags & Py_TPFLAGS_READY) { - // It's possible the type object has been exposed to other threads - // if it's been marked ready. In that case, the type lock should be - // held when flags are modified. - ASSERT_TYPE_LOCK_HELD(); - } - // Since PyType_HasFeature() reads the flags without holding the type - // lock, we need an atomic store here. 
- FT_ATOMIC_STORE_ULONG_RELAXED(tp->tp_flags, flags); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp); + tp->tp_flags = flags; } static void type_set_flags_with_mask(PyTypeObject *tp, unsigned long mask, unsigned long flags) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp); unsigned long new_flags = (tp->tp_flags & ~mask) | flags; type_set_flags(tp, new_flags); } @@ -498,6 +556,7 @@ static inline void set_tp_bases(PyTypeObject *self, PyObject *bases, int initial) { assert(PyTuple_Check(bases)); + ASSERT_NEW_TYPE_OR_LOCKED(self); if (self->tp_flags & _Py_TPFLAGS_STATIC_BUILTIN) { // XXX tp_bases can probably be statically allocated for each // static builtin type. @@ -542,7 +601,7 @@ clear_tp_bases(PyTypeObject *self, int final) static inline PyObject * lookup_tp_mro(PyTypeObject *self) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(self); return self->tp_mro; } @@ -1027,7 +1086,6 @@ PyType_Unwatch(int watcher_id, PyObject* obj) static void set_version_unlocked(PyTypeObject *tp, unsigned int version) { - ASSERT_TYPE_LOCK_HELD(); assert(version == 0 || (tp->tp_versions_used != _Py_ATTR_CACHE_UNUSED)); #ifndef Py_GIL_DISABLED PyInterpreterState *interp = _PyInterpreterState_GET(); @@ -1075,7 +1133,7 @@ type_modified_unlocked(PyTypeObject *type) We don't assign new version tags eagerly, but only as needed. */ - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(type); if (type->tp_version_tag == 0) { return; } @@ -1106,6 +1164,8 @@ type_modified_unlocked(PyTypeObject *type) while (bits) { assert(i < TYPE_MAX_WATCHERS); if (bits & 1) { + // Note that PyErr_FormatUnraisable is potentially re-entrant + // and the watcher callback might be too. PyType_WatchCallback cb = interp->type_watchers[i]; if (cb && (cb(type) < 0)) { PyErr_FormatUnraisable( @@ -1245,14 +1305,6 @@ _PyType_LookupByVersion(unsigned int version) #endif } -unsigned int -_PyType_GetVersionForCurrentState(PyTypeObject *tp) -{ - return tp->tp_version_tag; -} - - - #define MAX_VERSIONS_PER_CLASS 1000 #if _Py_ATTR_CACHE_UNUSED < MAX_VERSIONS_PER_CLASS #error "_Py_ATTR_CACHE_UNUSED must be bigger than max" @@ -1586,10 +1638,13 @@ type_set_abstractmethods(PyObject *tp, PyObject *value, void *Py_UNUSED(closure) BEGIN_TYPE_LOCK(); type_modified_unlocked(type); + types_stop_world(); if (abstract) type_add_flags(type, Py_TPFLAGS_IS_ABSTRACT); else type_clear_flags(type, Py_TPFLAGS_IS_ABSTRACT); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); END_TYPE_LOCK(); return 0; @@ -1624,8 +1679,8 @@ type_get_mro(PyObject *tp, void *Py_UNUSED(closure)) return mro; } -static PyTypeObject *best_base(PyObject *); -static int mro_internal(PyTypeObject *, PyObject **); +static PyTypeObject *find_best_base(PyObject *); +static int mro_internal(PyTypeObject *, int, PyObject **); static int type_is_subtype_base_chain(PyTypeObject *, PyTypeObject *); static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, const char *); static int add_subclass(PyTypeObject*, PyTypeObject*); @@ -1640,13 +1695,15 @@ static int update_subclasses(PyTypeObject *type, PyObject *attr_name, static int recurse_down_subclasses(PyTypeObject *type, PyObject *name, update_callback callback, void *data); +// Compute tp_mro for this type and all of its subclasses. This +// is called after __bases__ is assigned to an existing type. 
static int mro_hierarchy(PyTypeObject *type, PyObject *temp) { ASSERT_TYPE_LOCK_HELD(); PyObject *old_mro; - int res = mro_internal(type, &old_mro); + int res = mro_internal(type, 0, &old_mro); if (res <= 0) { /* error / reentrance */ return res; @@ -1708,9 +1765,9 @@ mro_hierarchy(PyTypeObject *type, PyObject *temp) } static int -type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) +type_check_new_bases(PyTypeObject *type, PyObject *new_bases, PyTypeObject **best_base) { - // Check arguments + // Check arguments, this is re-entrant due to the PySys_Audit() call if (!check_set_special_type_attr(type, new_bases, "__bases__")) { return -1; } @@ -1759,20 +1816,29 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) } // Compute the new MRO and the new base class - PyTypeObject *new_base = best_base(new_bases); - if (new_base == NULL) + *best_base = find_best_base(new_bases); + if (*best_base == NULL) return -1; - if (!compatible_for_assignment(type->tp_base, new_base, "__bases__")) { + if (!compatible_for_assignment(type->tp_base, *best_base, "__bases__")) { return -1; } + return 0; +} + +static int +type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases, PyTypeObject *best_base) +{ + ASSERT_TYPE_LOCK_HELD(); + + Py_ssize_t n; PyObject *old_bases = lookup_tp_bases(type); assert(old_bases != NULL); PyTypeObject *old_base = type->tp_base; set_tp_bases(type, Py_NewRef(new_bases), 0); - type->tp_base = (PyTypeObject *)Py_NewRef(new_base); + type->tp_base = (PyTypeObject *)Py_NewRef(best_base); PyObject *temp = PyList_New(0); if (temp == NULL) { @@ -1796,7 +1862,10 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) add to all new_bases */ remove_all_subclasses(type, old_bases); res = add_all_subclasses(type, new_bases); + types_stop_world(); update_all_slots(type); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); } else { res = 0; @@ -1827,13 +1896,13 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) bail: if (lookup_tp_bases(type) == new_bases) { - assert(type->tp_base == new_base); + assert(type->tp_base == best_base); set_tp_bases(type, old_bases, 0); type->tp_base = old_base; Py_DECREF(new_bases); - Py_DECREF(new_base); + Py_DECREF(best_base); } else { Py_DECREF(old_bases); @@ -1848,9 +1917,13 @@ static int type_set_bases(PyObject *tp, PyObject *new_bases, void *Py_UNUSED(closure)) { PyTypeObject *type = PyTypeObject_CAST(tp); + PyTypeObject *best_base; int res; BEGIN_TYPE_LOCK(); - res = type_set_bases_unlocked(type, new_bases); + res = type_check_new_bases(type, new_bases, &best_base); + if (res == 0) { + res = type_set_bases_unlocked(type, new_bases, best_base); + } END_TYPE_LOCK(); return res; } @@ -3092,6 +3165,7 @@ static PyObject * class_name(PyObject *cls) { PyObject *name; + // Note that this is potentially re-entrant. if (PyObject_GetOptionalAttr(cls, &_Py_ID(__name__), &name) == 0) { name = PyObject_Repr(cls); } @@ -3428,9 +3502,13 @@ mro_invoke(PyTypeObject *type) const int custom = !Py_IS_TYPE(type, &PyType_Type); if (custom) { + // Custom mro() method on metaclass. This is potentially re-entrant. + // We are called either from type_ready() or from type_set_bases(). mro_result = call_method_noarg((PyObject *)type, &_Py_ID(mro)); } else { + // In this case, the mro() method on the type object is being used and + // we know that these calls are not re-entrant. 
        mro_result = mro_implementation_unlocked(type);
     }
     if (mro_result == NULL)
@@ -3478,7 +3556,7 @@ mro_invoke(PyTypeObject *type)
    - Returns -1 in case of an error.
 */
 static int
-mro_internal_unlocked(PyTypeObject *type, int initial, PyObject **p_old_mro)
+mro_internal(PyTypeObject *type, int initial, PyObject **p_old_mro)
 {
     ASSERT_TYPE_LOCK_HELD();
 
@@ -3526,21 +3604,11 @@ mro_internal_unlocked(PyTypeObject *type, int initial, PyObject **p_old_mro)
     return 1;
 }
 
-static int
-mro_internal(PyTypeObject *type, PyObject **p_old_mro)
-{
-    int res;
-    BEGIN_TYPE_LOCK();
-    res = mro_internal_unlocked(type, 0, p_old_mro);
-    END_TYPE_LOCK();
-    return res;
-}
-
 /* Calculate the best base amongst multiple base classes.
    This is the first one that's on the path to the "solid base". */
 
 static PyTypeObject *
-best_base(PyObject *bases)
+find_best_base(PyObject *bases)
 {
     Py_ssize_t i, n;
     PyTypeObject *base, *winner, *candidate;
@@ -3629,6 +3697,7 @@ static int update_slot(PyTypeObject *, PyObject *);
 static void fixup_slot_dispatchers(PyTypeObject *);
 static int type_new_set_names(PyTypeObject *);
 static int type_new_init_subclass(PyTypeObject *, PyObject *);
+static bool has_slotdef(PyObject *);
 
 /*
  * Helpers for __dict__ descriptor.  We don't want to expose the dicts
@@ -3826,7 +3895,7 @@ type_init(PyObject *cls, PyObject *args, PyObject *kwds)
 unsigned long
 PyType_GetFlags(PyTypeObject *type)
 {
-    return FT_ATOMIC_LOAD_ULONG_RELAXED(type->tp_flags);
+    return type->tp_flags;
 }
 
 
@@ -4604,6 +4673,10 @@ type_new_impl(type_new_ctx *ctx)
     }
 
     assert(_PyType_CheckConsistency(type));
+#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG) && SIZEOF_VOID_P > 4
+    // After this point, other threads can potentially use this type.
+    ((PyObject*)type)->ob_flags |= _Py_TYPE_REVEALED_FLAG;
+#endif
 
     return (PyObject *)type;
 
@@ -4666,7 +4739,7 @@ type_new_get_bases(type_new_ctx *ctx, PyObject **type)
     }
 
     /* Calculate best base, and check that all bases are type objects */
-    PyTypeObject *base = best_base(ctx->bases);
+    PyTypeObject *base = find_best_base(ctx->bases);
     if (base == NULL) {
         return -1;
     }
@@ -5081,12 +5154,12 @@ PyType_FromMetaclass(
     }
 
     /* Calculate best base, and check that all bases are type objects */
-    PyTypeObject *base = best_base(bases);  // borrowed ref
+    PyTypeObject *base = find_best_base(bases);  // borrowed ref
     if (base == NULL) {
         goto finally;
     }
-    // best_base should check Py_TPFLAGS_BASETYPE & raise a proper exception,
-    // here we just check its work
+    // find_best_base() should check Py_TPFLAGS_BASETYPE & raise a proper
+    // exception, here we just check its work
     assert(_PyType_HasFeature(base, Py_TPFLAGS_BASETYPE));
 
     /* Calculate sizes */
@@ -5317,6 +5390,10 @@ PyType_FromMetaclass(
     }
 
     assert(_PyType_CheckConsistency(type));
+#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG) && SIZEOF_VOID_P > 4
+    // After this point, other threads can potentially use this type.
+ ((PyObject*)type)->ob_flags |= _Py_TYPE_REVEALED_FLAG; +#endif finally: if (PyErr_Occurred()) { @@ -5610,8 +5687,6 @@ PyObject_GetItemData(PyObject *obj) static PyObject * find_name_in_mro(PyTypeObject *type, PyObject *name, int *error) { - ASSERT_TYPE_LOCK_HELD(); - Py_hash_t hash = _PyObject_HashFast(name); if (hash == -1) { *error = -1; @@ -5920,9 +5995,13 @@ _PyType_CacheGetItemForSpecialization(PyHeapTypeObject *ht, PyObject *descriptor void _PyType_SetFlags(PyTypeObject *self, unsigned long mask, unsigned long flags) { - BEGIN_TYPE_LOCK(); - type_set_flags_with_mask(self, mask, flags); - END_TYPE_LOCK(); + unsigned long new_flags = (self->tp_flags & ~mask) | flags; + if (new_flags != self->tp_flags) { + types_stop_world(); + // can't use new_flags here since they could be out-of-date + self->tp_flags = (self->tp_flags & ~mask) | flags; + types_start_world(); + } } int @@ -5969,9 +6048,9 @@ set_flags_recursive(PyTypeObject *self, unsigned long mask, unsigned long flags) void _PyType_SetFlagsRecursive(PyTypeObject *self, unsigned long mask, unsigned long flags) { - BEGIN_TYPE_LOCK(); + types_stop_world(); set_flags_recursive(self, mask, flags); - END_TYPE_LOCK(); + types_start_world(); } /* This is similar to PyObject_GenericGetAttr(), @@ -6085,6 +6164,8 @@ _Py_type_getattro(PyObject *tp, PyObject *name) return _Py_type_getattro_impl(type, name, NULL); } +// Called by type_setattro(). Updates both the type dict and +// the type versions. static int type_update_dict(PyTypeObject *type, PyDictObject *dict, PyObject *name, PyObject *value, PyObject **old_value) @@ -6114,10 +6195,6 @@ type_update_dict(PyTypeObject *type, PyDictObject *dict, PyObject *name, return -1; } - if (is_dunder_name(name)) { - return update_slot(type, name); - } - return 0; } @@ -6175,7 +6252,9 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) PyObject *dict = type->tp_dict; if (dict == NULL) { - // We don't just do PyType_Ready because we could already be readying + // This is an unlikely case. PyType_Ready has not yet been done and + // we need to initialize tp_dict. We don't just do PyType_Ready + // because we could already be readying. BEGIN_TYPE_LOCK(); dict = type->tp_dict; if (dict == NULL) { @@ -6191,6 +6270,15 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) BEGIN_TYPE_DICT_LOCK(dict); res = type_update_dict(type, (PyDictObject *)dict, name, value, &old_value); assert(_PyType_CheckConsistency(type)); + if (res == 0) { + if (is_dunder_name(name) && has_slotdef(name)) { + // The name corresponds to a type slot. 
+ types_stop_world(); + res = update_slot(type, name); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); + } + } END_TYPE_DICT_LOCK(); done: @@ -7120,15 +7208,10 @@ object_set_class(PyObject *self, PyObject *value, void *closure) return -1; } -#ifdef Py_GIL_DISABLED - PyInterpreterState *interp = _PyInterpreterState_GET(); - _PyEval_StopTheWorld(interp); -#endif + types_stop_world(); PyTypeObject *oldto = Py_TYPE(self); int res = object_set_class_world_stopped(self, newto); -#ifdef Py_GIL_DISABLED - _PyEval_StartTheWorld(interp); -#endif + types_start_world(); if (res == 0) { if (oldto->tp_flags & Py_TPFLAGS_HEAPTYPE) { Py_DECREF(oldto); @@ -8536,7 +8619,7 @@ type_ready_mro(PyTypeObject *type, int initial) } /* Calculate method resolution order */ - if (mro_internal_unlocked(type, initial, NULL) < 0) { + if (mro_internal(type, initial, NULL) < 0) { return -1; } PyObject *mro = lookup_tp_mro(type); @@ -11059,12 +11142,21 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) { /* XXX Maybe this could be optimized more -- but is it worth it? */ +#ifdef Py_GIL_DISABLED + pytype_slotdef *ptrs[MAX_EQUIV]; + pytype_slotdef **pp = ptrs; + /* Collect all slotdefs that match name into ptrs. */ + for (pytype_slotdef *p = slotdefs; p->name_strobj; p++) { + if (p->name_strobj == name) + *pp++ = p; + } + *pp = NULL; +#else /* pname and ptrs act as a little cache */ PyInterpreterState *interp = _PyInterpreterState_GET(); #define pname _Py_INTERP_CACHED_OBJECT(interp, type_slots_pname) #define ptrs _Py_INTERP_CACHED_OBJECT(interp, type_slots_ptrs) pytype_slotdef *p, **pp; - void **res, **ptr; if (pname != name) { /* Collect all slotdefs that match name into ptrs. */ @@ -11076,10 +11168,12 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) } *pp = NULL; } +#endif /* Look in all slots of the type matching the name. If exactly one of these has a filled-in slot, return a pointer to that slot. Otherwise, return NULL. */ + void **res, **ptr; res = NULL; for (pp = ptrs; *pp; pp++) { ptr = slotptr(type, (*pp)->offset); @@ -11089,11 +11183,25 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) return NULL; res = ptr; } - return res; +#ifndef Py_GIL_DISABLED #undef pname #undef ptrs +#endif + return res; } +// Return true if "name" corresponds to at least one slot definition. This is +// a more accurate but more expensive test compared to is_dunder_name(). +static bool +has_slotdef(PyObject *name) +{ + for (pytype_slotdef *p = slotdefs; p->name_strobj; p++) { + if (p->name_strobj == name) { + return true; + } + } + return false; +} /* Common code for update_slots_callback() and fixup_slot_dispatchers(). 
* @@ -11152,7 +11260,7 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) static pytype_slotdef * update_one_slot(PyTypeObject *type, pytype_slotdef *p) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(type); PyObject *descr; PyWrapperDescrObject *d; @@ -11275,7 +11383,7 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) static int update_slots_callback(PyTypeObject *type, void *data) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(type); pytype_slotdef **pp = (pytype_slotdef **)data; for (; *pp; pp++) { @@ -11293,7 +11401,7 @@ update_slot(PyTypeObject *type, PyObject *name) pytype_slotdef **pp; int offset; - ASSERT_TYPE_LOCK_HELD(); + assert(types_world_is_stopped()); assert(PyUnicode_CheckExact(name)); assert(PyUnicode_CHECK_INTERNED(name)); @@ -11327,33 +11435,27 @@ update_slot(PyTypeObject *type, PyObject *name) static void fixup_slot_dispatchers(PyTypeObject *type) { - // This lock isn't strictly necessary because the type has not been - // exposed to anyone else yet, but update_ont_slot calls find_name_in_mro - // where we'd like to assert that the type is locked. - BEGIN_TYPE_LOCK(); - assert(!PyErr_Occurred()); for (pytype_slotdef *p = slotdefs; p->name; ) { p = update_one_slot(type, p); } - - END_TYPE_LOCK(); } +// Called when __bases__ is re-assigned. static void update_all_slots(PyTypeObject* type) { pytype_slotdef *p; - ASSERT_TYPE_LOCK_HELD(); - - /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ - type_modified_unlocked(type); + assert(types_world_is_stopped()); for (p = slotdefs; p->name; p++) { /* update_slot returns int but can't actually fail */ update_slot(type, p->name_strobj); } + + /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ + type_modified_unlocked(type); } @@ -11625,7 +11727,10 @@ PyType_Freeze(PyTypeObject *type) } BEGIN_TYPE_LOCK(); + types_stop_world(); type_add_flags(type, Py_TPFLAGS_IMMUTABLETYPE); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); type_modified_unlocked(type); END_TYPE_LOCK(); diff --git a/Python/ceval.c b/Python/ceval.c index 19a1c9529dd9aa..41b661c55dbc3b 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -138,6 +138,19 @@ #endif +static void +check_invalid_reentrancy(void) +{ +#if defined(Py_DEBUG) && defined(Py_GIL_DISABLED) + // In the free-threaded build, the interpreter must not be re-entered if + // the world-is-stopped. If so, that's a bug somewhere (quite likely in + // the painfully complex typeobject code). + PyInterpreterState *interp = _PyInterpreterState_GET(); + assert(!interp->stoptheworld.world_stopped); +#endif +} + + #ifdef Py_DEBUG static void dump_item(_PyStackRef item) @@ -999,6 +1012,7 @@ PyObject* _Py_HOT_FUNCTION DONT_SLP_VECTORIZE _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) { _Py_EnsureTstateNotNULL(tstate); + check_invalid_reentrancy(); CALL_STAT_INC(pyeval_calls); #if USE_COMPUTED_GOTOS && !Py_TAIL_CALL_INTERP diff --git a/Tools/tsan/suppressions_free_threading.txt b/Tools/tsan/suppressions_free_threading.txt index 21224e490b8160..404c30157362aa 100644 --- a/Tools/tsan/suppressions_free_threading.txt +++ b/Tools/tsan/suppressions_free_threading.txt @@ -12,15 +12,12 @@ # These warnings trigger directly in a CPython function. 
-race_top:assign_version_tag -race_top:_Py_slot_tp_getattr_hook race_top:dump_traceback race_top:fatal_error race_top:_PyFrame_GetCode race_top:_PyFrame_Initialize race_top:_PyObject_TryGetInstanceAttribute race_top:PyUnstable_InterpreterFrame_GetLine -race_top:type_modified_unlocked race_top:write_thread_id # gh-129068: race on shared range iterators (test_free_threading.test_zip.ZipThreading.test_threading) @@ -29,9 +26,6 @@ race_top:rangeiter_next # gh-129748: test.test_free_threading.test_slots.TestSlots.test_object race_top:mi_block_set_nextx -# gh-127266: type slot updates are not thread-safe (test_opcache.test_load_attr_method_lazy_dict) -race_top:update_one_slot - # https://gist.github.com/mpage/6962e8870606cfc960e159b407a0cb40 thread:pthread_create @@ -46,4 +40,4 @@ race:list_inplace_repeat_lock_held # PyObject_Realloc internally does memcpy which isn't atomic so can race # with non-locking reads. See #132070 -race:PyObject_Realloc \ No newline at end of file +race:PyObject_Realloc From f447ce4adf750b9da15011e30eaf8aee3fd7b478 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 29 Apr 2025 16:10:51 -0700 Subject: [PATCH 02/13] For update_all_slots(), do updates more safely. To avoid deadlocks while the world is stopped, we need to avoid calling APIs like _PyObject_HashFast() and _PyDict_GetItemRef_KnownHash(). Collect the slot updates to be done and then apply them all at once. This reduces the amount of code running in the stop-the-world condition. --- Objects/typeobject.c | 257 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 226 insertions(+), 31 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 22628018cc2a8f..7abedf40859646 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -1687,7 +1687,7 @@ static int add_subclass(PyTypeObject*, PyTypeObject*); static int add_all_subclasses(PyTypeObject *type, PyObject *bases); static void remove_subclass(PyTypeObject *, PyTypeObject *); static void remove_all_subclasses(PyTypeObject *type, PyObject *bases); -static void update_all_slots(PyTypeObject *); +static int update_all_slots(PyTypeObject *); typedef int (*update_callback)(PyTypeObject *, void *); static int update_subclasses(PyTypeObject *type, PyObject *attr_name, @@ -1862,10 +1862,9 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases, PyTypeObject *b add to all new_bases */ remove_all_subclasses(type, old_bases); res = add_all_subclasses(type, new_bases); - types_stop_world(); - update_all_slots(type); - types_start_world(); - ASSERT_TYPE_LOCK_HELD(); + if (update_all_slots(type) < 0) { + goto bail; + } } else { res = 0; @@ -3690,10 +3689,127 @@ solid_base(PyTypeObject *type) } } +#ifdef Py_GIL_DISABLED + +// The structures and functions below are used in the free-threaded build +// to safely make updates to type slots, when __bases__ is re-assigned. Since +// the slots are read without atomic operations and without locking, we can +// only safely update them while the world is stopped. However, with the +// world stopped, we are very limited on which APIs can be safely used. For +// example, calling _PyObject_HashFast() or _PyDict_GetItemRef_KnownHash() are +// not safe and can potentially cause deadlocks. Hashing can be re-entrant +// and _PyDict_GetItemRef_KnownHash can acquire a lock if the dictionary is +// not owned by the current thread, to mark it shared on reading. +// +// We do the slot updates in two steps. First, with TYPE_LOCK held, we lookup +// the descriptor for each slot, for each subclass. 
We build a queue of +// updates to perform but don't actually update the type structures. After we +// are finished the lookups, we stop-the-world and apply all of the updates. +// The apply_slot_updates() code is simple and easy to confirm that it is +// safe. + +typedef struct { + void **slot_ptr; + void *slot_value; +} slot_update_item_t; + +// The number of slot updates performed is based on the number of changed +// slots and the number of subclasses. It's possible there are many updates +// required if there are many subclasses (potentially an unbounded amount). +// Usually the number of slot updates is small, most often zero or one. When +// running the unit tests, we don't exceed 20. The chunk size is set to +// handle the common case with a single chunk and to not require too many +// chunk allocations if there are many subclasses. +#define SLOT_UPDATE_CHUNK_SIZE 30 + +typedef struct _slot_update { + struct _slot_update *prev; + Py_ssize_t n; + slot_update_item_t updates[SLOT_UPDATE_CHUNK_SIZE]; +} slot_update_chunk_t; + +// a queue of updates to be performed +typedef struct { + slot_update_chunk_t *head; +} slot_update_t; + +static slot_update_chunk_t * +slot_update_new_chunk(void) +{ + slot_update_chunk_t *chunk = PyMem_Malloc(sizeof(slot_update_chunk_t)); + if (chunk == NULL) { + PyErr_NoMemory(); + return NULL; + } + chunk->prev = NULL; + chunk->n = 0; + return chunk; +} + +static void +slot_update_free_chunks(slot_update_t *updates) +{ + slot_update_chunk_t *chunk = updates->head; + while (chunk != NULL) { + slot_update_chunk_t *prev = chunk->prev; + PyMem_Free(chunk); + chunk = prev; + } +} + +static int +queue_slot_update(slot_update_t *updates, void **slot_ptr, void *slot_value) +{ + if (*slot_ptr == slot_value) { + return 0; // slot pointer not actually changed, don't queue update + } + if (updates->head == NULL || updates->head->n == SLOT_UPDATE_CHUNK_SIZE) { + slot_update_chunk_t *chunk = slot_update_new_chunk(); + if (chunk == NULL) { + return -1; // out-of-memory + } + chunk->prev = updates->head; + updates->head = chunk; + } + slot_update_item_t *item = &updates->head->updates[updates->head->n]; + item->slot_ptr = slot_ptr; + item->slot_value = slot_value; + updates->head->n++; + assert(updates->head->n <= SLOT_UPDATE_CHUNK_SIZE); + return 0; +} + +static void +apply_slot_updates(slot_update_t *updates) +{ + assert(types_world_is_stopped()); + slot_update_chunk_t *chunk = updates->head; + while (chunk != NULL) { + for (Py_ssize_t i = 0; i < chunk->n; i++) { + slot_update_item_t *item = &chunk->updates[i]; + *(item->slot_ptr) = item->slot_value; + } + chunk = chunk->prev; + } +} + +#else + +// not used, slot updates are applied immediately +typedef struct {} slot_update_t; + +#endif + +/// data passed to update_slots_callback() +typedef struct { + slot_update_t *queued_updates; + pytype_slotdef **defs; +} update_callback_data_t; + static void object_dealloc(PyObject *); static PyObject *object_new(PyTypeObject *, PyObject *, PyObject *); static int object_init(PyObject *, PyObject *, PyObject *); -static int update_slot(PyTypeObject *, PyObject *); +static int update_slot(PyTypeObject *, PyObject *, slot_update_t *update); static void fixup_slot_dispatchers(PyTypeObject *); static int type_new_set_names(PyTypeObject *); static int type_new_init_subclass(PyTypeObject *, PyObject *); @@ -6274,7 +6390,7 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) if (is_dunder_name(name) && has_slotdef(name)) { // The name corresponds to a type slot. 
types_stop_world(); - res = update_slot(type, name); + res = update_slot(type, name, NULL); types_start_world(); ASSERT_TYPE_LOCK_HELD(); } @@ -11254,13 +11370,22 @@ has_slotdef(PyObject *name) * There are some further special cases for specific slots, like supporting * __hash__ = None for tp_hash and special code for tp_new. * - * When done, return a pointer to the next slotdef with a different offset, - * because that's convenient for fixup_slot_dispatchers(). This function never - * sets an exception: if an internal error happens (unlikely), it's ignored. */ -static pytype_slotdef * -update_one_slot(PyTypeObject *type, pytype_slotdef *p) + * When done, next_p is set to the next slotdef with a different offset, + * because that's convenient for fixup_slot_dispatchers(). + * + * If the queued_updates pointer is provided, the actual updates to the slot + * pointers are queued, rather than being immediately performed. That argument + * is only used for the free-threaded build since those updates need to be + * done while the world is stopped. + * + * This function will only return an error if the queued_updates argument is + * provided and allocating memory for the queue fails. Other exceptions that + * occur internally are ignored, such as when looking up descriptors. */ +static int +update_one_slot(PyTypeObject *type, pytype_slotdef *p, pytype_slotdef **next_p, + slot_update_t *queued_updates) { - ASSERT_WORLD_STOPPED_OR_NEW_TYPE(type); + ASSERT_NEW_TYPE_OR_LOCKED(type); PyObject *descr; PyWrapperDescrObject *d; @@ -11283,7 +11408,10 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) do { ++p; } while (p->offset == offset); - return p; + if (next_p != NULL) { + *next_p = p; + } + return 0; } /* We may end up clearing live exceptions below, so make sure it's ours. */ assert(!PyErr_Occurred()); @@ -11371,11 +11499,34 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) } Py_DECREF(descr); } while ((++p)->offset == offset); - if (specific && !use_generic) - *ptr = specific; - else - *ptr = generic; - return p; + + void *slot_value; + if (specific && !use_generic) { + slot_value = specific; + } else { + slot_value = generic; + } + +#ifdef Py_GIL_DISABLED + if (queued_updates != NULL) { + // queue the update to perform later, while world is stopped + if (queue_slot_update(queued_updates, ptr, slot_value) < 0) { + return -1; + } + } else { + // do the update to the type structure now + *ptr = slot_value; + } +#else + // always do the update immediately + assert(queued_updates == NULL); + *ptr = slot_value; +#endif + + if (next_p != NULL) { + *next_p = p; + } + return 0; } /* In the type, update the slots whose slotdefs are gathered in the pp array. @@ -11383,25 +11534,28 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) static int update_slots_callback(PyTypeObject *type, void *data) { - ASSERT_WORLD_STOPPED_OR_NEW_TYPE(type); + ASSERT_NEW_TYPE_OR_LOCKED(type); - pytype_slotdef **pp = (pytype_slotdef **)data; + update_callback_data_t *update_data = (update_callback_data_t *)data; + pytype_slotdef **pp = update_data->defs; for (; *pp; pp++) { - update_one_slot(type, *pp); + if (update_one_slot(type, *pp, NULL, update_data->queued_updates) < 0) { + return -1; + } } return 0; } /* Update the slots after assignment to a class (type) attribute. 
*/ static int -update_slot(PyTypeObject *type, PyObject *name) +update_slot(PyTypeObject *type, PyObject *name, slot_update_t *queued_updates) { pytype_slotdef *ptrs[MAX_EQUIV]; pytype_slotdef *p; pytype_slotdef **pp; int offset; - assert(types_world_is_stopped()); + ASSERT_TYPE_LOCK_HELD(); assert(PyUnicode_CheckExact(name)); assert(PyUnicode_CHECK_INTERNED(name)); @@ -11425,8 +11579,12 @@ update_slot(PyTypeObject *type, PyObject *name) } if (ptrs[0] == NULL) return 0; /* Not an attribute that affects any slots */ + + update_callback_data_t callback_data; + callback_data.defs = ptrs; + callback_data.queued_updates = queued_updates; return update_subclasses(type, name, - update_slots_callback, (void *)ptrs); + update_slots_callback, (void *)&callback_data); } /* Store the proper functions in the slot dispatches at class (type) @@ -11437,27 +11595,64 @@ fixup_slot_dispatchers(PyTypeObject *type) { assert(!PyErr_Occurred()); for (pytype_slotdef *p = slotdefs; p->name; ) { - p = update_one_slot(type, p); + update_one_slot(type, p, &p, NULL); } } +#ifdef Py_GIL_DISABLED + // Called when __bases__ is re-assigned. -static void +static int update_all_slots(PyTypeObject* type) { - pytype_slotdef *p; + // Note that update_slot() can fail due to out-of-memory when allocating + // the queue chunks to hold the updates. That's unlikely since the number + // of updates is normally small but we handle that case. update_slot() + // can fail internally for other reasons (a lookup fails) but those + // errors are suppressed. + slot_update_t queued_updates = {0}; + for (pytype_slotdef *p = slotdefs; p->name; p++) { + if (update_slot(type, p->name_strobj, &queued_updates) < 0) { + if (queued_updates.head) { + slot_update_free_chunks(&queued_updates); + } + return -1; + } + } + if (queued_updates.head != NULL) { + types_stop_world(); + apply_slot_updates(&queued_updates); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); - assert(types_world_is_stopped()); + slot_update_free_chunks(&queued_updates); + + /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ + type_modified_unlocked(type); + } + return 0; +} + +#else + +// Called when __bases__ is re-assigned. +static int +update_all_slots(PyTypeObject* type) +{ + pytype_slotdef *p; for (p = slotdefs; p->name; p++) { - /* update_slot returns int but can't actually fail */ - update_slot(type, p->name_strobj); + /* update_slot returns int but can't actually fail in this case*/ + update_slot(type, p->name_strobj, NULL); } /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ type_modified_unlocked(type); + return 0; } +#endif + PyObject * _PyType_GetSlotWrapperNames(void) From d511ca6015e8a95c4f0f8c76330a8b57e3e1060c Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 29 Apr 2025 16:57:51 -0700 Subject: [PATCH 03/13] Avoid "empty structure" compile error. 
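
The free-threaded build uses a real struct for slot_update_t, but the
default build only needs a dummy type for the extra parameter (callers
always pass NULL there).  Patch 02 spelled the dummy as an empty struct,
which is not valid ISO C.  A minimal sketch of the two spellings, for
illustration only:

    /* Ill-formed in strict ISO C; GCC and Clang accept an empty struct
     * only as an extension, so pedantic builds complain: */
    typedef struct {} slot_update_t;

    /* Portable replacement: an opaque type is enough, because the
     * default build only ever forms "slot_update_t *" pointers and
     * those are always NULL. */
    typedef void slot_update_t;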
--- Objects/typeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 7abedf40859646..743f88c8cd67fb 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -3795,8 +3795,8 @@ apply_slot_updates(slot_update_t *updates) #else -// not used, slot updates are applied immediately -typedef struct {} slot_update_t; +// dummy definition, this parameter is only NULL in the default build +typedef void slot_update_t; #endif From 5e38497b323fdb828d61a0a965841f07086edbcc Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 29 Apr 2025 22:40:28 -0700 Subject: [PATCH 04/13] Use apply_slot_updates() for type_setattro(). --- Objects/typeobject.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 743f88c8cd67fb..5dc7aafe7f4442 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -6314,6 +6314,32 @@ type_update_dict(PyTypeObject *type, PyDictObject *dict, PyObject *name, return 0; } +static int +update_slot_after_setattr(PyTypeObject *type, PyObject *name) +{ +#ifdef Py_GIL_DISABLED + // stack allocate one chunk since that's all we need + assert(SLOT_UPDATE_CHUNK_SIZE >= MAX_EQUIV); + slot_update_chunk_t chunk = {0}; + slot_update_t queued_updates = {&chunk}; + + if (update_slot(type, name, &queued_updates) < 0) { + return -1; + } + if (queued_updates.head != NULL) { + types_stop_world(); + apply_slot_updates(&queued_updates); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); + // should never allocate another chunk + assert(chunk.prev == NULL); + } +#else + update_slot(type, name, NULL); +#endif + return 0; +} + static int type_setattro(PyObject *self, PyObject *name, PyObject *value) { @@ -6389,10 +6415,7 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) if (res == 0) { if (is_dunder_name(name) && has_slotdef(name)) { // The name corresponds to a type slot. - types_stop_world(); - res = update_slot(type, name, NULL); - types_start_world(); - ASSERT_TYPE_LOCK_HELD(); + res = update_slot_after_setattr(type, name); } } END_TYPE_DICT_LOCK(); From 8c74a0c2e29ca8d4afadca8346eed685811943ca Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 29 Apr 2025 22:48:26 -0700 Subject: [PATCH 05/13] Reduce number of items in test for slot updates. Now that stop-the-world is used to do the slot update, these tests are a lot slower in the free-threaded build. Test with fewer items, which will still hopefully be enough to find bugs in the specializer. --- Lib/test/test_opcache.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_opcache.py b/Lib/test/test_opcache.py index 21d7e62833c061..40349339df54fd 100644 --- a/Lib/test/test_opcache.py +++ b/Lib/test/test_opcache.py @@ -576,6 +576,7 @@ class TestRacesDoNotCrash(TestBase): # Careful with these. 
Bigger numbers have a higher chance of catching bugs, # but you can also burn through a *ton* of type/dict/function versions: ITEMS = 1000 + SMALL_ITEMS = 100 LOOPS = 4 WRITERS = 2 @@ -619,7 +620,7 @@ class C: __getitem__ = lambda self, item: None items = [] - for _ in range(self.ITEMS): + for _ in range(self.SMALL_ITEMS): item = C() items.append(item) return items @@ -790,7 +791,7 @@ class C: __getattribute__ = lambda self, name: None items = [] - for _ in range(self.ITEMS): + for _ in range(self.SMALL_ITEMS): item = C() items.append(item) return items From 6cd7644c99c2980c9f054d6bd77b2d8e39cb0dee Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 29 Apr 2025 23:02:09 -0700 Subject: [PATCH 06/13] Add TSAN suppression for _Py_slot_tp_getattr_hook. --- Tools/tsan/suppressions_free_threading.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Tools/tsan/suppressions_free_threading.txt b/Tools/tsan/suppressions_free_threading.txt index 404c30157362aa..3e4c70b636b585 100644 --- a/Tools/tsan/suppressions_free_threading.txt +++ b/Tools/tsan/suppressions_free_threading.txt @@ -41,3 +41,6 @@ race:list_inplace_repeat_lock_held # PyObject_Realloc internally does memcpy which isn't atomic so can race # with non-locking reads. See #132070 race:PyObject_Realloc + +# This function writes to the tp_getattr slot in an unsafe way. +race:_Py_slot_tp_getattr_hook From 3cb225610df2ff9c89b6ec7a27bfc630a1420939 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 30 Apr 2025 00:30:14 -0700 Subject: [PATCH 07/13] Queue update of tp_flags as well. The clearing of Py_TPFLAGS_HAVE_VECTORCALL must be done when the world is stopped too. --- Objects/typeobject.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 5dc7aafe7f4442..268913b7d0f88b 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -171,6 +171,9 @@ slot_tp_new(PyTypeObject *type, PyObject *args, PyObject *kwds); static int slot_tp_setattro(PyObject *self, PyObject *name, PyObject *value); +static PyObject * +slot_tp_call(PyObject *self, PyObject *args, PyObject *kwds); + static inline PyTypeObject * type_from_ref(PyObject *ref) { @@ -3709,6 +3712,7 @@ solid_base(PyTypeObject *type) // safe. 
typedef struct { + PyTypeObject *type; void **slot_ptr; void *slot_value; } slot_update_item_t; @@ -3758,7 +3762,8 @@ slot_update_free_chunks(slot_update_t *updates) } static int -queue_slot_update(slot_update_t *updates, void **slot_ptr, void *slot_value) +queue_slot_update(slot_update_t *updates, PyTypeObject *type, + void **slot_ptr, void *slot_value) { if (*slot_ptr == slot_value) { return 0; // slot pointer not actually changed, don't queue update @@ -3772,6 +3777,7 @@ queue_slot_update(slot_update_t *updates, void **slot_ptr, void *slot_value) updates->head = chunk; } slot_update_item_t *item = &updates->head->updates[updates->head->n]; + item->type = type; item->slot_ptr = slot_ptr; item->slot_value = slot_value; updates->head->n++; @@ -3788,6 +3794,10 @@ apply_slot_updates(slot_update_t *updates) for (Py_ssize_t i = 0; i < chunk->n; i++) { slot_update_item_t *item = &chunk->updates[i]; *(item->slot_ptr) = item->slot_value; + if (item->slot_value == slot_tp_call) { + /* A generic __call__ is incompatible with vectorcall */ + type_clear_flags(item->type, Py_TPFLAGS_HAVE_VECTORCALL); + } } chunk = chunk->prev; } @@ -11517,7 +11527,9 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p, pytype_slotdef **next_p, } if (p->function == slot_tp_call) { /* A generic __call__ is incompatible with vectorcall */ - type_clear_flags(type, Py_TPFLAGS_HAVE_VECTORCALL); + if (queued_updates == NULL) { + type_clear_flags(type, Py_TPFLAGS_HAVE_VECTORCALL); + } } } Py_DECREF(descr); @@ -11533,7 +11545,7 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p, pytype_slotdef **next_p, #ifdef Py_GIL_DISABLED if (queued_updates != NULL) { // queue the update to perform later, while world is stopped - if (queue_slot_update(queued_updates, ptr, slot_value) < 0) { + if (queue_slot_update(queued_updates, type, ptr, slot_value) < 0) { return -1; } } else { From 47e41c9896587cbfc5a9c6ad5d37a5b95cf16051 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 30 Apr 2025 06:45:00 -0700 Subject: [PATCH 08/13] Performance, skip stop-the-world when possible. Since we stack allocate one chunk, we need to check 'n' to see if there are actually any updates to make. It's pretty common that no updates are actually needed. --- Objects/typeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 268913b7d0f88b..6c47c64ee24eaf 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -6336,7 +6336,7 @@ update_slot_after_setattr(PyTypeObject *type, PyObject *name) if (update_slot(type, name, &queued_updates) < 0) { return -1; } - if (queued_updates.head != NULL) { + if (queued_updates.head->n > 0) { types_stop_world(); apply_slot_updates(&queued_updates); types_start_world(); From 9859ebf50986977b116d58f98ae81b054f04844c Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 1 May 2025 03:47:04 -0700 Subject: [PATCH 09/13] Always clear version after __bases__ update. --- Objects/typeobject.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index a8a38aa367baaf..cff8774405dda1 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -1868,6 +1868,8 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases, PyTypeObject *b if (update_all_slots(type) < 0) { goto bail; } + /* Clear the VALID_VERSION flag of 'type' and all its subclasses. 
*/ + type_modified_unlocked(type); } else { res = 0; @@ -11618,11 +11620,7 @@ update_all_slots(PyTypeObject* type) apply_slot_updates(&queued_updates); types_start_world(); ASSERT_TYPE_LOCK_HELD(); - slot_update_free_chunks(&queued_updates); - - /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ - type_modified_unlocked(type); } return 0; } @@ -11634,14 +11632,10 @@ static int update_all_slots(PyTypeObject* type) { pytype_slotdef *p; - for (p = slotdefs; p->name; p++) { /* update_slot returns int but can't actually fail in this case*/ update_slot(type, p->name_strobj, NULL); } - - /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ - type_modified_unlocked(type); return 0; } From 583c435df6179ba6675fa3274bab6bfcfbcf9862 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 1 May 2025 04:31:09 -0700 Subject: [PATCH 10/13] Add test for assigning __bases__. --- Lib/test/test_descr.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/Lib/test/test_descr.py b/Lib/test/test_descr.py index 8e9d44a583cb31..796b5df65b9887 100644 --- a/Lib/test/test_descr.py +++ b/Lib/test/test_descr.py @@ -4113,6 +4113,32 @@ class E(D): else: self.fail("shouldn't be able to create inheritance cycles") + def test_assign_bases_many_subclasses(self): + class A: + x = 'hello' + def __call__(self): + return 123 + def __getitem__(self, index): + return None + + class X: + x = 'bye' + + class B(A): + pass + + subclasses = [] + for i in range(1000): + sc = type(f'Sub{i}', (B,), {}) + subclasses.append(sc) + + self.assertEqual(subclasses[0]()(), 123) + self.assertEqual(subclasses[0]().x, 'hello') + B.__bases__ = (X,) + with self.assertRaises(TypeError): + subclasses[0]()() + self.assertEqual(subclasses[0]().x, 'bye') + def test_builtin_bases(self): # Make sure all the builtin types can have their base queried without # segfaulting. See issue #5787. From c01707e839116fb0a87d82ac6d66d1c288888d47 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 1 May 2025 10:41:59 -0700 Subject: [PATCH 11/13] Avoid releasing TYPE_LOCK when stopping the world. --- Objects/typeobject.c | 97 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 83 insertions(+), 14 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index cff8774405dda1..307302902d01bc 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -136,6 +136,50 @@ types_start_world(void) assert(!types_world_is_stopped()); } +// This is used to temporarily prevent the TYPE_LOCK from being suspended +// when held by the topmost critical section. 
+static void +type_lock_prevent_release(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t *tagptr = &tstate->critical_section; + PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); + if (c->_cs_mutex == TYPE_LOCK) { + c->_cs_mutex = NULL; + } + else { + assert(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES); + PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; + if (c2->_cs_mutex2 == TYPE_LOCK) { + c2->_cs_mutex2 = NULL; + } + else { + assert(0); // TYPE_LOCK must be one of the mutexes + } + } +} + +static void +type_lock_allow_release(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t *tagptr = &tstate->critical_section; + PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); + if (c->_cs_mutex == NULL) { + c->_cs_mutex = TYPE_LOCK; + } + else { + assert(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES); + PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; + if (c2->_cs_mutex2 == NULL) { + c2->_cs_mutex2 = TYPE_LOCK; + } + else { + assert(0); + } + } +} + #else #define BEGIN_TYPE_LOCK() @@ -3656,14 +3700,15 @@ solid_base(PyTypeObject *type) #ifdef Py_GIL_DISABLED // The structures and functions below are used in the free-threaded build -// to safely make updates to type slots, when __bases__ is re-assigned. Since -// the slots are read without atomic operations and without locking, we can -// only safely update them while the world is stopped. However, with the -// world stopped, we are very limited on which APIs can be safely used. For -// example, calling _PyObject_HashFast() or _PyDict_GetItemRef_KnownHash() are -// not safe and can potentially cause deadlocks. Hashing can be re-entrant -// and _PyDict_GetItemRef_KnownHash can acquire a lock if the dictionary is -// not owned by the current thread, to mark it shared on reading. +// to safely make updates to type slots, on type_setattro() for a slot +// or when __bases__ is re-assigned. Since the slots are read without atomic +// operations and without locking, we can only safely update them while the +// world is stopped. However, with the world stopped, we are very limited on +// which APIs can be safely used. For example, calling _PyObject_HashFast() +// or _PyDict_GetItemRef_KnownHash() are not safe and can potentially cause +// deadlocks. Hashing can be re-entrant and _PyDict_GetItemRef_KnownHash can +// acquire a lock if the dictionary is not owned by the current thread, to +// mark it shared on reading. // // We do the slot updates in two steps. First, with TYPE_LOCK held, we lookup // the descriptor for each slot, for each subclass. We build a queue of @@ -3764,6 +3809,34 @@ apply_slot_updates(slot_update_t *updates) } } +static void +apply_slot_updates_world_stopped(slot_update_t *updates) +{ + // This must be done carefully to avoid data races and deadlocks. We + // have just updated the type __dict__, while holding TYPE_LOCK. We have + // collected all of the required type slot updates into the 'updates' + // queue. Note that those updates can apply to multiple types since + // subclasses might also be affected by the dict change. + // + // We need to prevent other threads from writing to the dict before we can + // finish updating the slots. The actual stores to the slots are done + // with the world stopped. If we block on the stop-the-world mutex then + // we could release TYPE_LOCK mutex and potentially allow other threads + // to update the dict. That's because TYPE_LOCK was acquired using a + // critical section. 
+ // + // The type_lock_prevent_release() call prevents the TYPE_LOCK mutex from + // being released even if we block on the STM mutex. We need to take care + // that we do not deadlock because of that. It is safe because we always + // acquire locks in the same order: first the TYPE_LOCK mutex and then the + // STM mutex. + type_lock_prevent_release(); + types_stop_world(); + apply_slot_updates(updates); + types_start_world(); + type_lock_allow_release(); +} + #else // dummy definition, this parameter is only NULL in the default build @@ -6298,9 +6371,7 @@ update_slot_after_setattr(PyTypeObject *type, PyObject *name) return -1; } if (queued_updates.head->n > 0) { - types_stop_world(); - apply_slot_updates(&queued_updates); - types_start_world(); + apply_slot_updates_world_stopped(&queued_updates); ASSERT_TYPE_LOCK_HELD(); // should never allocate another chunk assert(chunk.prev == NULL); @@ -11616,9 +11687,7 @@ update_all_slots(PyTypeObject* type) } } if (queued_updates.head != NULL) { - types_stop_world(); - apply_slot_updates(&queued_updates); - types_start_world(); + apply_slot_updates_world_stopped(&queued_updates); ASSERT_TYPE_LOCK_HELD(); slot_update_free_chunks(&queued_updates); } From a1c6b054e8a73fa658a37fa6431621b058575d9f Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 5 May 2025 12:43:30 -0700 Subject: [PATCH 12/13] Add issue number for TSAN suppression. --- Tools/tsan/suppressions_free_threading.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/tsan/suppressions_free_threading.txt b/Tools/tsan/suppressions_free_threading.txt index 3e4c70b636b585..ee2c4095eb87ab 100644 --- a/Tools/tsan/suppressions_free_threading.txt +++ b/Tools/tsan/suppressions_free_threading.txt @@ -42,5 +42,5 @@ race:list_inplace_repeat_lock_held # with non-locking reads. See #132070 race:PyObject_Realloc -# This function writes to the tp_getattr slot in an unsafe way. +# This function writes to the tp_getattr slot in an unsafe way, see #133467. race:_Py_slot_tp_getattr_hook From 3f6222b70eb3751ff309f9a56d5cbae92b1ac6b9 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 5 May 2025 14:16:14 -0700 Subject: [PATCH 13/13] Bug fix for type_lock_prevent_release(). If the two mutex form of the critical section is used, need to put the other mutex into '_cs_mutex'. 
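
A minimal sketch of the state transition this fix makes, assuming a
two-mutex critical section that holds TYPE_LOCK plus one other mutex
(OTHER is an illustrative placeholder, not a name from the code):

    /* two-mutex section entered with TYPE_LOCK in the first slot:
     *   before: _cs_mutex == TYPE_LOCK, _cs_mutex2 == OTHER
     *   after:  _cs_mutex == OTHER,     _cs_mutex2 == NULL
     */

The earlier version only cleared whichever slot held TYPE_LOCK, which in
the case above left _cs_mutex == NULL while _cs_mutex2 was still set;
presumably the critical-section machinery expects a section's remaining
mutex to live in _cs_mutex, which is why the other mutex is moved down.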
--- Objects/typeobject.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 0d1288a093b7c6..d419306fe78bd6 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -144,17 +144,18 @@ type_lock_prevent_release(void) PyThreadState *tstate = _PyThreadState_GET(); uintptr_t *tagptr = &tstate->critical_section; PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); - if (c->_cs_mutex == TYPE_LOCK) { + if (!(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES)) { + assert(c->_cs_mutex == TYPE_LOCK); c->_cs_mutex = NULL; } else { - assert(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES); PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; - if (c2->_cs_mutex2 == TYPE_LOCK) { - c2->_cs_mutex2 = NULL; - } - else { - assert(0); // TYPE_LOCK must be one of the mutexes + if (c->_cs_mutex == TYPE_LOCK) { + c->_cs_mutex = c2->_cs_mutex2; + c2->_cs_mutex2 = NULL; + } else { + assert(c2->_cs_mutex2 == TYPE_LOCK); + c2->_cs_mutex2 = NULL; } } } @@ -165,18 +166,14 @@ type_lock_allow_release(void) PyThreadState *tstate = _PyThreadState_GET(); uintptr_t *tagptr = &tstate->critical_section; PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); - if (c->_cs_mutex == NULL) { + if (!(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES)) { + assert(c->_cs_mutex == NULL); c->_cs_mutex = TYPE_LOCK; } else { - assert(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES); PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; - if (c2->_cs_mutex2 == NULL) { - c2->_cs_mutex2 = TYPE_LOCK; - } - else { - assert(0); - } + assert(c2->_cs_mutex2 == NULL); + c2->_cs_mutex2 = TYPE_LOCK; } }
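
A closing usage sketch, illustrative and not part of the series itself:
how the helpers introduced above fit together when a dunder assignment
requires slot updates in the free-threaded build.  The function name is
invented; the body mirrors update_all_slots() from patch 09, and patch 11
additionally routes the stop/start pair through
apply_slot_updates_world_stopped() so that TYPE_LOCK cannot be released
while blocking on the stop-the-world request.

    static int
    example_update_after_dunder_assign(PyTypeObject *type, PyObject *name)
    {
        // Phase 1: with TYPE_LOCK held by the caller, look up the new
        // descriptor for every affected slot in this type and all of its
        // subclasses, queuing (slot_ptr, slot_value) pairs.  Only an
        // out-of-memory error while growing the queue is reported; lookup
        // failures are suppressed, as in update_one_slot().
        slot_update_t queued_updates = {0};
        if (update_slot(type, name, &queued_updates) < 0) {
            slot_update_free_chunks(&queued_updates);
            return -1;
        }
        if (queued_updates.head != NULL) {
            // Phase 2: apply the queued plain stores while every other
            // thread is parked; no locking, hashing, or allocation
            // happens here, which is what makes it deadlock-free.
            types_stop_world();
            apply_slot_updates(&queued_updates);
            types_start_world();
            slot_update_free_chunks(&queued_updates);
        }
        return 0;
    }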