diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index c3e6c77405bfe7..7f5cc28d2b610f 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -677,8 +677,11 @@ struct _Py_interp_cached_objects { /* object.__reduce__ */ PyObject *objreduce; +#ifndef Py_GIL_DISABLED + /* resolve_slotdups() */ PyObject *type_slots_pname; pytype_slotdef *type_slots_ptrs[MAX_EQUIV]; +#endif /* TypeVar and related types */ PyTypeObject *generic_type; diff --git a/Include/internal/pycore_object.h b/Include/internal/pycore_object.h index b7e162c8abcabf..986bcc9fd08b1c 100644 --- a/Include/internal/pycore_object.h +++ b/Include/internal/pycore_object.h @@ -313,7 +313,7 @@ extern int _PyDict_CheckConsistency(PyObject *mp, int check_content); // Fast inlined version of PyType_HasFeature() static inline int _PyType_HasFeature(PyTypeObject *type, unsigned long feature) { - return ((FT_ATOMIC_LOAD_ULONG_RELAXED(type->tp_flags) & feature) != 0); + return ((type->tp_flags) & feature) != 0; } extern void _PyType_InitCache(PyInterpreterState *interp); diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h index 1a4f89fd2449a0..0ee7d555c56cdd 100644 --- a/Include/internal/pycore_typeobject.h +++ b/Include/internal/pycore_typeobject.h @@ -134,7 +134,6 @@ extern int _PyType_AddMethod(PyTypeObject *, PyMethodDef *); extern void _PyType_SetFlagsRecursive(PyTypeObject *self, unsigned long mask, unsigned long flags); -extern unsigned int _PyType_GetVersionForCurrentState(PyTypeObject *tp); PyAPI_FUNC(void) _PyType_SetVersion(PyTypeObject *tp, unsigned int version); PyTypeObject *_PyType_LookupByVersion(unsigned int version); diff --git a/Include/object.h b/Include/object.h index 8cc83abb8574e3..994cac1ad17501 100644 --- a/Include/object.h +++ b/Include/object.h @@ -620,6 +620,12 @@ given type object has a specified feature. #define Py_TPFLAGS_HAVE_FINALIZE (1UL << 0) #define Py_TPFLAGS_HAVE_VERSION_TAG (1UL << 18) +// Flag values for ob_flags (16 bits available, if SIZEOF_VOID_P > 4). +#define _Py_IMMORTAL_FLAGS (1 << 0) +#define _Py_STATICALLY_ALLOCATED_FLAG (1 << 2) +#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG) +#define _Py_TYPE_REVEALED_FLAG (1 << 3) +#endif #define Py_CONSTANT_NONE 0 #define Py_CONSTANT_FALSE 1 @@ -776,11 +782,7 @@ PyType_HasFeature(PyTypeObject *type, unsigned long feature) // PyTypeObject is opaque in the limited C API flags = PyType_GetFlags(type); #else -# ifdef Py_GIL_DISABLED - flags = _Py_atomic_load_ulong_relaxed(&type->tp_flags); -# else - flags = type->tp_flags; -# endif + flags = type->tp_flags; #endif return ((flags & feature) != 0); } diff --git a/Include/refcount.h b/Include/refcount.h index 177bbdaf0c5977..ebd1dba6d15e1a 100644 --- a/Include/refcount.h +++ b/Include/refcount.h @@ -19,9 +19,6 @@ immortal. The latter should be the only instances that require cleanup during runtime finalization. 
 */
-#define _Py_STATICALLY_ALLOCATED_FLAG 4
-#define _Py_IMMORTAL_FLAGS 1
-
 #if SIZEOF_VOID_P > 4
 /*
 In 64+ bit systems, any object whose 32 bit reference count is >= 2**31
diff --git a/Lib/test/test_descr.py b/Lib/test/test_descr.py
index 76937432a43037..7e12220dd9d04e 100644
--- a/Lib/test/test_descr.py
+++ b/Lib/test/test_descr.py
@@ -4113,6 +4113,32 @@ class E(D):
         else:
             self.fail("shouldn't be able to create inheritance cycles")
 
+    def test_assign_bases_many_subclasses(self):
+        class A:
+            x = 'hello'
+            def __call__(self):
+                return 123
+            def __getitem__(self, index):
+                return None
+
+        class X:
+            x = 'bye'
+
+        class B(A):
+            pass
+
+        subclasses = []
+        for i in range(1000):
+            sc = type(f'Sub{i}', (B,), {})
+            subclasses.append(sc)
+
+        self.assertEqual(subclasses[0]()(), 123)
+        self.assertEqual(subclasses[0]().x, 'hello')
+        B.__bases__ = (X,)
+        with self.assertRaises(TypeError):
+            subclasses[0]()()
+        self.assertEqual(subclasses[0]().x, 'bye')
+
     def test_builtin_bases(self):
         # Make sure all the builtin types can have their base queried without
         # segfaulting. See issue #5787.
diff --git a/Lib/test/test_opcache.py b/Lib/test/test_opcache.py
index a45aafc63fa697..38fb1cbe8fa9df 100644
--- a/Lib/test/test_opcache.py
+++ b/Lib/test/test_opcache.py
@@ -583,6 +583,7 @@ class TestRacesDoNotCrash(TestBase):
     # Careful with these. Bigger numbers have a higher chance of catching bugs,
    # but you can also burn through a *ton* of type/dict/function versions:
     ITEMS = 1000
+    SMALL_ITEMS = 100
     LOOPS = 4
     WRITERS = 2
 
@@ -626,7 +627,7 @@ class C:
             __getitem__ = lambda self, item: None
 
         items = []
-        for _ in range(self.ITEMS):
+        for _ in range(self.SMALL_ITEMS):
             item = C()
             items.append(item)
         return items
@@ -797,7 +798,7 @@ class C:
             __getattribute__ = lambda self, name: None
 
         items = []
-        for _ in range(self.ITEMS):
+        for _ in range(self.SMALL_ITEMS):
             item = C()
             items.append(item)
         return items
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-03-14-13-08-20.gh-issue-127266._tyfBp.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-03-14-13-08-20.gh-issue-127266._tyfBp.rst
new file mode 100644
index 00000000000000..b26977628de136
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-03-14-13-08-20.gh-issue-127266._tyfBp.rst
@@ -0,0 +1,6 @@
+In the free-threaded build, avoid data races caused by updating type slots
+or type flags after the type was initially created. For those (typically
+rare) cases, use the stop-the-world mechanism. Remove the use of atomics
+when reading or writing type flags. Atomics are not sufficient to avoid
+races (since flags are sometimes read without a lock and without atomics)
+and are no longer required.
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index a7ab69fef4c721..d419306fe78bd6 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -48,7 +48,7 @@ class object "PyObject *" "&PyBaseObject_Type"
          & ((1 << MCACHE_SIZE_EXP) - 1))
 
 #define MCACHE_HASH_METHOD(type, name)                                  \
-    MCACHE_HASH(FT_ATOMIC_LOAD_UINT32_RELAXED((type)->tp_version_tag),  \
+    MCACHE_HASH(FT_ATOMIC_LOAD_UINT_RELAXED((type)->tp_version_tag),    \
                 ((Py_ssize_t)(name)) >> 3)
 #define MCACHE_CACHEABLE_NAME(name)                             \
         PyUnicode_CheckExact(name) &&                           \
@@ -60,11 +60,19 @@ class object "PyObject *" "&PyBaseObject_Type"
 
 #ifdef Py_GIL_DISABLED
 
-// There's a global lock for mutation of types. This avoids having to take
-// additional locks while doing various subclass processing which may result
-// in odd behaviors w.r.t.
running with the GIL as the outer type lock could -// be released and reacquired during a subclass update if there's contention -// on the subclass lock. +// There's a global lock for types that ensures that tp_version_tag and +// _spec_cache are correctly updated if the type is modified. It also protects +// tp_mro, tp_bases, and tp_base. This avoids having to take additional locks +// while doing various subclass processing which may result in odd behaviors +// w.r.t. running with the GIL as the outer type lock could be released and +// reacquired during a subclass update if there's contention on the subclass +// lock. +// +// Note that this lock does not protect updates of other type slots or the +// tp_flags member. Instead, we either ensure those updates are done before +// the type has been revealed to other threads or we only do those updates +// while the stop-the-world mechanism is active. The slots and flags are read +// in many places without holding a lock and without atomics. #define TYPE_LOCK &PyInterpreterState_Get()->types.mutex #define BEGIN_TYPE_LOCK() Py_BEGIN_CRITICAL_SECTION_MUT(TYPE_LOCK) #define END_TYPE_LOCK() Py_END_CRITICAL_SECTION() @@ -74,8 +82,100 @@ class object "PyObject *" "&PyBaseObject_Type" #define END_TYPE_DICT_LOCK() Py_END_CRITICAL_SECTION2() +#ifdef Py_DEBUG +// Return true if the world is currently stopped. +static bool +types_world_is_stopped(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + return interp->stoptheworld.world_stopped; +} +#endif + +// Checks that the type has not yet been revealed (exposed) to other +// threads. The _Py_TYPE_REVEALED_FLAG flag is set by type_new() and +// PyType_FromMetaclass() to indicate that a newly initialized type might be +// revealed. We only have ob_flags on 64-bit platforms. +#if SIZEOF_VOID_P > 4 +#define TYPE_IS_REVEALED(tp) ((((PyObject *)(tp))->ob_flags & _Py_TYPE_REVEALED_FLAG) != 0) +#else +#define TYPE_IS_REVEALED(tp) 0 +#endif + +#ifdef Py_DEBUG #define ASSERT_TYPE_LOCK_HELD() \ - _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(TYPE_LOCK) + if (!types_world_is_stopped()) { _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(TYPE_LOCK); } + +// Checks if we can safely update type slots or tp_flags. +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) \ + assert(!TYPE_IS_REVEALED(tp) || types_world_is_stopped()) + +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) \ + if (TYPE_IS_REVEALED(tp)) { ASSERT_TYPE_LOCK_HELD(); } +#else +#define ASSERT_TYPE_LOCK_HELD() +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) +#endif + +static void +types_stop_world(void) +{ + assert(!types_world_is_stopped()); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyEval_StopTheWorld(interp); + assert(types_world_is_stopped()); +} + +static void +types_start_world(void) +{ + assert(types_world_is_stopped()); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyEval_StartTheWorld(interp); + assert(!types_world_is_stopped()); +} + +// This is used to temporarily prevent the TYPE_LOCK from being suspended +// when held by the topmost critical section. 
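+//
+// A sketch of the intended call pattern (apply_slot_updates_world_stopped()
+// below is the real user of this pair):
+//
+//     BEGIN_TYPE_LOCK();
+//     ... look up descriptors and build the queue of slot updates ...
+//     type_lock_prevent_release();  // keep TYPE_LOCK held while blocked below
+//     types_stop_world();
+//     ... apply the queued updates ...
+//     types_start_world();
+//     type_lock_allow_release();
+//     END_TYPE_LOCK();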
+static void +type_lock_prevent_release(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t *tagptr = &tstate->critical_section; + PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); + if (!(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES)) { + assert(c->_cs_mutex == TYPE_LOCK); + c->_cs_mutex = NULL; + } + else { + PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; + if (c->_cs_mutex == TYPE_LOCK) { + c->_cs_mutex = c2->_cs_mutex2; + c2->_cs_mutex2 = NULL; + } else { + assert(c2->_cs_mutex2 == TYPE_LOCK); + c2->_cs_mutex2 = NULL; + } + } +} + +static void +type_lock_allow_release(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t *tagptr = &tstate->critical_section; + PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); + if (!(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES)) { + assert(c->_cs_mutex == NULL); + c->_cs_mutex = TYPE_LOCK; + } + else { + PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; + assert(c2->_cs_mutex2 == NULL); + c2->_cs_mutex2 = TYPE_LOCK; + } +} #else @@ -84,6 +184,12 @@ class object "PyObject *" "&PyBaseObject_Type" #define BEGIN_TYPE_DICT_LOCK(d) #define END_TYPE_DICT_LOCK() #define ASSERT_TYPE_LOCK_HELD() +#define TYPE_IS_REVEALED(tp) 0 +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) +#define types_world_is_stopped() 1 +#define types_stop_world() +#define types_start_world() #endif @@ -106,6 +212,9 @@ slot_tp_new(PyTypeObject *type, PyObject *args, PyObject *kwds); static int slot_tp_setattro(PyObject *self, PyObject *name, PyObject *value); +static PyObject * +slot_tp_call(PyObject *self, PyObject *args, PyObject *kwds); + static inline PyTypeObject * type_from_ref(PyObject *ref) { @@ -346,21 +455,14 @@ _PyStaticType_GetBuiltins(void) static void type_set_flags(PyTypeObject *tp, unsigned long flags) { - if (tp->tp_flags & Py_TPFLAGS_READY) { - // It's possible the type object has been exposed to other threads - // if it's been marked ready. In that case, the type lock should be - // held when flags are modified. - ASSERT_TYPE_LOCK_HELD(); - } - // Since PyType_HasFeature() reads the flags without holding the type - // lock, we need an atomic store here. - FT_ATOMIC_STORE_ULONG_RELAXED(tp->tp_flags, flags); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp); + tp->tp_flags = flags; } static void type_set_flags_with_mask(PyTypeObject *tp, unsigned long mask, unsigned long flags) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp); unsigned long new_flags = (tp->tp_flags & ~mask) | flags; type_set_flags(tp, new_flags); } @@ -498,6 +600,7 @@ static inline void set_tp_bases(PyTypeObject *self, PyObject *bases, int initial) { assert(PyTuple_Check(bases)); + ASSERT_NEW_TYPE_OR_LOCKED(self); if (self->tp_flags & _Py_TPFLAGS_STATIC_BUILTIN) { // XXX tp_bases can probably be statically allocated for each // static builtin type. 
@@ -542,7 +645,7 @@ clear_tp_bases(PyTypeObject *self, int final) static inline PyObject * lookup_tp_mro(PyTypeObject *self) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(self); return self->tp_mro; } @@ -1027,7 +1130,6 @@ PyType_Unwatch(int watcher_id, PyObject* obj) static void set_version_unlocked(PyTypeObject *tp, unsigned int version) { - ASSERT_TYPE_LOCK_HELD(); assert(version == 0 || (tp->tp_versions_used != _Py_ATTR_CACHE_UNUSED)); #ifndef Py_GIL_DISABLED PyInterpreterState *interp = _PyInterpreterState_GET(); @@ -1075,7 +1177,7 @@ type_modified_unlocked(PyTypeObject *type) We don't assign new version tags eagerly, but only as needed. */ - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(type); if (type->tp_version_tag == 0) { return; } @@ -1106,6 +1208,8 @@ type_modified_unlocked(PyTypeObject *type) while (bits) { assert(i < TYPE_MAX_WATCHERS); if (bits & 1) { + // Note that PyErr_FormatUnraisable is potentially re-entrant + // and the watcher callback might be too. PyType_WatchCallback cb = interp->type_watchers[i]; if (cb && (cb(type) < 0)) { PyErr_FormatUnraisable( @@ -1245,14 +1349,6 @@ _PyType_LookupByVersion(unsigned int version) #endif } -unsigned int -_PyType_GetVersionForCurrentState(PyTypeObject *tp) -{ - return tp->tp_version_tag; -} - - - #define MAX_VERSIONS_PER_CLASS 1000 #if _Py_ATTR_CACHE_UNUSED < MAX_VERSIONS_PER_CLASS #error "_Py_ATTR_CACHE_UNUSED must be bigger than max" @@ -1586,10 +1682,13 @@ type_set_abstractmethods(PyObject *tp, PyObject *value, void *Py_UNUSED(closure) BEGIN_TYPE_LOCK(); type_modified_unlocked(type); + types_stop_world(); if (abstract) type_add_flags(type, Py_TPFLAGS_IS_ABSTRACT); else type_clear_flags(type, Py_TPFLAGS_IS_ABSTRACT); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); END_TYPE_LOCK(); return 0; @@ -1624,15 +1723,15 @@ type_get_mro(PyObject *tp, void *Py_UNUSED(closure)) return mro; } -static PyTypeObject *best_base(PyObject *); -static int mro_internal(PyTypeObject *, PyObject **); +static PyTypeObject *find_best_base(PyObject *); +static int mro_internal(PyTypeObject *, int, PyObject **); static int type_is_subtype_base_chain(PyTypeObject *, PyTypeObject *); static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, const char *); static int add_subclass(PyTypeObject*, PyTypeObject*); static int add_all_subclasses(PyTypeObject *type, PyObject *bases); static void remove_subclass(PyTypeObject *, PyTypeObject *); static void remove_all_subclasses(PyTypeObject *type, PyObject *bases); -static void update_all_slots(PyTypeObject *); +static int update_all_slots(PyTypeObject *); typedef int (*update_callback)(PyTypeObject *, void *); static int update_subclasses(PyTypeObject *type, PyObject *attr_name, @@ -1640,13 +1739,15 @@ static int update_subclasses(PyTypeObject *type, PyObject *attr_name, static int recurse_down_subclasses(PyTypeObject *type, PyObject *name, update_callback callback, void *data); +// Compute tp_mro for this type and all of its subclasses. This +// is called after __bases__ is assigned to an existing type. 
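+// For example (see test_assign_bases_many_subclasses in Lib/test/test_descr.py),
+// an assignment like ``B.__bases__ = (X,)`` must recompute the MRO of B and of
+// every existing subclass of B.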
static int mro_hierarchy(PyTypeObject *type, PyObject *temp) { ASSERT_TYPE_LOCK_HELD(); PyObject *old_mro; - int res = mro_internal(type, &old_mro); + int res = mro_internal(type, 0, &old_mro); if (res <= 0) { /* error / reentrance */ return res; @@ -1708,9 +1809,9 @@ mro_hierarchy(PyTypeObject *type, PyObject *temp) } static int -type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) +type_check_new_bases(PyTypeObject *type, PyObject *new_bases, PyTypeObject **best_base) { - // Check arguments + // Check arguments, this is re-entrant due to the PySys_Audit() call if (!check_set_special_type_attr(type, new_bases, "__bases__")) { return -1; } @@ -1759,20 +1860,29 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) } // Compute the new MRO and the new base class - PyTypeObject *new_base = best_base(new_bases); - if (new_base == NULL) + *best_base = find_best_base(new_bases); + if (*best_base == NULL) return -1; - if (!compatible_for_assignment(type->tp_base, new_base, "__bases__")) { + if (!compatible_for_assignment(type->tp_base, *best_base, "__bases__")) { return -1; } + return 0; +} + +static int +type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases, PyTypeObject *best_base) +{ + ASSERT_TYPE_LOCK_HELD(); + + Py_ssize_t n; PyObject *old_bases = lookup_tp_bases(type); assert(old_bases != NULL); PyTypeObject *old_base = type->tp_base; set_tp_bases(type, Py_NewRef(new_bases), 0); - type->tp_base = (PyTypeObject *)Py_NewRef(new_base); + type->tp_base = (PyTypeObject *)Py_NewRef(best_base); PyObject *temp = PyList_New(0); if (temp == NULL) { @@ -1796,7 +1906,11 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) add to all new_bases */ remove_all_subclasses(type, old_bases); res = add_all_subclasses(type, new_bases); - update_all_slots(type); + if (update_all_slots(type) < 0) { + goto bail; + } + /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ + type_modified_unlocked(type); } else { res = 0; @@ -1827,13 +1941,13 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) bail: if (lookup_tp_bases(type) == new_bases) { - assert(type->tp_base == new_base); + assert(type->tp_base == best_base); set_tp_bases(type, old_bases, 0); type->tp_base = old_base; Py_DECREF(new_bases); - Py_DECREF(new_base); + Py_DECREF(best_base); } else { Py_DECREF(old_bases); @@ -1848,9 +1962,13 @@ static int type_set_bases(PyObject *tp, PyObject *new_bases, void *Py_UNUSED(closure)) { PyTypeObject *type = PyTypeObject_CAST(tp); + PyTypeObject *best_base; int res; BEGIN_TYPE_LOCK(); - res = type_set_bases_unlocked(type, new_bases); + res = type_check_new_bases(type, new_bases, &best_base); + if (res == 0) { + res = type_set_bases_unlocked(type, new_bases, best_base); + } END_TYPE_LOCK(); return res; } @@ -3051,6 +3169,7 @@ static PyObject * class_name(PyObject *cls) { PyObject *name; + // Note that this is potentially re-entrant. if (PyObject_GetOptionalAttr(cls, &_Py_ID(__name__), &name) == 0) { name = PyObject_Repr(cls); } @@ -3387,9 +3506,13 @@ mro_invoke(PyTypeObject *type) const int custom = !Py_IS_TYPE(type, &PyType_Type); if (custom) { + // Custom mro() method on metaclass. This is potentially re-entrant. + // We are called either from type_ready() or from type_set_bases(). mro_result = call_method_noarg((PyObject *)type, &_Py_ID(mro)); } else { + // In this case, the mro() method on the type object is being used and + // we know that these calls are not re-entrant. 
         mro_result = mro_implementation_unlocked(type);
     }
     if (mro_result == NULL)
@@ -3437,7 +3560,7 @@ mro_invoke(PyTypeObject *type)
    - Returns -1 in case of an error.
 */
 static int
-mro_internal_unlocked(PyTypeObject *type, int initial, PyObject **p_old_mro)
+mro_internal(PyTypeObject *type, int initial, PyObject **p_old_mro)
 {
     ASSERT_TYPE_LOCK_HELD();
 
@@ -3485,21 +3608,11 @@ mro_internal_unlocked(PyTypeObject *type, int initial, PyObject **p_old_mro)
     return 1;
 }
 
-static int
-mro_internal(PyTypeObject *type, PyObject **p_old_mro)
-{
-    int res;
-    BEGIN_TYPE_LOCK();
-    res = mro_internal_unlocked(type, 0, p_old_mro);
-    END_TYPE_LOCK();
-    return res;
-}
-
 /* Calculate the best base amongst multiple base classes.
    This is the first one that's on the path to the "solid base". */
 
 static PyTypeObject *
-best_base(PyObject *bases)
+find_best_base(PyObject *bases)
 {
     Py_ssize_t i, n;
     PyTypeObject *base, *winner, *candidate;
@@ -3581,13 +3694,167 @@ solid_base(PyTypeObject *type)
     }
 }
 
+#ifdef Py_GIL_DISABLED
+
+// The structures and functions below are used in the free-threaded build
+// to safely make updates to type slots, on type_setattro() for a slot
+// or when __bases__ is re-assigned. Since the slots are read without atomic
+// operations and without locking, we can only safely update them while the
+// world is stopped. However, with the world stopped, we are very limited on
+// which APIs can be safely used. For example, calling _PyObject_HashFast()
+// or _PyDict_GetItemRef_KnownHash() is not safe and can potentially cause
+// deadlocks. Hashing can be re-entrant and _PyDict_GetItemRef_KnownHash can
+// acquire a lock if the dictionary is not owned by the current thread, to
+// mark it shared on reading.
+//
+// We do the slot updates in two steps. First, with TYPE_LOCK held, we look up
+// the descriptor for each slot, for each subclass. We build a queue of
+// updates to perform but don't actually update the type structures. After we
+// have finished the lookups, we stop the world and apply all of the updates.
+// The apply_slot_updates() code is simple, which makes it easy to confirm
+// that it is safe.
+
+typedef struct {
+    PyTypeObject *type;
+    void **slot_ptr;
+    void *slot_value;
+} slot_update_item_t;
+
+// The number of slot updates performed is based on the number of changed
+// slots and the number of subclasses. It's possible there are many updates
+// required if there are many subclasses (potentially an unbounded amount).
+// Usually the number of slot updates is small, most often zero or one. When
+// running the unit tests, we don't exceed 20. The chunk size is set to
+// handle the common case with a single chunk and to not require too many
+// chunk allocations if there are many subclasses.
+#define SLOT_UPDATE_CHUNK_SIZE 30 + +typedef struct _slot_update { + struct _slot_update *prev; + Py_ssize_t n; + slot_update_item_t updates[SLOT_UPDATE_CHUNK_SIZE]; +} slot_update_chunk_t; + +// a queue of updates to be performed +typedef struct { + slot_update_chunk_t *head; +} slot_update_t; + +static slot_update_chunk_t * +slot_update_new_chunk(void) +{ + slot_update_chunk_t *chunk = PyMem_Malloc(sizeof(slot_update_chunk_t)); + if (chunk == NULL) { + PyErr_NoMemory(); + return NULL; + } + chunk->prev = NULL; + chunk->n = 0; + return chunk; +} + +static void +slot_update_free_chunks(slot_update_t *updates) +{ + slot_update_chunk_t *chunk = updates->head; + while (chunk != NULL) { + slot_update_chunk_t *prev = chunk->prev; + PyMem_Free(chunk); + chunk = prev; + } +} + +static int +queue_slot_update(slot_update_t *updates, PyTypeObject *type, + void **slot_ptr, void *slot_value) +{ + if (*slot_ptr == slot_value) { + return 0; // slot pointer not actually changed, don't queue update + } + if (updates->head == NULL || updates->head->n == SLOT_UPDATE_CHUNK_SIZE) { + slot_update_chunk_t *chunk = slot_update_new_chunk(); + if (chunk == NULL) { + return -1; // out-of-memory + } + chunk->prev = updates->head; + updates->head = chunk; + } + slot_update_item_t *item = &updates->head->updates[updates->head->n]; + item->type = type; + item->slot_ptr = slot_ptr; + item->slot_value = slot_value; + updates->head->n++; + assert(updates->head->n <= SLOT_UPDATE_CHUNK_SIZE); + return 0; +} + +static void +apply_slot_updates(slot_update_t *updates) +{ + assert(types_world_is_stopped()); + slot_update_chunk_t *chunk = updates->head; + while (chunk != NULL) { + for (Py_ssize_t i = 0; i < chunk->n; i++) { + slot_update_item_t *item = &chunk->updates[i]; + *(item->slot_ptr) = item->slot_value; + if (item->slot_value == slot_tp_call) { + /* A generic __call__ is incompatible with vectorcall */ + type_clear_flags(item->type, Py_TPFLAGS_HAVE_VECTORCALL); + } + } + chunk = chunk->prev; + } +} + +static void +apply_slot_updates_world_stopped(slot_update_t *updates) +{ + // This must be done carefully to avoid data races and deadlocks. We + // have just updated the type __dict__, while holding TYPE_LOCK. We have + // collected all of the required type slot updates into the 'updates' + // queue. Note that those updates can apply to multiple types since + // subclasses might also be affected by the dict change. + // + // We need to prevent other threads from writing to the dict before we can + // finish updating the slots. The actual stores to the slots are done + // with the world stopped. If we block on the stop-the-world mutex then + // we could release TYPE_LOCK mutex and potentially allow other threads + // to update the dict. That's because TYPE_LOCK was acquired using a + // critical section. + // + // The type_lock_prevent_release() call prevents the TYPE_LOCK mutex from + // being released even if we block on the STM mutex. We need to take care + // that we do not deadlock because of that. It is safe because we always + // acquire locks in the same order: first the TYPE_LOCK mutex and then the + // STM mutex. 
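+    //
+    // Since TYPE_LOCK is never released here, callers can still rely on
+    // ASSERT_TYPE_LOCK_HELD() immediately after this function returns.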
+    type_lock_prevent_release();
+    types_stop_world();
+    apply_slot_updates(updates);
+    types_start_world();
+    type_lock_allow_release();
+}
+
+#else
+
+// dummy definition, this parameter is only NULL in the default build
+typedef void slot_update_t;
+
+#endif
+
+/// data passed to update_slots_callback()
+typedef struct {
+    slot_update_t *queued_updates;
+    pytype_slotdef **defs;
+} update_callback_data_t;
+
 static void object_dealloc(PyObject *);
 static PyObject *object_new(PyTypeObject *, PyObject *, PyObject *);
 static int object_init(PyObject *, PyObject *, PyObject *);
-static int update_slot(PyTypeObject *, PyObject *);
+static int update_slot(PyTypeObject *, PyObject *, slot_update_t *update);
 static void fixup_slot_dispatchers(PyTypeObject *);
 static int type_new_set_names(PyTypeObject *);
 static int type_new_init_subclass(PyTypeObject *, PyObject *);
+static bool has_slotdef(PyObject *);
 
 /*
  * Helpers for __dict__ descriptor. We don't want to expose the dicts
@@ -3785,7 +4052,7 @@ type_init(PyObject *cls, PyObject *args, PyObject *kwds)
 unsigned long
 PyType_GetFlags(PyTypeObject *type)
 {
-    return FT_ATOMIC_LOAD_ULONG_RELAXED(type->tp_flags);
+    return type->tp_flags;
 }
 
 
@@ -4563,6 +4830,10 @@ type_new_impl(type_new_ctx *ctx)
     }
 
     assert(_PyType_CheckConsistency(type));
+#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG) && SIZEOF_VOID_P > 4
+    // After this point, other threads can potentially use this type.
+    ((PyObject*)type)->ob_flags |= _Py_TYPE_REVEALED_FLAG;
+#endif
 
     return (PyObject *)type;
 
@@ -4625,7 +4896,7 @@ type_new_get_bases(type_new_ctx *ctx, PyObject **type)
     }
 
     /* Calculate best base, and check that all bases are type objects */
-    PyTypeObject *base = best_base(ctx->bases);
+    PyTypeObject *base = find_best_base(ctx->bases);
     if (base == NULL) {
         return -1;
     }
@@ -5040,12 +5311,12 @@ PyType_FromMetaclass(
     }
 
     /* Calculate best base, and check that all bases are type objects */
-    PyTypeObject *base = best_base(bases); // borrowed ref
+    PyTypeObject *base = find_best_base(bases); // borrowed ref
     if (base == NULL) {
         goto finally;
     }
-    // best_base should check Py_TPFLAGS_BASETYPE & raise a proper exception,
-    // here we just check its work
+    // find_best_base() should check Py_TPFLAGS_BASETYPE & raise a proper
+    // exception; here we just check its work
     assert(_PyType_HasFeature(base, Py_TPFLAGS_BASETYPE));
 
     /* Calculate sizes */
@@ -5276,6 +5547,10 @@ PyType_FromMetaclass(
     }
 
     assert(_PyType_CheckConsistency(type));
+#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG) && SIZEOF_VOID_P > 4
+    // After this point, other threads can potentially use this type.
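+    // Setting _Py_TYPE_REVEALED_FLAG makes TYPE_IS_REVEALED() true, so the
+    // ASSERT_WORLD_STOPPED_OR_NEW_TYPE and ASSERT_NEW_TYPE_OR_LOCKED debug
+    // checks start enforcing their stop-the-world and type lock requirements
+    // for this type.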
+ ((PyObject*)type)->ob_flags |= _Py_TYPE_REVEALED_FLAG; +#endif finally: if (PyErr_Occurred()) { @@ -5571,8 +5846,6 @@ PyObject_GetItemData(PyObject *obj) static PyObject * find_name_in_mro(PyTypeObject *type, PyObject *name, int *error) { - ASSERT_TYPE_LOCK_HELD(); - Py_hash_t hash = _PyObject_HashFast(name); if (hash == -1) { *error = -1; @@ -5881,9 +6154,13 @@ _PyType_CacheGetItemForSpecialization(PyHeapTypeObject *ht, PyObject *descriptor void _PyType_SetFlags(PyTypeObject *self, unsigned long mask, unsigned long flags) { - BEGIN_TYPE_LOCK(); - type_set_flags_with_mask(self, mask, flags); - END_TYPE_LOCK(); + unsigned long new_flags = (self->tp_flags & ~mask) | flags; + if (new_flags != self->tp_flags) { + types_stop_world(); + // can't use new_flags here since they could be out-of-date + self->tp_flags = (self->tp_flags & ~mask) | flags; + types_start_world(); + } } int @@ -5930,9 +6207,9 @@ set_flags_recursive(PyTypeObject *self, unsigned long mask, unsigned long flags) void _PyType_SetFlagsRecursive(PyTypeObject *self, unsigned long mask, unsigned long flags) { - BEGIN_TYPE_LOCK(); + types_stop_world(); set_flags_recursive(self, mask, flags); - END_TYPE_LOCK(); + types_start_world(); } /* This is similar to PyObject_GenericGetAttr(), @@ -6046,6 +6323,8 @@ _Py_type_getattro(PyObject *tp, PyObject *name) return _Py_type_getattro_impl(type, name, NULL); } +// Called by type_setattro(). Updates both the type dict and +// the type versions. static int type_update_dict(PyTypeObject *type, PyDictObject *dict, PyObject *name, PyObject *value, PyObject **old_value) @@ -6075,10 +6354,30 @@ type_update_dict(PyTypeObject *type, PyDictObject *dict, PyObject *name, return -1; } - if (is_dunder_name(name)) { - return update_slot(type, name); - } + return 0; +} + +static int +update_slot_after_setattr(PyTypeObject *type, PyObject *name) +{ +#ifdef Py_GIL_DISABLED + // stack allocate one chunk since that's all we need + assert(SLOT_UPDATE_CHUNK_SIZE >= MAX_EQUIV); + slot_update_chunk_t chunk = {0}; + slot_update_t queued_updates = {&chunk}; + if (update_slot(type, name, &queued_updates) < 0) { + return -1; + } + if (queued_updates.head->n > 0) { + apply_slot_updates_world_stopped(&queued_updates); + ASSERT_TYPE_LOCK_HELD(); + // should never allocate another chunk + assert(chunk.prev == NULL); + } +#else + update_slot(type, name, NULL); +#endif return 0; } @@ -6136,7 +6435,9 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) PyObject *dict = type->tp_dict; if (dict == NULL) { - // We don't just do PyType_Ready because we could already be readying + // This is an unlikely case. PyType_Ready has not yet been done and + // we need to initialize tp_dict. We don't just do PyType_Ready + // because we could already be readying. BEGIN_TYPE_LOCK(); dict = type->tp_dict; if (dict == NULL) { @@ -6152,6 +6453,12 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) BEGIN_TYPE_DICT_LOCK(dict); res = type_update_dict(type, (PyDictObject *)dict, name, value, &old_value); assert(_PyType_CheckConsistency(type)); + if (res == 0) { + if (is_dunder_name(name) && has_slotdef(name)) { + // The name corresponds to a type slot. 
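+            // For example, assigning C.__getitem__ means the mp_subscript
+            // and sq_item slots of C and of its subclasses may need to be
+            // updated.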
+ res = update_slot_after_setattr(type, name); + } + } END_TYPE_DICT_LOCK(); done: @@ -7081,15 +7388,10 @@ object_set_class(PyObject *self, PyObject *value, void *closure) return -1; } -#ifdef Py_GIL_DISABLED - PyInterpreterState *interp = _PyInterpreterState_GET(); - _PyEval_StopTheWorld(interp); -#endif + types_stop_world(); PyTypeObject *oldto = Py_TYPE(self); int res = object_set_class_world_stopped(self, newto); -#ifdef Py_GIL_DISABLED - _PyEval_StartTheWorld(interp); -#endif + types_start_world(); if (res == 0) { if (oldto->tp_flags & Py_TPFLAGS_HEAPTYPE) { Py_DECREF(oldto); @@ -8497,7 +8799,7 @@ type_ready_mro(PyTypeObject *type, int initial) } /* Calculate method resolution order */ - if (mro_internal_unlocked(type, initial, NULL) < 0) { + if (mro_internal(type, initial, NULL) < 0) { return -1; } PyObject *mro = lookup_tp_mro(type); @@ -11020,12 +11322,21 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) { /* XXX Maybe this could be optimized more -- but is it worth it? */ +#ifdef Py_GIL_DISABLED + pytype_slotdef *ptrs[MAX_EQUIV]; + pytype_slotdef **pp = ptrs; + /* Collect all slotdefs that match name into ptrs. */ + for (pytype_slotdef *p = slotdefs; p->name_strobj; p++) { + if (p->name_strobj == name) + *pp++ = p; + } + *pp = NULL; +#else /* pname and ptrs act as a little cache */ PyInterpreterState *interp = _PyInterpreterState_GET(); #define pname _Py_INTERP_CACHED_OBJECT(interp, type_slots_pname) #define ptrs _Py_INTERP_CACHED_OBJECT(interp, type_slots_ptrs) pytype_slotdef *p, **pp; - void **res, **ptr; if (pname != name) { /* Collect all slotdefs that match name into ptrs. */ @@ -11037,10 +11348,12 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) } *pp = NULL; } +#endif /* Look in all slots of the type matching the name. If exactly one of these has a filled-in slot, return a pointer to that slot. Otherwise, return NULL. */ + void **res, **ptr; res = NULL; for (pp = ptrs; *pp; pp++) { ptr = slotptr(type, (*pp)->offset); @@ -11050,11 +11363,25 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) return NULL; res = ptr; } - return res; +#ifndef Py_GIL_DISABLED #undef pname #undef ptrs +#endif + return res; } +// Return true if "name" corresponds to at least one slot definition. This is +// a more accurate but more expensive test compared to is_dunder_name(). +static bool +has_slotdef(PyObject *name) +{ + for (pytype_slotdef *p = slotdefs; p->name_strobj; p++) { + if (p->name_strobj == name) { + return true; + } + } + return false; +} /* Common code for update_slots_callback() and fixup_slot_dispatchers(). * @@ -11107,13 +11434,22 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) * There are some further special cases for specific slots, like supporting * __hash__ = None for tp_hash and special code for tp_new. * - * When done, return a pointer to the next slotdef with a different offset, - * because that's convenient for fixup_slot_dispatchers(). This function never - * sets an exception: if an internal error happens (unlikely), it's ignored. */ -static pytype_slotdef * -update_one_slot(PyTypeObject *type, pytype_slotdef *p) + * When done, next_p is set to the next slotdef with a different offset, + * because that's convenient for fixup_slot_dispatchers(). + * + * If the queued_updates pointer is provided, the actual updates to the slot + * pointers are queued, rather than being immediately performed. That argument + * is only used for the free-threaded build since those updates need to be + * done while the world is stopped. 
+ * + * This function will only return an error if the queued_updates argument is + * provided and allocating memory for the queue fails. Other exceptions that + * occur internally are ignored, such as when looking up descriptors. */ +static int +update_one_slot(PyTypeObject *type, pytype_slotdef *p, pytype_slotdef **next_p, + slot_update_t *queued_updates) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(type); PyObject *descr; PyWrapperDescrObject *d; @@ -11136,7 +11472,10 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) do { ++p; } while (p->offset == offset); - return p; + if (next_p != NULL) { + *next_p = p; + } + return 0; } /* We may end up clearing live exceptions below, so make sure it's ours. */ assert(!PyErr_Occurred()); @@ -11219,16 +11558,41 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) } if (p->function == slot_tp_call) { /* A generic __call__ is incompatible with vectorcall */ - type_clear_flags(type, Py_TPFLAGS_HAVE_VECTORCALL); + if (queued_updates == NULL) { + type_clear_flags(type, Py_TPFLAGS_HAVE_VECTORCALL); + } } } Py_DECREF(descr); } while ((++p)->offset == offset); - if (specific && !use_generic) - *ptr = specific; - else - *ptr = generic; - return p; + + void *slot_value; + if (specific && !use_generic) { + slot_value = specific; + } else { + slot_value = generic; + } + +#ifdef Py_GIL_DISABLED + if (queued_updates != NULL) { + // queue the update to perform later, while world is stopped + if (queue_slot_update(queued_updates, type, ptr, slot_value) < 0) { + return -1; + } + } else { + // do the update to the type structure now + *ptr = slot_value; + } +#else + // always do the update immediately + assert(queued_updates == NULL); + *ptr = slot_value; +#endif + + if (next_p != NULL) { + *next_p = p; + } + return 0; } /* In the type, update the slots whose slotdefs are gathered in the pp array. @@ -11236,18 +11600,21 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) static int update_slots_callback(PyTypeObject *type, void *data) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(type); - pytype_slotdef **pp = (pytype_slotdef **)data; + update_callback_data_t *update_data = (update_callback_data_t *)data; + pytype_slotdef **pp = update_data->defs; for (; *pp; pp++) { - update_one_slot(type, *pp); + if (update_one_slot(type, *pp, NULL, update_data->queued_updates) < 0) { + return -1; + } } return 0; } /* Update the slots after assignment to a class (type) attribute. */ static int -update_slot(PyTypeObject *type, PyObject *name) +update_slot(PyTypeObject *type, PyObject *name, slot_update_t *queued_updates) { pytype_slotdef *ptrs[MAX_EQUIV]; pytype_slotdef *p; @@ -11278,8 +11645,12 @@ update_slot(PyTypeObject *type, PyObject *name) } if (ptrs[0] == NULL) return 0; /* Not an attribute that affects any slots */ + + update_callback_data_t callback_data; + callback_data.defs = ptrs; + callback_data.queued_updates = queued_updates; return update_subclasses(type, name, - update_slots_callback, (void *)ptrs); + update_slots_callback, (void *)&callback_data); } /* Store the proper functions in the slot dispatches at class (type) @@ -11288,35 +11659,56 @@ update_slot(PyTypeObject *type, PyObject *name) static void fixup_slot_dispatchers(PyTypeObject *type) { - // This lock isn't strictly necessary because the type has not been - // exposed to anyone else yet, but update_ont_slot calls find_name_in_mro - // where we'd like to assert that the type is locked. 
-    BEGIN_TYPE_LOCK();
-
     assert(!PyErr_Occurred());
     for (pytype_slotdef *p = slotdefs; p->name; ) {
-        p = update_one_slot(type, p);
+        update_one_slot(type, p, &p, NULL);
     }
-
-    END_TYPE_LOCK();
 }
 
-static void
+#ifdef Py_GIL_DISABLED
+
+// Called when __bases__ is re-assigned.
+static int
 update_all_slots(PyTypeObject* type)
 {
-    pytype_slotdef *p;
-
-    ASSERT_TYPE_LOCK_HELD();
+    // Note that update_slot() can fail due to out-of-memory when allocating
+    // the queue chunks to hold the updates. That's unlikely since the number
+    // of updates is normally small but we handle that case. update_slot()
+    // can fail internally for other reasons (a lookup fails) but those
+    // errors are suppressed.
+    slot_update_t queued_updates = {0};
+    for (pytype_slotdef *p = slotdefs; p->name; p++) {
+        if (update_slot(type, p->name_strobj, &queued_updates) < 0) {
+            if (queued_updates.head) {
+                slot_update_free_chunks(&queued_updates);
+            }
+            return -1;
+        }
+    }
+    if (queued_updates.head != NULL) {
+        apply_slot_updates_world_stopped(&queued_updates);
+        ASSERT_TYPE_LOCK_HELD();
+        slot_update_free_chunks(&queued_updates);
+    }
+    return 0;
+}
 
-    /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */
-    type_modified_unlocked(type);
+#else
 
+// Called when __bases__ is re-assigned.
+static int
+update_all_slots(PyTypeObject* type)
+{
+    pytype_slotdef *p;
     for (p = slotdefs; p->name; p++) {
-        /* update_slot returns int but can't actually fail */
-        update_slot(type, p->name_strobj);
+        /* update_slot returns int but can't actually fail in this case */
+        update_slot(type, p->name_strobj, NULL);
     }
+    return 0;
 }
 
+#endif
+
 
 PyObject *
 _PyType_GetSlotWrapperNames(void)
@@ -11586,7 +11978,10 @@ PyType_Freeze(PyTypeObject *type)
     }
 
     BEGIN_TYPE_LOCK();
+    types_stop_world();
     type_add_flags(type, Py_TPFLAGS_IMMUTABLETYPE);
+    types_start_world();
+    ASSERT_TYPE_LOCK_HELD();
 
     type_modified_unlocked(type);
     END_TYPE_LOCK();
diff --git a/Python/ceval.c b/Python/ceval.c
index 490b653f132a6a..56589ec48a58a6 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -139,6 +139,19 @@
 #endif
 
 
+static void
+check_invalid_reentrancy(void)
+{
+#if defined(Py_DEBUG) && defined(Py_GIL_DISABLED)
+    // In the free-threaded build, the interpreter must not be re-entered
+    // while the world is stopped. If it is, that's a bug somewhere (quite
+    // likely in the painfully complex typeobject code).
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    assert(!interp->stoptheworld.world_stopped);
+#endif
+}
+
+
 #ifdef Py_DEBUG
 static void
 dump_item(_PyStackRef item)
@@ -999,6 +1012,7 @@ PyObject* _Py_HOT_FUNCTION DONT_SLP_VECTORIZE
 _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag)
 {
     _Py_EnsureTstateNotNULL(tstate);
+    check_invalid_reentrancy();
     CALL_STAT_INC(pyeval_calls);
 
 #if USE_COMPUTED_GOTOS && !Py_TAIL_CALL_INTERP
diff --git a/Tools/tsan/suppressions_free_threading.txt b/Tools/tsan/suppressions_free_threading.txt
index 3230f969436c82..93421b623b92f9 100644
--- a/Tools/tsan/suppressions_free_threading.txt
+++ b/Tools/tsan/suppressions_free_threading.txt
@@ -12,15 +12,12 @@
 
 # These warnings trigger directly in a CPython function.
-race_top:assign_version_tag -race_top:_Py_slot_tp_getattr_hook race_top:dump_traceback race_top:fatal_error race_top:_PyFrame_GetCode race_top:_PyFrame_Initialize race_top:_PyObject_TryGetInstanceAttribute race_top:PyUnstable_InterpreterFrame_GetLine -race_top:type_modified_unlocked race_top:write_thread_id # gh-129068: race on shared range iterators (test_free_threading.test_zip.ZipThreading.test_threading) @@ -29,9 +26,6 @@ race_top:rangeiter_next # gh-129748: test.test_free_threading.test_slots.TestSlots.test_object race_top:mi_block_set_nextx -# gh-127266: type slot updates are not thread-safe (test_opcache.test_load_attr_method_lazy_dict) -race_top:update_one_slot - # https://gist.github.com/mpage/6962e8870606cfc960e159b407a0cb40 thread:pthread_create @@ -49,7 +43,6 @@ race:list_inplace_repeat_lock_held race:PyObject_Realloc # gh-133467. Some of these could be hard to trigger. -race_top:update_one_slot race_top:_Py_slot_tp_getattr_hook race_top:slot_tp_descr_get race_top:type_set_name