gh-115103: Implement delayed memory reclamation (QSBR) #115180

Merged: 9 commits, Feb 16, 2024

Changes from 1 commit
Fix potential deadlock and other changes from review
colesbury committed Feb 15, 2024
commit 94a95d5bf7ff81574c2e27a5ddb2461fadd37066
Include/internal/pycore_qsbr.h (8 changes: 8 additions & 0 deletions)

@@ -21,6 +21,14 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define"
#endif

+// The shared write sequence is always odd and incremented by two. Detached
+// threads are indicated by a read sequence of zero. This avoids collisions
+// between the offline state and any valid sequence number even if the
+// sequence numbers wrap around.
+#define QSBR_OFFLINE 0
+#define QSBR_INITIAL 1
+#define QSBR_INCR 2

struct _qsbr_shared;
struct _PyThreadStateImpl; // forward declare to avoid circular dependency

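For readers skimming the diff, a minimal standalone sketch (not part of the PR) of the invariant these constants encode: the write sequence starts odd (QSBR_INITIAL) and always advances by an even step (QSBR_INCR), so it stays odd even across unsigned wraparound and can never equal the even QSBR_OFFLINE marker.

#include <assert.h>
#include <stdint.h>

#define QSBR_OFFLINE 0
#define QSBR_INITIAL 1
#define QSBR_INCR 2

int main(void)
{
    uint64_t seq = QSBR_INITIAL;
    for (int i = 0; i < 1000000; i++) {
        seq += QSBR_INCR;        /* wraps modulo 2^64 eventually */
        assert(seq % 2 == 1);    /* odd: never collides with QSBR_OFFLINE */
    }
    return 0;
}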
Include/internal/pycore_runtime_init.h (5 changes: 3 additions & 2 deletions)

@@ -17,6 +17,7 @@ extern "C" {
#include "pycore_pyhash.h" // pyhash_state_INIT
#include "pycore_pymem_init.h" // _pymem_allocators_standard_INIT
#include "pycore_pythread.h" // _pythread_RUNTIME_INIT
#include "pycore_qsbr.h" // QSBR_INITIAL
#include "pycore_runtime_init_generated.h" // _Py_bytes_characters_INIT
#include "pycore_signal.h" // _signals_RUNTIME_INIT
#include "pycore_tracemalloc.h" // _tracemalloc_runtime_state_INIT
@@ -170,8 +171,8 @@ extern PyTypeObject _PyExc_MemoryError;
}, \
}, \
.qsbr = { \
-.wr_seq = 1, \
-.rd_seq = 1, \
+.wr_seq = QSBR_INITIAL, \
+.rd_seq = QSBR_INITIAL, \
}, \
.dtoa = _dtoa_state_INIT(&(INTERP)), \
.dict_state = _dict_state_INIT, \
Python/pystate.c (9 changes: 6 additions & 3 deletions)

@@ -1412,9 +1412,6 @@ new_threadstate(PyInterpreterState *interp, int whence)
sizeof(*tstate));
}

-#ifdef Py_GIL_DISABLED
-_Py_qsbr_register(tstate, interp, qsbr_idx);
-#endif
init_threadstate(tstate, interp, id, whence);
add_threadstate(interp, (PyThreadState *)tstate, old_head);

@@ -1423,6 +1420,12 @@ new_threadstate(PyInterpreterState *interp, int whence)
// Must be called with lock unlocked to avoid re-entrancy deadlock.
PyMem_RawFree(new_tstate);
}

+#ifdef Py_GIL_DISABLED
+// Must be called with lock unlocked to avoid lock ordering deadlocks.
+_Py_qsbr_register(tstate, interp, qsbr_idx);
+#endif

return (PyThreadState *)tstate;
}

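For context, a generic pthreads illustration of the lock-ordering hazard this move avoids (hypothetical lock names, not CPython code): if one thread acquires lock A then lock B while another acquires B then A, an unlucky interleaving leaves each waiting on the other forever. Calling _Py_qsbr_register() only after the thread-state lock is released means the two locks are never held together.

#include <pthread.h>

/* Two locks taken in opposite orders by two threads: the classic
 * lock-ordering deadlock. */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

static void *worker_1(void *arg)
{
    pthread_mutex_lock(&lock_a);   /* e.g. a thread-state list lock */
    pthread_mutex_lock(&lock_b);   /* e.g. a QSBR mutex: may block forever */
    pthread_mutex_unlock(&lock_b);
    pthread_mutex_unlock(&lock_a);
    return arg;
}

static void *worker_2(void *arg)
{
    pthread_mutex_lock(&lock_b);
    pthread_mutex_lock(&lock_a);   /* opposite order: deadlock possible */
    pthread_mutex_unlock(&lock_a);
    pthread_mutex_unlock(&lock_b);
    return arg;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, worker_1, NULL);
    pthread_create(&t2, NULL, worker_2, NULL);
    pthread_join(t1, NULL);        /* may never return */
    pthread_join(t2, NULL);
    return 0;
}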
Python/qsbr.c (22 changes: 8 additions & 14 deletions)

@@ -45,12 +45,6 @@
// Starting size of the array of qsbr thread states
#define MIN_ARRAY_SIZE 8

-// The shared write sequence is always odd and incremented by two. Detached
-// threads are indicated by a read sequence of zero.
-#define QSBR_OFFLINE 0
-#define QSBR_INITIAL 1
-#define QSBR_INCR 2
-
// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
// the write sequence.
#define QSBR_DEFERRED_LIMIT 10
Expand All @@ -72,7 +66,7 @@ qsbr_allocate(struct _qsbr_shared *shared)

// Initialize (or reinitialize) the freelist of QSBR thread states
static void
-initialize_freelist(struct _qsbr_shared *shared)
+initialize_new_array(struct _qsbr_shared *shared)
{
for (Py_ssize_t i = 0; i != shared->size; i++) {
struct _qsbr_thread_state *qsbr = &shared->array[i].qsbr;
@@ -111,7 +105,7 @@ grow_thread_array(struct _qsbr_shared *shared)
shared->array = array;
shared->size = new_size;
shared->freelist = NULL;
-initialize_freelist(shared);
+initialize_new_array(shared);

PyMem_RawFree(old);
return 0;
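A rough sketch (not part of the diff) of the intrusive-freelist idiom that initialize_new_array() embodies, with field and function names simplified and assumed rather than taken from the real structs: after the backing array grows, every unallocated slot is pushed onto a singly linked freelist threaded through the entries themselves.

#include <stddef.h>

struct qsbr_entry {
    struct qsbr_entry *freelist_next;  /* links free slots together */
    int allocated;                     /* nonzero while a thread owns it */
};

/* Rebuild the freelist over a (re)grown array, skipping live entries. */
static void
rebuild_freelist(struct qsbr_entry *array, ptrdiff_t size,
                 struct qsbr_entry **freelist)
{
    *freelist = NULL;
    for (ptrdiff_t i = 0; i < size; i++) {
        if (array[i].allocated) {
            continue;                  /* still in use; keep it out */
        }
        array[i].freelist_next = *freelist;
        *freelist = &array[i];
    }
}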
@@ -120,6 +114,9 @@ grow_thread_array(struct _qsbr_shared *shared)
uint64_t
_Py_qsbr_advance(struct _qsbr_shared *shared)
{
+// NOTE: with 64-bit sequence numbers, we don't have to worry too much
+// about the wr_seq getting too far ahead of rd_seq, but if we ever use
+// 32-bit sequence numbers, we'll need to be more careful.
return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR) + QSBR_INCR;
Contributor:
The FreeBSD version has logic to make sure that wr_seq doesn't get too far ahead of rd_seq and, if it does, blocks until rd_seq catches up, to avoid extreme wraparound issues... Is there a reason we don't need that? :) Or should we at least add a TODO and/or an assertion?

Contributor (Author):
We are using 64-bit sequence counters to avoid worrying about wrap around. FreeBSD uses 32-bit counters. (2^62 ns = ~146 years; 2^30 ns = ~1 sec)

We could consider using 32-bit counters and handling wrap around in the future. It wouldn't matter much for x86-64 and aarch64, but would probably be more efficient on some other platforms.

Contributor:
Okay, that makes sense; FreeBSD's is buried in a typedef, so I hadn't noticed it was 32-bit.

}
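A standalone back-of-the-envelope check of the figures quoted in the thread above (a sketch assuming one sequence advance per nanosecond; not PR code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const double ns_per_year = 365.25 * 24.0 * 3600.0 * 1e9;
    double advances64 = (double)(UINT64_C(1) << 62);  /* 2^62 advances */
    double advances32 = (double)(UINT32_C(1) << 30);  /* 2^30 advances */
    /* Prints roughly 146 years and 1.07 seconds, matching the review. */
    printf("64-bit: ~%.0f years at one advance per ns\n",
           advances64 / ns_per_year);
    printf("32-bit: ~%.2f seconds at one advance per ns\n",
           advances32 / 1e9);
    return 0;
}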

@@ -200,7 +197,7 @@ _Py_qsbr_reserve(PyInterpreterState *interp)
{
struct _qsbr_shared *shared = &interp->qsbr;

-PyMutex_LockFlags(&shared->mutex, _Py_LOCK_DONT_DETACH);
+PyMutex_Lock(&shared->mutex);
// Try allocating from our internal freelist
struct _qsbr_thread_state *qsbr = qsbr_allocate(shared);

Expand Down Expand Up @@ -231,10 +228,7 @@ _Py_qsbr_register(_PyThreadStateImpl *tstate, PyInterpreterState *interp,
// Associate the QSBR state with the thread state
struct _qsbr_shared *shared = &interp->qsbr;

-// NOTE: this function is called with runtime locked, so we don't detach
-// while waiting for the lock. This prevents a stop-the-world pause
-// while the runtime lock is held, which could lead to deadlock.
-PyMutex_LockFlags(&shared->mutex, _Py_LOCK_DONT_DETACH);
+PyMutex_Lock(&shared->mutex);
struct _qsbr_thread_state *qsbr = &interp->qsbr.array[index].qsbr;
assert(qsbr->allocated && qsbr->tstate == NULL);
qsbr->tstate = (PyThreadState *)tstate;
@@ -250,7 +244,7 @@ _Py_qsbr_unregister(_PyThreadStateImpl *tstate)

assert(qsbr->seq == 0 && "thread state must be detached");

-PyMutex_LockFlags(&shared->mutex, _Py_LOCK_DONT_DETACH);
+PyMutex_Lock(&shared->mutex);
assert(qsbr->allocated && qsbr->tstate == (PyThreadState *)tstate);
tstate->qsbr = NULL;
qsbr->tstate = NULL;