diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 2c00efc255dc3c6..15632014b88e8be 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -2,7 +2,7 @@
 #include "pycore_pymem.h"
 
 #include <stdbool.h>
-
+#include <stddef.h>
 
 /* Defined in tracemalloc.c */
 extern void _PyMem_DumpTraceback(int fd, const void *ptr);
@@ -849,9 +849,16 @@ static int running_on_valgrind = -1;
 #define ALIGNMENT_SHIFT         3
 #endif
 
+#if ALIGNMENT != 1 << ALIGNMENT_SHIFT
+#   error "ALIGNMENT inconsistent with ALIGNMENT_SHIFT"
+#endif
+
 /* Return the number of bytes in size class I, as a uint. */
 #define INDEX2SIZE(I) (((uint)(I) + 1) << ALIGNMENT_SHIFT)
 
+/* true iff integer N is > 0 and a power of 2 (N is evaluated 3 times) */
+#define IS_POWEROF2(N) ((N) > 0 && ((N) & ((N) - 1)) == 0)
+
 /*
  * Max size threshold below which malloc requests are considered to be
  * small enough in order to use preallocated memory pools. You can tune
@@ -870,57 +877,88 @@ static int running_on_valgrind = -1;
 #define SMALL_REQUEST_THRESHOLD 512
 #define NB_SMALL_SIZE_CLASSES   (SMALL_REQUEST_THRESHOLD / ALIGNMENT)
 
-/*
- * The system's VMM page size can be obtained on most unices with a
- * getpagesize() call or deduced from various header files. To make
- * things simpler, we assume that it is 4K, which is OK for most systems.
- * It is probably better if this is the native page size, but it doesn't
- * have to be.  In theory, if SYSTEM_PAGE_SIZE is larger than the native page
- * size, then `POOL_ADDR(p)->arenaindex' could rarely cause a segmentation
- * violation fault.  4K is apparently OK for all the platforms that python
- * currently targets.
- */
-#define SYSTEM_PAGE_SIZE        (4 * 1024)
+#if !(ALIGNMENT <= SMALL_REQUEST_THRESHOLD && SMALL_REQUEST_THRESHOLD <= 512)
+#   error "SMALL_REQUEST_THRESHOLD out of bounds"
+#endif
+#if NB_SMALL_SIZE_CLASSES * ALIGNMENT != SMALL_REQUEST_THRESHOLD
+#   error "SMALL_REQUEST_THRESHOLD must be a multiple of ALIGNMENT"
+#endif
+
+/* -----------------------------------------------------------------------
+SYSTEM_PAGE_SIZE must be a power of 2 and no larger than the OS's page size.
+4K is the largest value that works on all known systems (in particular,
+Windows uses 4K pages).  obmalloc stores a uint of bookkeeping info at the
+start of every page it controls, and uses address arithmetic on pointers to
+find the page an address belongs to.  If SYSTEM_PAGE_SIZE is larger than the
+OS's page size, this can segfault.  See address_in_range() for details.
+
+POOL_SIZE must be a power of 2, and at least as large as SYSTEM_PAGE_SIZE.
+It's fine if they have the same value.  The larger it is, the more small
+objects can be obtained from a pool, maximizing the times obmalloc can stay in
+its fastest code paths.  But a pool being used for objects of a specific size
+class can't be used for objects of other size classes, so very large POOL_SIZE
+can have bad effects too.  For example, if only a single object in the size
+class of range(160, 160 + ALIGNMENT) bytes is needed, the entire pool devoted
+to the size class will be devoted to holding that single object.  POOL_SIZE
+should also be large enough so that at least several objects of the largest
+"small object" size class can be obtained from a pool.
+
+ARENA_SIZE must be a power of 2, and should be large enough to hold at least
+a few dozen pools.  Arenas are obtained from the system, and obmalloc carves
+them up itself into pools, and in turn carves up pools into blocks of memory
+for small objects.  Arenas are obtained from mmap() on systems that support it,
+from VirtualAlloc() on Windows, or from malloc() if necessary.  A "large" value
+doesn't necessarily "waste memory" - these system calls typically reserve
+virtual address space for the current process, but don't consume physical RAM
+until pages within them are accessed.  obmalloc consumes arenas from lowest
+address to highest, recycling memory whenever possible, and never accesses a
+higher-addressed byte unless there's not enough contiguous free memory to
+satisfy a request from bytes already accessed.
+----------------------------------------------------------------------- */
+
+#define SYSTEM_PAGE_SIZE        (4 * 1024)             /*   4 KiB */
+
+#if SIZEOF_VOID_P > 4
+#define POOL_SIZE               (SYSTEM_PAGE_SIZE * 4) /*  16 KiB */
+#define ARENA_SIZE              (1 << 20)              /*   1 MiB */
+#else
+#define POOL_SIZE               SYSTEM_PAGE_SIZE       /*   4 KiB */
+#define ARENA_SIZE              (1 << 18)              /* 256 KiB */
+#endif
+
 #define SYSTEM_PAGE_SIZE_MASK   (SYSTEM_PAGE_SIZE - 1)
+#define POOL_SIZE_MASK          (POOL_SIZE - 1)
 
-/*
- * Maximum amount of memory managed by the allocator for small requests.
- */
-#ifdef WITH_MEMORY_LIMITS
-#ifndef SMALL_MEMORY_LIMIT
-#define SMALL_MEMORY_LIMIT      (64 * 1024 * 1024)      /* 64 MB -- more? */
+#if ! IS_POWEROF2(SYSTEM_PAGE_SIZE)
+#   error "SYSTEM_PAGE_SIZE must be power of 2"
 #endif
+
+#if ! IS_POWEROF2(POOL_SIZE)
+#   error "POOL_SIZE must be power of 2"
 #endif
 
-/*
- * The allocator sub-allocates <Big> blocks of memory (called arenas) aligned
- * on a page boundary. This is a reserved virtual address space for the
- * current process (obtained through a malloc()/mmap() call). In no way this
- * means that the memory arenas will be used entirely. A malloc(<Big>) is
- * usually an address range reservation for <Big> bytes, unless all pages within
- * this space are referenced subsequently. So malloc'ing big blocks and not
- * using them does not mean "wasting memory". It's an addressable range
- * wastage...
- *
- * Arenas are allocated with mmap() on systems supporting anonymous memory
- * mappings to reduce heap fragmentation.
- */
-#define ARENA_SIZE              (256 << 10)     /* 256KB */
+#if ! IS_POWEROF2(ARENA_SIZE)
+#   error "ARENA_SIZE must be power of 2"
+#endif
+
+#if ! (SYSTEM_PAGE_SIZE <= POOL_SIZE && POOL_SIZE < ARENA_SIZE)
+#   error "must have SYSTEM_PAGE_SIZE <= POOL_SIZE < ARENA_SIZE"
+#endif
+
+#define PAGES_PER_POOL (POOL_SIZE / SYSTEM_PAGE_SIZE)
+#define MAX_POOLS_IN_ARENA  (ARENA_SIZE / POOL_SIZE)
 
 #ifdef WITH_MEMORY_LIMITS
 #define MAX_ARENAS              (SMALL_MEMORY_LIMIT / ARENA_SIZE)
 #endif
 
 /*
- * Size of the pools used for small blocks. Should be a power of 2,
- * between 1K and SYSTEM_PAGE_SIZE, that is: 1k, 2k, 4k.
+ * Maximum amount of memory managed by the allocator for small requests.
  */
-#define POOL_SIZE               SYSTEM_PAGE_SIZE        /* must be 2^N */
-#define POOL_SIZE_MASK          SYSTEM_PAGE_SIZE_MASK
-
-#define MAX_POOLS_IN_ARENA  (ARENA_SIZE / POOL_SIZE)
-#if MAX_POOLS_IN_ARENA * POOL_SIZE != ARENA_SIZE
-#   error "arena size not an exact multiple of pool size"
+#ifdef WITH_MEMORY_LIMITS
+#ifndef SMALL_MEMORY_LIMIT
+#define SMALL_MEMORY_LIMIT      (64 * 1024 * 1024)      /* 64 MB -- more? */
+#endif
 #endif
 
 /*
@@ -934,15 +972,18 @@ typedef uint8_t block;
 
 /* Pool for small blocks. */
 struct pool_header {
-    union { block *_padding;
-            uint count; } ref;          /* number of allocated blocks    */
+    /* NOTE:  arenaindex must come first!  The arena index is copied into the
+     * start of every page, not just pages that start with a pool header.
+     */
+    uint arenaindex;                    /* index into arenas of base adr */
+    uint nalloc;                        /* number of allocated blocks    */
     block *freeblock;                   /* pool's free list head         */
     struct pool_header *nextpool;       /* next pool of this size class  */
     struct pool_header *prevpool;       /* previous pool       ""        */
-    uint arenaindex;                    /* index into arenas of base adr */
     uint szidx;                         /* block size class index        */
     uint nextoffset;                    /* bytes to virgin block         */
     uint maxnextoffset;                 /* largest valid nextoffset      */
+    uint nextpage;                      /* offset of next unused page    */
 };
 
 typedef struct pool_header *poolp;
@@ -995,8 +1036,22 @@ struct arena_object {
 /* Round pointer P down to the closest pool-aligned address <= P, as a poolp */
 #define POOL_ADDR(P) ((poolp)_Py_ALIGN_DOWN((P), POOL_SIZE))
 
-/* Return total number of blocks in pool of size index I, as a uint. */
-#define NUMBLOCKS(I) ((uint)(POOL_SIZE - POOL_OVERHEAD) / INDEX2SIZE(I))
+/* Round pointer P down to the closest page-aligned address <= P, as a void* */
+#define PAGE_ADDR(P) _Py_ALIGN_DOWN((P), SYSTEM_PAGE_SIZE)
+
+/* Total number of blocks a pool of size class `i` can hold, as a uint. */
+static uint
+NUMBLOCKS(int i)
+{
+    assert(0 <= i && i < NB_SMALL_SIZE_CLASSES);
+    /* Only the first page pays for a full pool header; each later page
+     * gives up just ALIGNMENT bytes to hold its copy of the arena index.
+     */
+    const uint blocksize = INDEX2SIZE(i);
+    const uint first_page = (SYSTEM_PAGE_SIZE - POOL_OVERHEAD) / blocksize;
+    const uint later_page = (SYSTEM_PAGE_SIZE - ALIGNMENT) / blocksize;
+    return first_page + later_page * (PAGES_PER_POOL - 1);
+}
 
 /*==========================================================================*/
 
@@ -1063,8 +1118,8 @@ blocks.  The offset from the pool_header to the start of "the next" virgin
 block is stored in the pool_header nextoffset member, and the largest value
 of nextoffset that makes sense is stored in the maxnextoffset member when a
 pool is initialized.  All the blocks in a pool have been passed out at least
-once when and only when nextoffset > maxnextoffset.
-
+once when and only when nextoffset > maxnextoffset (although the test later
+became a bit more involved, because a pool can now span multiple pages).
 
 Major obscurity:  While the usedpools vector is declared to have poolp
 entries, it doesn't really.  It really contains two pointers per (conceptual)
@@ -1074,29 +1129,20 @@ excruciating initialization code below fools C so that
     usedpool[i+i]
 
 "acts like" a genuine poolp, but only so long as you only reference its
-nextpool and prevpool members.  The "- 2*sizeof(block *)" gibberish is
-compensating for that a pool_header's nextpool and prevpool members
-immediately follow a pool_header's first two members:
-
-    union { block *_padding;
-            uint count; } ref;
-    block *freeblock;
-
-each of which consume sizeof(block *) bytes.  So what usedpools[i+i] really
-contains is a fudged-up pointer p such that *if* C believes it's a poolp
-pointer, then p->nextpool and p->prevpool are both p (meaning that the headed
-circular list is empty).
+nextpool and prevpool members.  What usedpools[i+i] really contains is a
+fudged-up pointer p such that *if* C believes it's a poolp pointer, then
+p->nextpool and p->prevpool are both p (meaning that the headed circular list
+is empty).
 
 It's unclear why the usedpools setup is so convoluted.  It could be to
 minimize the amount of cache required to hold this heavily-referenced table
 (which only *needs* the two interpool pointer members of a pool_header). OTOH,
 referencing code has to remember to "double the index" and doing so isn't
-free, usedpools[0] isn't a strictly legal pointer, and we're crucially relying
-on that C doesn't insert any padding anywhere in a pool_header at or before
-the prevpool member.
+free, and usedpools[0] isn't a strictly legal pointer.
 **************************************************************************** */
 
-#define PTA(x)  ((poolp )((uint8_t *)&(usedpools[2*(x)]) - 2*sizeof(block *)))
+#define PTA(x)  ((poolp )((uint8_t *)&(usedpools[2*(x)]) - \
+                           offsetof(struct pool_header, nextpool)))
 #define PT(x)   PTA(x), PTA(x)
 
 static poolp usedpools[2 * ((NB_SMALL_SIZE_CLASSES + 7) / 8) * 8] = {
@@ -1225,7 +1271,7 @@ _Py_GetAllocatedBlocks(void)
         assert(base <= (uintptr_t) arenas[i].pool_address);
         for (; base < (uintptr_t) arenas[i].pool_address; base += POOL_SIZE) {
             poolp p = (poolp)base;
-            n += p->ref.count;
+            n += p->nalloc;
         }
     }
     return n;
@@ -1331,17 +1377,15 @@ new_arena(void)
 
 
 /*
-address_in_range(P, POOL)
+address_in_range(P)
 
-Return true if and only if P is an address that was allocated by pymalloc.
-POOL must be the pool address associated with P, i.e., POOL = POOL_ADDR(P)
-(the caller is asked to compute this because the macro expands POOL more than
-once, and for efficiency it's best for the caller to assign POOL_ADDR(P) to a
-variable and pass the latter to the macro; because address_in_range is
-called on every alloc/realloc/free, micro-efficiency is important here).
+P must be an address that was previously returned by a call to an obmalloc
+malloc or realloc spelling.  Return true if and only if P is an address
+that was obtained from an obmalloc pool (so false if and only if P was obtained
+from the system malloc/realloc instead).
 
-Tricky:  Let B be the arena base address associated with the pool, B =
-arenas[(POOL)->arenaindex].address.  Then P belongs to the arena if and only if
+Tricky:  Let B be the arena base address of the page associated with P.  Then
+P belongs to the arena if and only if
 
     B <= P < B + ARENA_SIZE
 
@@ -1357,36 +1401,37 @@ case.  We're relying on that maxarenas is also 0 in that case, so that
 (POOL)->arenaindex < maxarenas  must be false, saving us from trying to index
 into a NULL arenas.
 
-Details:  given P and POOL, the arena_object corresponding to P is AO =
-arenas[(POOL)->arenaindex].  Suppose obmalloc controls P.  Then (barring wild
-stores, etc), POOL is the correct address of P's pool, AO.address is the
-correct base address of the pool's arena, and P must be within ARENA_SIZE of
-AO.address.  In addition, AO.address is not 0 (no arena can start at address 0
-(NULL)).  Therefore address_in_range correctly reports that obmalloc
-controls P.
-
-Now suppose obmalloc does not control P (e.g., P was obtained via a direct
-call to the system malloc() or realloc()).  (POOL)->arenaindex may be anything
-in this case -- it may even be uninitialized trash.  If the trash arenaindex
-is >= maxarenas, the macro correctly concludes at once that obmalloc doesn't
-control P.
+Details:  obmalloc stores the arena index at the start of every page in every
+pool.  So, assuming P did come from an obmalloc pool, its correct arena index
+AI is read from the start of the page P belongs to, and the arena_object
+corresponding to P is AO = arenas[AI].  Suppose obmalloc controls P.  Then
+(barring wild stores, etc), AO.address is the correct base address of P's
+arena, and P must be within ARENA_SIZE of AO.address.  In addition, AO.address
+is not 0 (no arena can start at address 0 (NULL)).  Therefore address_in_range
+correctly reports that obmalloc controls P.
+
+Now suppose obmalloc does not control P (e.g., P was obtained via a call to
+the system malloc() or realloc()).  The arena index read up from the start of
+P's page may be anything in this case -- it may even be uninitialized trash.
+If the trash arenaindex >= maxarenas, the function correctly concludes at once
+that obmalloc doesn't control P.
 
 Else arenaindex is < maxarena, and AO is read up.  If AO corresponds to an
 allocated arena, obmalloc controls all the memory in slice AO.address :
 AO.address+ARENA_SIZE.  By case assumption, P is not controlled by obmalloc,
-so P doesn't lie in that slice, so the macro correctly reports that P is not
+so P doesn't lie in that slice, so the function correctly reports that P is not
 controlled by obmalloc.
 
 Finally, if P is not controlled by obmalloc and AO corresponds to an unused
 arena_object (one not currently associated with an allocated arena),
-AO.address is 0, and the second test in the macro reduces to:
+AO.address is 0, and the second test in the function reduces to:
 
     P < ARENA_SIZE
 
-If P >= ARENA_SIZE (extremely likely), the macro again correctly concludes
+If P >= ARENA_SIZE (extremely likely), the function again correctly concludes
 that P is not controlled by obmalloc.  However, if P < ARENA_SIZE, this part
 of the test still passes, and the third clause (AO.address != 0) is necessary
-to get the correct result:  AO.address is 0 in this case, so the macro
+to get the correct result:  AO.address is 0 in this case, so the function
 correctly reports that P is not controlled by obmalloc (despite that P lies in
 slice AO.address : AO.address + ARENA_SIZE).
 
@@ -1397,28 +1442,29 @@ obmalloc, AO corresponds to an unused arena_object, and P < ARENA_SIZE" case
 was impossible.
 
 Note that the logic is excruciating, and reading up possibly uninitialized
-memory when P is not controlled by obmalloc (to get at (POOL)->arenaindex)
+memory when P is not controlled by obmalloc (to get at the arena index)
 creates problems for some memory debuggers.  The overwhelming advantage is
 that this test determines whether an arbitrary address is controlled by
 obmalloc in a small constant time, independent of the number of arenas
-obmalloc controls.  Since this test is needed at every entry point, it's
+obmalloc controls.  Since this test is needed in every free/realloc, it's
 extremely desirable that it be this fast.
 */
 
 static bool _Py_NO_ADDRESS_SAFETY_ANALYSIS
             _Py_NO_SANITIZE_THREAD
             _Py_NO_SANITIZE_MEMORY
-address_in_range(void *p, poolp pool)
+address_in_range(void *p)
 {
-    // Since address_in_range may be reading from memory which was not allocated
-    // by Python, it is important that pool->arenaindex is read only once, as
+    // An arenaindex is stored at the start of every page obmalloc controls.
+    // Since address_in_range may be reading from memory not under obmalloc's
+    // control, it is important that the arenaindex is read only once, as
     // another thread may be concurrently modifying the value without holding
-    // the GIL. The following dance forces the compiler to read pool->arenaindex
-    // only once.
-    uint arenaindex = *((volatile uint *)&pool->arenaindex);
+    // the GIL.  Ensuring it's read only once is the purpose of "volatile".
+    uint arenaindex = *((volatile uint *)PAGE_ADDR(p));
+    uintptr_t base;  // arena base address; assigned inside the test below
     return arenaindex < maxarenas &&
-        (uintptr_t)p - arenas[arenaindex].address < ARENA_SIZE &&
-        arenas[arenaindex].address != 0;
+        (uintptr_t)p - (base = arenas[arenaindex].address) < ARENA_SIZE &&
+        base != 0;
 }
 
 
@@ -1471,7 +1517,7 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
          * There is a used pool for this size class.
          * Pick up the head block of its free list.
          */
-        ++pool->ref.count;
+        ++pool->nalloc;
         bp = pool->freeblock;
         assert(bp != NULL);
         if ((pool->freeblock = *(block **)bp) != NULL) {
@@ -1481,16 +1527,29 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
         /*
          * Reached the end of the free list, try to extend it.
          */
-        if (pool->nextoffset <= pool->maxnextoffset) {
-            /* There is room for another block. */
-            pool->freeblock = (block*)pool +
-                              pool->nextoffset;
+        if (pool->nextoffset < POOL_SIZE) {
+            /* There's probably room for another block. */
+            if (pool->nextoffset > pool->maxnextoffset) {
+                /* Need to move to next page. */
+                pool->nextoffset = pool->nextpage + ALIGNMENT;
+                if (pool->nextoffset >= POOL_SIZE) {
+                    /* Nope!  There are no more blocks in this pool. */
+                    assert(pool->nextpage == POOL_SIZE);
+                    goto cant_extend;
+                }
+                *(uint *)((block *)pool + pool->nextpage) = pool->arenaindex;
+                pool->nextpage += SYSTEM_PAGE_SIZE;
+                pool->maxnextoffset = pool->nextpage - INDEX2SIZE(size);
+                assert(pool->nextoffset <= pool->maxnextoffset);
+            }
+            pool->freeblock = (block *)pool + pool->nextoffset;
             pool->nextoffset += INDEX2SIZE(size);
             *(block **)(pool->freeblock) = NULL;
             goto success;
         }
 
         /* Pool is full, unlink from used pools. */
+cant_extend:
         next = pool->nextpool;
         pool = pool->prevpool;
         next->prevpool = pool;
@@ -1572,7 +1631,7 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
         pool->prevpool = next;
         next->nextpool = pool;
         next->prevpool = pool;
-        pool->ref.count = 1;
+        pool->nalloc = 1;
         if (pool->szidx == size) {
             /* Luckily, this pool last contained blocks
              * of the same size class, so its header
@@ -1592,9 +1651,10 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
         size = INDEX2SIZE(size);
         bp = (block *)pool + POOL_OVERHEAD;
         pool->nextoffset = POOL_OVERHEAD + (size << 1);
-        pool->maxnextoffset = POOL_SIZE - size;
+        pool->maxnextoffset = SYSTEM_PAGE_SIZE - size;
         pool->freeblock = bp + size;
         *(block **)(pool->freeblock) = NULL;
+        pool->nextpage = SYSTEM_PAGE_SIZE;
         goto success;
     }
 
@@ -1626,6 +1686,7 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
 
 success:
     assert(bp != NULL);
+    assert(_Py_IS_ALIGNED(bp, ALIGNMENT));
     *ptr_p = (void *)bp;
     return 1;
 
@@ -1683,6 +1744,7 @@ pymalloc_free(void *ctx, void *p)
     uint size;
 
     assert(p != NULL);
+    assert(_Py_IS_ALIGNED(p, ALIGNMENT));
 
 #ifdef WITH_VALGRIND
     if (UNLIKELY(running_on_valgrind > 0)) {
@@ -1690,19 +1752,19 @@ pymalloc_free(void *ctx, void *p)
     }
 #endif
 
-    pool = POOL_ADDR(p);
-    if (!address_in_range(p, pool)) {
+    if (!address_in_range(p)) {
         return 0;
     }
     /* We allocated this address. */
 
+    pool = POOL_ADDR(p);
     /* Link p to the start of the pool's freeblock list.  Since
      * the pool had at least the p block outstanding, the pool
      * wasn't empty (so it's already in a usedpools[] list, or
      * was full and is in no list -- it's not in the freeblocks
      * list in any case).
      */
-    assert(pool->ref.count > 0);            /* else it was empty */
+    assert(pool->nalloc > 0);            /* else it was empty */
     *(block **)p = lastfree = pool->freeblock;
     pool->freeblock = (block *)p;
     if (!lastfree) {
@@ -1712,8 +1774,8 @@ pymalloc_free(void *ctx, void *p)
          * targets optimal filling when several pools contain
          * blocks of the same size class.
          */
-        --pool->ref.count;
-        assert(pool->ref.count > 0);            /* else the pool is empty */
+        --pool->nalloc;
+        assert(pool->nalloc > 0);            /* else the pool is empty */
         size = pool->szidx;
         next = usedpools[size + size];
         prev = next->prevpool;
@@ -1732,7 +1794,7 @@ pymalloc_free(void *ctx, void *p)
     /* freeblock wasn't NULL, so the pool wasn't full,
      * and the pool is in a usedpools[] list.
      */
-    if (--pool->ref.count != 0) {
+    if (--pool->nalloc != 0) {
         /* pool isn't empty:  leave it in usedpools */
         goto success;
     }
@@ -1939,6 +2001,7 @@ pymalloc_realloc(void *ctx, void **newptr_p, void *p, size_t nbytes)
     size_t size;
 
     assert(p != NULL);
+    assert(_Py_IS_ALIGNED(p, ALIGNMENT));
 
 #ifdef WITH_VALGRIND
     /* Treat running_on_valgrind == -1 the same as 0 */
@@ -1947,8 +2010,7 @@ pymalloc_realloc(void *ctx, void **newptr_p, void *p, size_t nbytes)
     }
 #endif
 
-    pool = POOL_ADDR(p);
-    if (!address_in_range(p, pool)) {
+    if (!address_in_range(p)) {
         /* pymalloc is not managing this block.
 
            If nbytes <= SMALL_REQUEST_THRESHOLD, it's tempting to try to take
@@ -1965,6 +2027,7 @@ pymalloc_realloc(void *ctx, void **newptr_p, void *p, size_t nbytes)
     }
 
     /* pymalloc is in charge of this block */
+    pool = POOL_ADDR(p);
     size = INDEX2SIZE(pool->szidx);
     if (nbytes <= size) {
         /* The block is staying the same or shrinking.
@@ -1987,6 +2050,7 @@ pymalloc_realloc(void *ctx, void **newptr_p, void *p, size_t nbytes)
         memcpy(bp, p, size);
         _PyObject_Free(ctx, p);
     }
+    assert(_Py_IS_ALIGNED(bp, ALIGNMENT));
     *newptr_p = bp;
     return 1;
 }
@@ -2630,10 +2694,12 @@ _PyObject_DebugMallocStats(FILE *out)
     uint numfreepools = 0;
     /* # of bytes for arena alignment padding */
     size_t arena_alignment = 0;
-    /* # of bytes in used and full pools used for pool_headers */
+    /* # of bytes in used and full pools used for pool_headers, and for
+     * storing the arena index at the start of all interior pages
+     */
     size_t pool_header_bytes = 0;
     /* # of bytes in used and full pools wasted due to quantization,
-     * i.e. the necessarily leftover space at the ends of used and
+     * i.e. the necessarily leftover space at the ends of pages in used and
      * full pools.
      */
     size_t quantization = 0;
@@ -2679,7 +2745,7 @@ _PyObject_DebugMallocStats(FILE *out)
             const uint sz = p->szidx;
             uint freeblocks;
 
-            if (p->ref.count == 0) {
+            if (p->nalloc == 0) {
                 /* currently unused */
 #ifdef Py_DEBUG
                 assert(pool_is_in_list(p, arenas[i].freepools));
@@ -2687,8 +2753,8 @@ _PyObject_DebugMallocStats(FILE *out)
                 continue;
             }
             ++numpools[sz];
-            numblocks[sz] += p->ref.count;
-            freeblocks = NUMBLOCKS(sz) - p->ref.count;
+            numblocks[sz] += p->nalloc;
+            freeblocks = NUMBLOCKS(sz) - p->nalloc;
             numfreeblocks[sz] += freeblocks;
 #ifdef Py_DEBUG
             if (freeblocks > 0)
@@ -2719,8 +2785,10 @@ _PyObject_DebugMallocStats(FILE *out)
                 i, size, p, b, f);
         allocated_bytes += b * size;
         available_bytes += f * size;
-        pool_header_bytes += p * POOL_OVERHEAD;
-        quantization += p * ((POOL_SIZE - POOL_OVERHEAD) % size);
+        pool_header_bytes += p * (POOL_OVERHEAD +
+                                  ALIGNMENT * (PAGES_PER_POOL - 1));
+        quantization += p * ((SYSTEM_PAGE_SIZE - POOL_OVERHEAD) % size +
+            (SYSTEM_PAGE_SIZE - ALIGNMENT) % size * (PAGES_PER_POOL - 1));
     }
     fputc('\n', out);
 #ifdef PYMEM_DEBUG_SERIALNO