Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 41c51f0

Browse filesBrowse files
Optimize visibilitymap_count() with AVX-512 instructions.
Commit 792752a added infrastructure for using AVX-512 intrinsic functions, and this commit uses that infrastructure to optimize visibilitymap_count(). Specifically, a new pg_popcount_masked() function is introduced that applies a bitmask to every byte in the buffer prior to calculating the population count, which is used to filter out the all-visible or all-frozen bits as needed. Platforms without AVX-512 support should also see a nice speedup due to the reduced number of calls to a function pointer.

Co-authored-by: Ants Aasma
Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
1 parent 792752a commit 41c51f0
Copy full SHA for 41c51f0

File tree

Expand file treeCollapse file tree

4 files changed

+225
-20
lines changed
Filter options
Expand file treeCollapse file tree

4 files changed

+225
-20
lines changed

‎src/backend/access/heap/visibilitymap.c

Copy file name to clipboardExpand all lines: src/backend/access/heap/visibilitymap.c
+5-20Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,8 @@
119119
#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
120120

121121
/* Masks for counting subsets of bits in the visibility map. */
122-
#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each
123-
* bit pair */
124-
#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
125-
* bit pair */
122+
#define VISIBLE_MASK8 (0x55) /* The lower bit of each bit pair */
123+
#define FROZEN_MASK8 (0xaa) /* The upper bit of each bit pair */
126124

127125
/* prototypes for internal routines */
128126
static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
396394
{
397395
Buffer mapBuffer;
398396
uint64 *map;
399-
int i;
400397

401398
/*
402399
* Read till we fall off the end of the map. We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
414411
*/
415412
map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
416413

417-
StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
418-
"unsupported MAPSIZE");
419-
if (all_frozen == NULL)
420-
{
421-
for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
422-
nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
423-
}
424-
else
425-
{
426-
for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
427-
{
428-
nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
429-
nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
430-
}
431-
}
414+
nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
415+
if (all_frozen)
416+
nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
432417

433418
ReleaseBuffer(mapBuffer);
434419
}

‎src/include/port/pg_bitutils.h

Copy file name to clipboardExpand all lines: src/include/port/pg_bitutils.h
+34Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ pg_ceil_log2_64(uint64 num)
303303
extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
304304
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
305305
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
306+
extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
306307

307308
/*
308309
* We can also try to use the AVX-512 popcount instruction on some systems.
@@ -313,13 +314,15 @@ extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
313314
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
314315
extern bool pg_popcount_avx512_available(void);
315316
extern uint64 pg_popcount_avx512(const char *buf, int bytes);
317+
extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
316318
#endif
317319

318320
#else
319321
/* Use a portable implementation -- no need for a function pointer. */
320322
extern int pg_popcount32(uint32 word);
321323
extern int pg_popcount64(uint64 word);
322324
extern uint64 pg_popcount_optimized(const char *buf, int bytes);
325+
extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
323326

324327
#endif /* TRY_POPCNT_FAST */
325328

@@ -357,6 +360,37 @@ pg_popcount(const char *buf, int bytes)
357360
return pg_popcount_optimized(buf, bytes);
358361
}
359362

363+
/*
364+
* Returns the number of 1-bits in buf after applying the mask to each byte.
365+
*
366+
* Similar to pg_popcount(), we only take on the function pointer overhead when
367+
* it's likely to be faster.
368+
*/
369+
static inline uint64
370+
pg_popcount_masked(const char *buf, int bytes, bits8 mask)
371+
{
372+
/*
373+
* We set the threshold to the point at which we'll first use special
374+
* instructions in the optimized version.
375+
*/
376+
#if SIZEOF_VOID_P >= 8
377+
int threshold = 8;
378+
#else
379+
int threshold = 4;
380+
#endif
381+
382+
if (bytes < threshold)
383+
{
384+
uint64 popcnt = 0;
385+
386+
while (bytes--)
387+
popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
388+
return popcnt;
389+
}
390+
391+
return pg_popcount_masked_optimized(buf, bytes, mask);
392+
}
393+
360394
/*
361395
* Rotate the bits of "word" to the right/left by n bits.
362396
*/

‎src/port/pg_bitutils.c

Copy file name to clipboardExpand all lines: src/port/pg_bitutils.c
+126Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,19 +106,23 @@ const uint8 pg_number_of_ones[256] = {
106106
static inline int pg_popcount32_slow(uint32 word);
107107
static inline int pg_popcount64_slow(uint64 word);
108108
static uint64 pg_popcount_slow(const char *buf, int bytes);
109+
static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
109110

110111
#ifdef TRY_POPCNT_FAST
111112
static bool pg_popcount_available(void);
112113
static int pg_popcount32_choose(uint32 word);
113114
static int pg_popcount64_choose(uint64 word);
114115
static uint64 pg_popcount_choose(const char *buf, int bytes);
116+
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
115117
static inline int pg_popcount32_fast(uint32 word);
116118
static inline int pg_popcount64_fast(uint64 word);
117119
static uint64 pg_popcount_fast(const char *buf, int bytes);
120+
static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
118121

119122
int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
120123
int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
121124
uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
125+
uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
122126
#endif /* TRY_POPCNT_FAST */
123127

124128
#ifdef TRY_POPCNT_FAST
@@ -156,17 +160,22 @@ choose_popcount_functions(void)
156160
pg_popcount32 = pg_popcount32_fast;
157161
pg_popcount64 = pg_popcount64_fast;
158162
pg_popcount_optimized = pg_popcount_fast;
163+
pg_popcount_masked_optimized = pg_popcount_masked_fast;
159164
}
160165
else
161166
{
162167
pg_popcount32 = pg_popcount32_slow;
163168
pg_popcount64 = pg_popcount64_slow;
164169
pg_popcount_optimized = pg_popcount_slow;
170+
pg_popcount_masked_optimized = pg_popcount_masked_slow;
165171
}
166172

167173
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
168174
if (pg_popcount_avx512_available())
175+
{
169176
pg_popcount_optimized = pg_popcount_avx512;
177+
pg_popcount_masked_optimized = pg_popcount_masked_avx512;
178+
}
170179
#endif
171180
}
172181

@@ -191,6 +200,13 @@ pg_popcount_choose(const char *buf, int bytes)
191200
return pg_popcount_optimized(buf, bytes);
192201
}
193202

203+
static uint64
204+
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
205+
{
206+
choose_popcount_functions();
207+
return pg_popcount_masked(buf, bytes, mask);
208+
}
209+
194210
/*
195211
* pg_popcount32_fast
196212
* Return the number of 1 bits set in word
@@ -271,6 +287,56 @@ pg_popcount_fast(const char *buf, int bytes)
271287
return popcnt;
272288
}
273289

290+
/*
291+
* pg_popcount_masked_fast
292+
* Returns the number of 1-bits in buf after applying the mask to each byte
293+
*/
294+
static uint64
295+
pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
296+
{
297+
uint64 popcnt = 0;
298+
299+
#if SIZEOF_VOID_P >= 8
300+
/* Process in 64-bit chunks if the buffer is aligned */
301+
uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
302+
303+
if (buf == (const char *) TYPEALIGN(8, buf))
304+
{
305+
const uint64 *words = (const uint64 *) buf;
306+
307+
while (bytes >= 8)
308+
{
309+
popcnt += pg_popcount64_fast(*words++ & maskv);
310+
bytes -= 8;
311+
}
312+
313+
buf = (const char *) words;
314+
}
315+
#else
316+
/* Process in 32-bit chunks if the buffer is aligned. */
317+
uint32 maskv = ~((uint32) 0) / 0xFF * mask;
318+
319+
if (buf == (const char *) TYPEALIGN(4, buf))
320+
{
321+
const uint32 *words = (const uint32 *) buf;
322+
323+
while (bytes >= 4)
324+
{
325+
popcnt += pg_popcount32_fast(*words++ & maskv);
326+
bytes -= 4;
327+
}
328+
329+
buf = (const char *) words;
330+
}
331+
#endif
332+
333+
/* Process any remaining bytes */
334+
while (bytes--)
335+
popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
336+
337+
return popcnt;
338+
}
339+
274340
#endif /* TRY_POPCNT_FAST */
275341

276342

@@ -370,6 +436,56 @@ pg_popcount_slow(const char *buf, int bytes)
370436
return popcnt;
371437
}
372438

439+
/*
440+
* pg_popcount_masked_slow
441+
* Returns the number of 1-bits in buf after applying the mask to each byte
442+
*/
443+
static uint64
444+
pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
445+
{
446+
uint64 popcnt = 0;
447+
448+
#if SIZEOF_VOID_P >= 8
449+
/* Process in 64-bit chunks if the buffer is aligned */
450+
uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
451+
452+
if (buf == (const char *) TYPEALIGN(8, buf))
453+
{
454+
const uint64 *words = (const uint64 *) buf;
455+
456+
while (bytes >= 8)
457+
{
458+
popcnt += pg_popcount64_slow(*words++ & maskv);
459+
bytes -= 8;
460+
}
461+
462+
buf = (const char *) words;
463+
}
464+
#else
465+
/* Process in 32-bit chunks if the buffer is aligned. */
466+
uint32 maskv = ~((uint32) 0) / 0xFF * mask;
467+
468+
if (buf == (const char *) TYPEALIGN(4, buf))
469+
{
470+
const uint32 *words = (const uint32 *) buf;
471+
472+
while (bytes >= 4)
473+
{
474+
popcnt += pg_popcount32_slow(*words++ & maskv);
475+
bytes -= 4;
476+
}
477+
478+
buf = (const char *) words;
479+
}
480+
#endif
481+
482+
/* Process any remaining bytes */
483+
while (bytes--)
484+
popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
485+
486+
return popcnt;
487+
}
488+
373489
#ifndef TRY_POPCNT_FAST
374490

375491
/*
@@ -401,4 +517,14 @@ pg_popcount_optimized(const char *buf, int bytes)
401517
return pg_popcount_slow(buf, bytes);
402518
}
403519

520+
/*
521+
* pg_popcount_masked_optimized
522+
* Returns the number of 1-bits in buf after applying the mask to each byte
523+
*/
524+
uint64
525+
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
526+
{
527+
return pg_popcount_masked_slow(buf, bytes, mask);
528+
}
529+
404530
#endif /* !TRY_POPCNT_FAST */

‎src/port/pg_popcount_avx512.c

Copy file name to clipboardExpand all lines: src/port/pg_popcount_avx512.c
+60Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,64 @@ pg_popcount_avx512(const char *buf, int bytes)
7878
return _mm512_reduce_add_epi64(accum);
7979
}
8080

81+
/*
82+
* pg_popcount_masked_avx512
83+
* Returns the number of 1-bits in buf after applying the mask to each byte
84+
*/
85+
uint64
86+
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
87+
{
88+
__m512i val,
89+
vmasked,
90+
cnt;
91+
__m512i accum = _mm512_setzero_si512();
92+
const char *final;
93+
int tail_idx;
94+
__mmask64 bmask = ~UINT64CONST(0);
95+
const __m512i maskv = _mm512_set1_epi8(mask);
96+
97+
/*
98+
* Align buffer down to avoid double load overhead from unaligned access.
99+
* Calculate a mask to ignore preceding bytes. Find start offset of final
100+
* iteration and ensure it is not empty.
101+
*/
102+
bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
103+
tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
104+
final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
105+
buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
106+
107+
/*
108+
* Iterate through all but the final iteration. Starting from the second
109+
* iteration, the mask is ignored.
110+
*/
111+
if (buf < final)
112+
{
113+
val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
114+
vmasked = _mm512_and_si512(val, maskv);
115+
cnt = _mm512_popcnt_epi64(vmasked);
116+
accum = _mm512_add_epi64(accum, cnt);
117+
118+
buf += sizeof(__m512i);
119+
bmask = ~UINT64CONST(0);
120+
121+
for (; buf < final; buf += sizeof(__m512i))
122+
{
123+
val = _mm512_load_si512((const __m512i *) buf);
124+
vmasked = _mm512_and_si512(val, maskv);
125+
cnt = _mm512_popcnt_epi64(vmasked);
126+
accum = _mm512_add_epi64(accum, cnt);
127+
}
128+
}
129+
130+
/* Final iteration needs to ignore bytes that are not within the length */
131+
bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
132+
133+
val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
134+
vmasked = _mm512_and_si512(val, maskv);
135+
cnt = _mm512_popcnt_epi64(vmasked);
136+
accum = _mm512_add_epi64(accum, cnt);
137+
138+
return _mm512_reduce_add_epi64(accum);
139+
}
140+
81141
#endif /* TRY_POPCNT_FAST */

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.