/*-------------------------------------------------------------------------
 *
 * pg_popcount_avx512.c
 *      Holds the AVX-512 pg_popcount() implementation.
 *
 * Copyright (c) 2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *      src/port/pg_popcount_avx512.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"
#include <immintrin.h>
#include "port/pg_bitutils.h"
/*
* It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
* the function pointers that are only used when TRY_POPCNT_FAST is set.
*/
#ifdef TRY_POPCNT_FAST
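
/*
 * Note (added commentary): these functions are not meant to be called
 * directly.  Callers are expected to go through the pg_popcount function
 * pointers in pg_bitutils, which a separate runtime CPU-capability check
 * points at these implementations when AVX-512 popcount is usable.
 */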

/*
 * pg_popcount_avx512
 *      Returns the number of 1-bits in buf
 */
uint64
pg_popcount_avx512(const char *buf, int bytes)
{
    __m512i     val,
                cnt;
    __m512i     accum = _mm512_setzero_si512();
    const char *final;
    int         tail_idx;
    __mmask64   mask = ~UINT64CONST(0);

    /*
     * Align buffer down to avoid double load overhead from unaligned access.
     * Calculate a mask to ignore preceding bytes.  Find start offset of
     * final iteration and ensure it is not empty.
     */
    mask <<= ((uintptr_t) buf) % sizeof(__m512i);
    tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
    final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
    buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
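
    /*
     * Worked example (added for illustration, with hypothetical addresses):
     * with 64-byte registers, a buffer starting 19 bytes past an alignment
     * boundary with bytes = 70 gets mask <<= 19, so the first aligned load
     * ignores the 19 bytes preceding buf; tail_idx = ((19 + 69) % 64) + 1 =
     * 25, so the final load keeps only the low 25 byte lanes.
     */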

    /*
     * Iterate through all but the final iteration.  Starting from the second
     * iteration, the mask is ignored.
     */
    if (buf < final)
    {
        val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
        cnt = _mm512_popcnt_epi64(val);
        accum = _mm512_add_epi64(accum, cnt);

        buf += sizeof(__m512i);
        mask = ~UINT64CONST(0);

        for (; buf < final; buf += sizeof(__m512i))
        {
            val = _mm512_load_si512((const __m512i *) buf);
            cnt = _mm512_popcnt_epi64(val);
            accum = _mm512_add_epi64(accum, cnt);
        }
    }
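
    /*
     * Note (added commentary): if the whole buffer fits within a single
     * 64-byte register, the block above is skipped and "mask" still carries
     * the head shift, so ANDing in the tail mask below trims both ends of
     * the load at once.
     */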

    /* Final iteration needs to ignore bytes that are not within the length */
    mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
    val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
    cnt = _mm512_popcnt_epi64(val);
    accum = _mm512_add_epi64(accum, cnt);
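
    /*
     * Note (added commentary): accum holds eight 64-bit partial counts, one
     * per lane; deferring the horizontal sum to a single
     * _mm512_reduce_add_epi64() keeps cross-lane work out of the loop.
     */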
    return _mm512_reduce_add_epi64(accum);
}

/*
 * pg_popcount_masked_avx512
 *      Returns the number of 1-bits in buf after applying the mask to each
 *      byte
 */
uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
    __m512i     val,
                vmasked,
                cnt;
    __m512i     accum = _mm512_setzero_si512();
    const char *final;
    int         tail_idx;
    __mmask64   bmask = ~UINT64CONST(0);
    const __m512i maskv = _mm512_set1_epi8(mask);
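
    /*
     * Note (added commentary): maskv broadcasts the one-byte mask to all 64
     * byte lanes, so each loaded register is masked with a single AND before
     * the per-lane popcount.
     */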

    /*
     * Align buffer down to avoid double load overhead from unaligned access.
     * Calculate a mask to ignore preceding bytes.  Find start offset of
     * final iteration and ensure it is not empty.
     */
    bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
    tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
    final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
    buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

    /*
     * Iterate through all but the final iteration.  Starting from the second
     * iteration, the mask is ignored.
     */
    if (buf < final)
    {
        val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
        vmasked = _mm512_and_si512(val, maskv);
        cnt = _mm512_popcnt_epi64(vmasked);
        accum = _mm512_add_epi64(accum, cnt);

        buf += sizeof(__m512i);
        bmask = ~UINT64CONST(0);

        for (; buf < final; buf += sizeof(__m512i))
        {
            val = _mm512_load_si512((const __m512i *) buf);
            vmasked = _mm512_and_si512(val, maskv);
            cnt = _mm512_popcnt_epi64(vmasked);
            accum = _mm512_add_epi64(accum, cnt);
        }
    }

    /* Final iteration needs to ignore bytes that are not within the length */
    bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
    val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
    vmasked = _mm512_and_si512(val, maskv);
    cnt = _mm512_popcnt_epi64(vmasked);
    accum = _mm512_add_epi64(accum, cnt);
    return _mm512_reduce_add_epi64(accum);
}
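
/*
 * Illustrative usage (added, with hypothetical buffer names; real callers
 * should go through the pg_popcount() wrappers rather than calling these
 * functions directly):
 *
 *      uint64  ones = pg_popcount_avx512(page, len);
 *      uint64  low_bits = pg_popcount_masked_avx512(page, len, 0x0F);
 */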

#endif                          /* TRY_POPCNT_FAST */