@ -12,26 +12,74 @@
*/
# include "c.h"
# ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
# include "port/pg_bitutils.h"
# ifdef TRY_POPCNT_X86_64
# if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
# include <cpuid.h>
# endif
# ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
# include <immintrin.h>
# endif
# if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
# include <intrin.h>
# endif
# include "port/pg_bitutils.h"
/*
 * The SSE4.2 versions are built regardless of whether we are building the
 * AVX-512 versions.
 */
static inline int pg_popcount32_fast(uint32 word);
static inline int pg_popcount64_fast(uint64 word);
static uint64 pg_popcount_fast(const char *buf, int bytes);
static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
/*
 * It's probably unlikely that TRY_POPCNT_X86_64 won't be set if we are able
 * to use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back
 * on the function pointers that are only used when TRY_POPCNT_X86_64 is set.
 * These are the AVX-512 implementations of the popcount functions.
 */
#ifdef TRY_POPCNT_X86_64
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
static uint64 pg_popcount_avx512(const char *buf, int bytes);
static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
#endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
/*
 * The function pointers are initially set to "choose" functions.  These
 * functions will first set the pointers to the right implementations (based
 * on what the current CPU supports) and then will call the pointer to
 * fulfill the caller's request.
 */
static int	pg_popcount32_choose(uint32 word);
static int	pg_popcount64_choose(uint64 word);
static uint64 pg_popcount_choose(const char *buf, int bytes);
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);

int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
/*
* Return true if CPUID indicates that the POPCNT instruction is available .
*/
static bool
pg_popcount_available ( void )
{
unsigned int exx [ 4 ] = { 0 , 0 , 0 , 0 } ;
# if defined(HAVE__GET_CPUID)
__get_cpuid ( 1 , & exx [ 0 ] , & exx [ 1 ] , & exx [ 2 ] , & exx [ 3 ] ) ;
# elif defined(HAVE__CPUID)
__cpuid ( exx , 1 ) ;
# else
# error cpuid instruction not available
# endif
return ( exx [ 2 ] & ( 1 < < 23 ) ) ! = 0 ; /* POPCNT */
}
# ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
/*
* Does CPUID say there ' s support for XSAVE instructions ?
@ -94,7 +142,7 @@ avx512_popcnt_available(void)
* Returns true if the CPU supports the instructions required for the AVX - 512
* pg_popcount ( ) implementation .
*/
bool
static bool
pg_popcount_avx512_available ( void )
{
return xsave_available ( ) & &
@ -102,12 +150,77 @@ pg_popcount_avx512_available(void)
avx512_popcnt_available ( ) ;
}
# endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
/*
 * These functions get called on the first call to pg_popcount32 etc.
 * They detect whether we can use the asm implementations, and replace
 * the function pointers so that subsequent calls are routed directly to
 * the chosen implementation.
 */
static inline void
choose_popcount_functions(void)
{
	if (pg_popcount_available())
	{
		/* CPU advertises POPCNT: install the asm-based versions. */
		pg_popcount32 = pg_popcount32_fast;
		pg_popcount64 = pg_popcount64_fast;
		pg_popcount_optimized = pg_popcount_fast;
		pg_popcount_masked_optimized = pg_popcount_masked_fast;
	}
	else
	{
		/* Otherwise, install the portable fallback versions. */
		pg_popcount32 = pg_popcount32_slow;
		pg_popcount64 = pg_popcount64_slow;
		pg_popcount_optimized = pg_popcount_slow;
		pg_popcount_masked_optimized = pg_popcount_masked_slow;
	}

#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
	/* The buffer variants can be upgraded further if AVX-512 is usable. */
	if (pg_popcount_avx512_available())
	{
		pg_popcount_optimized = pg_popcount_avx512;
		pg_popcount_masked_optimized = pg_popcount_masked_avx512;
	}
#endif
}
/* First-call dispatcher: pick implementations, then satisfy the request. */
static int
pg_popcount32_choose(uint32 word)
{
	choose_popcount_functions();
	return pg_popcount32(word);
}
/* First-call dispatcher: pick implementations, then satisfy the request. */
static int
pg_popcount64_choose(uint64 word)
{
	choose_popcount_functions();
	return pg_popcount64(word);
}
/* First-call dispatcher: pick implementations, then satisfy the request. */
static uint64
pg_popcount_choose(const char *buf, int bytes)
{
	choose_popcount_functions();
	return pg_popcount_optimized(buf, bytes);
}
/*
 * First-call dispatcher for pg_popcount_masked_optimized.  Installs the
 * implementations appropriate for this CPU, then invokes the freshly
 * chosen function pointer directly, mirroring pg_popcount_choose().
 * (The original called pg_popcount_masked, which is not declared in this
 * file; calling the pointer keeps the choose functions consistent.)
 */
static uint64
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
{
	choose_popcount_functions();
	return pg_popcount_masked_optimized(buf, bytes, mask);
}
# ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
/*
* pg_popcount_avx512
* Returns the number of 1 - bits in buf
*/
pg_attribute_target ( " avx512vpopcntdq,avx512bw " )
uint64
static uint64
pg_popcount_avx512 ( const char * buf , int bytes )
{
__m512i val ,
@ -163,7 +276,7 @@ pg_popcount_avx512(const char *buf, int bytes)
* Returns the number of 1 - bits in buf after applying the mask to each byte
*/
pg_attribute_target ( " avx512vpopcntdq,avx512bw " )
uint64
static uint64
pg_popcount_masked_avx512 ( const char * buf , int bytes , bits8 mask )
{
__m512i val ,
@ -219,5 +332,136 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
return _mm512_reduce_add_epi64 ( accum ) ;
}
# endif /* TRY_POPCNT_X86_64 */
# endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
/*
 * pg_popcount32_fast
 *		Return the number of 1 bits set in word
 */
static inline int
pg_popcount32_fast(uint32 word)
{
#ifdef _MSC_VER
	return __popcnt(word);
#else
	uint32		count;

__asm__ __volatile__(" popcntl %1,%0\n":"=q"(count):"rm"(word):"cc");
	return (int) count;
#endif
}
/*
 * pg_popcount64_fast
 *		Return the number of 1 bits set in word
 */
static inline int
pg_popcount64_fast(uint64 word)
{
#ifdef _MSC_VER
	return __popcnt64(word);
#else
	uint64		count;

__asm__ __volatile__(" popcntq %1,%0\n":"=q"(count):"rm"(word):"cc");
	return (int) count;
#endif
}
/*
* pg_popcount_fast
* Returns the number of 1 - bits in buf
*/
static uint64
pg_popcount_fast ( const char * buf , int bytes )
{
uint64 popcnt = 0 ;
# if SIZEOF_VOID_P >= 8
/* Process in 64-bit chunks if the buffer is aligned. */
if ( buf = = ( const char * ) TYPEALIGN ( 8 , buf ) )
{
const uint64 * words = ( const uint64 * ) buf ;
while ( bytes > = 8 )
{
popcnt + = pg_popcount64_fast ( * words + + ) ;
bytes - = 8 ;
}
buf = ( const char * ) words ;
}
# else
/* Process in 32-bit chunks if the buffer is aligned. */
if ( buf = = ( const char * ) TYPEALIGN ( 4 , buf ) )
{
const uint32 * words = ( const uint32 * ) buf ;
while ( bytes > = 4 )
{
popcnt + = pg_popcount32_fast ( * words + + ) ;
bytes - = 4 ;
}
buf = ( const char * ) words ;
}
# endif
/* Process any remaining bytes */
while ( bytes - - )
popcnt + = pg_number_of_ones [ ( unsigned char ) * buf + + ] ;
return popcnt ;
}
/*
 * pg_popcount_masked_fast
 *		Returns the number of 1-bits in buf after applying the mask to each byte
 */
static uint64
pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
{
	uint64		total = 0;

#if SIZEOF_VOID_P >= 8
	/* Replicate the byte mask across a 64-bit word for the chunked loop. */
	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;

	/* Consume 64-bit chunks, but only if the buffer is suitably aligned. */
	if (buf == (const char *) TYPEALIGN(8, buf))
	{
		const uint64 *chunk = (const uint64 *) buf;

		while (bytes >= 8)
		{
			total += pg_popcount64_fast(*chunk++ & maskv);
			bytes -= 8;
		}

		buf = (const char *) chunk;
	}
#else
	/* Replicate the byte mask across a 32-bit word for the chunked loop. */
	uint32		maskv = ~((uint32) 0) / 0xFF * mask;

	/* Consume 32-bit chunks, but only if the buffer is suitably aligned. */
	if (buf == (const char *) TYPEALIGN(4, buf))
	{
		const uint32 *chunk = (const uint32 *) buf;

		while (bytes >= 4)
		{
			total += pg_popcount32_fast(*chunk++ & maskv);
			bytes -= 4;
		}

		buf = (const char *) chunk;
	}
#endif

	/* Finish whatever is left over one byte at a time. */
	while (bytes--)
		total += pg_number_of_ones[(unsigned char) *buf++ & mask];

	return total;
}
# endif /* TRY_POPCNT_X86_64 */