@ -18,6 +18,275 @@
# include <arm_neon.h>
# ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
# include <arm_sve.h>
# if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
# include <sys/auxv.h>
# endif
# endif
/*
* The Neon versions are built regardless of whether we are building the SVE
* versions .
*/
static uint64 pg_popcount_neon ( const char * buf , int bytes ) ;
static uint64 pg_popcount_masked_neon ( const char * buf , int bytes , bits8 mask ) ;
# ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
/*
* These are the SVE implementations of the popcount functions .
*/
static uint64 pg_popcount_sve ( const char * buf , int bytes ) ;
static uint64 pg_popcount_masked_sve ( const char * buf , int bytes , bits8 mask ) ;
/*
* The function pointers are initially set to " choose " functions . These
* functions will first set the pointers to the right implementations ( based on
* what the current CPU supports ) and then will call the pointer to fulfill the
* caller ' s request .
*/
static uint64 pg_popcount_choose ( const char * buf , int bytes ) ;
static uint64 pg_popcount_masked_choose ( const char * buf , int bytes , bits8 mask ) ;
uint64 ( * pg_popcount_optimized ) ( const char * buf , int bytes ) = pg_popcount_choose ;
uint64 ( * pg_popcount_masked_optimized ) ( const char * buf , int bytes , bits8 mask ) = pg_popcount_masked_choose ;
static inline bool
pg_popcount_sve_available ( void )
{
# ifdef HAVE_ELF_AUX_INFO
unsigned long value ;
return elf_aux_info ( AT_HWCAP , & value , sizeof ( value ) ) = = 0 & &
( value & HWCAP_SVE ) ! = 0 ;
# elif defined(HAVE_GETAUXVAL)
return ( getauxval ( AT_HWCAP ) & HWCAP_SVE ) ! = 0 ;
# else
return false ;
# endif
}
static inline void
choose_popcount_functions ( void )
{
if ( pg_popcount_sve_available ( ) )
{
pg_popcount_optimized = pg_popcount_sve ;
pg_popcount_masked_optimized = pg_popcount_masked_sve ;
}
else
{
pg_popcount_optimized = pg_popcount_neon ;
pg_popcount_masked_optimized = pg_popcount_masked_neon ;
}
}
static uint64
pg_popcount_choose ( const char * buf , int bytes )
{
choose_popcount_functions ( ) ;
return pg_popcount_optimized ( buf , bytes ) ;
}
static uint64
pg_popcount_masked_choose ( const char * buf , int bytes , bits8 mask )
{
choose_popcount_functions ( ) ;
return pg_popcount_masked_optimized ( buf , bytes , mask ) ;
}
/*
* pg_popcount_sve
* Returns number of 1 bits in buf
*/
pg_attribute_target ( " arch=armv8-a+sve " )
static uint64
pg_popcount_sve ( const char * buf , int bytes )
{
svbool_t pred = svptrue_b64 ( ) ;
svuint64_t accum1 = svdup_u64 ( 0 ) ,
accum2 = svdup_u64 ( 0 ) ,
accum3 = svdup_u64 ( 0 ) ,
accum4 = svdup_u64 ( 0 ) ;
uint32 vec_len = svcntb ( ) ,
bytes_per_iteration = 4 * vec_len ;
uint64 popcnt = 0 ;
/*
* For better instruction - level parallelism , each loop iteration operates
* on a block of four registers .
*/
for ( ; bytes > = bytes_per_iteration ; bytes - = bytes_per_iteration )
{
svuint64_t vec ;
vec = svld1_u64 ( pred , ( const uint64 * ) buf ) ;
accum1 = svadd_u64_x ( pred , accum1 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svld1_u64 ( pred , ( const uint64 * ) buf ) ;
accum2 = svadd_u64_x ( pred , accum2 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svld1_u64 ( pred , ( const uint64 * ) buf ) ;
accum3 = svadd_u64_x ( pred , accum3 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svld1_u64 ( pred , ( const uint64 * ) buf ) ;
accum4 = svadd_u64_x ( pred , accum4 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
}
/*
* If enough data remains , do another iteration on a block of two
* registers .
*/
bytes_per_iteration = 2 * vec_len ;
if ( bytes > = bytes_per_iteration )
{
svuint64_t vec ;
vec = svld1_u64 ( pred , ( const uint64 * ) buf ) ;
accum1 = svadd_u64_x ( pred , accum1 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svld1_u64 ( pred , ( const uint64 * ) buf ) ;
accum2 = svadd_u64_x ( pred , accum2 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
bytes - = bytes_per_iteration ;
}
/*
* Add the accumulators .
*/
popcnt + = svaddv_u64 ( pred , svadd_u64_x ( pred , accum1 , accum2 ) ) ;
popcnt + = svaddv_u64 ( pred , svadd_u64_x ( pred , accum3 , accum4 ) ) ;
/*
* Process any remaining data .
*/
for ( ; bytes > 0 ; bytes - = vec_len )
{
svuint8_t vec ;
pred = svwhilelt_b8_s32 ( 0 , bytes ) ;
vec = svld1_u8 ( pred , ( const uint8 * ) buf ) ;
popcnt + = svaddv_u8 ( pred , svcnt_u8_x ( pred , vec ) ) ;
buf + = vec_len ;
}
return popcnt ;
}
/*
* pg_popcount_masked_sve
* Returns number of 1 bits in buf after applying the mask to each byte
*/
pg_attribute_target ( " arch=armv8-a+sve " )
static uint64
pg_popcount_masked_sve ( const char * buf , int bytes , bits8 mask )
{
svbool_t pred = svptrue_b64 ( ) ;
svuint64_t accum1 = svdup_u64 ( 0 ) ,
accum2 = svdup_u64 ( 0 ) ,
accum3 = svdup_u64 ( 0 ) ,
accum4 = svdup_u64 ( 0 ) ;
uint32 vec_len = svcntb ( ) ,
bytes_per_iteration = 4 * vec_len ;
uint64 popcnt = 0 ,
mask64 = ~ UINT64CONST ( 0 ) / 0xFF * mask ;
/*
* For better instruction - level parallelism , each loop iteration operates
* on a block of four registers .
*/
for ( ; bytes > = bytes_per_iteration ; bytes - = bytes_per_iteration )
{
svuint64_t vec ;
vec = svand_n_u64_x ( pred , svld1_u64 ( pred , ( const uint64 * ) buf ) , mask64 ) ;
accum1 = svadd_u64_x ( pred , accum1 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svand_n_u64_x ( pred , svld1_u64 ( pred , ( const uint64 * ) buf ) , mask64 ) ;
accum2 = svadd_u64_x ( pred , accum2 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svand_n_u64_x ( pred , svld1_u64 ( pred , ( const uint64 * ) buf ) , mask64 ) ;
accum3 = svadd_u64_x ( pred , accum3 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svand_n_u64_x ( pred , svld1_u64 ( pred , ( const uint64 * ) buf ) , mask64 ) ;
accum4 = svadd_u64_x ( pred , accum4 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
}
/*
* If enough data remains , do another iteration on a block of two
* registers .
*/
bytes_per_iteration = 2 * vec_len ;
if ( bytes > = bytes_per_iteration )
{
svuint64_t vec ;
vec = svand_n_u64_x ( pred , svld1_u64 ( pred , ( const uint64 * ) buf ) , mask64 ) ;
accum1 = svadd_u64_x ( pred , accum1 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
vec = svand_n_u64_x ( pred , svld1_u64 ( pred , ( const uint64 * ) buf ) , mask64 ) ;
accum2 = svadd_u64_x ( pred , accum2 , svcnt_u64_x ( pred , vec ) ) ;
buf + = vec_len ;
bytes - = bytes_per_iteration ;
}
/*
* Add the accumulators .
*/
popcnt + = svaddv_u64 ( pred , svadd_u64_x ( pred , accum1 , accum2 ) ) ;
popcnt + = svaddv_u64 ( pred , svadd_u64_x ( pred , accum3 , accum4 ) ) ;
/*
* Process any remaining data .
*/
for ( ; bytes > 0 ; bytes - = vec_len )
{
svuint8_t vec ;
pred = svwhilelt_b8_s32 ( 0 , bytes ) ;
vec = svand_n_u8_x ( pred , svld1_u8 ( pred , ( const uint8 * ) buf ) , mask ) ;
popcnt + = svaddv_u8 ( pred , svcnt_u8_x ( pred , vec ) ) ;
buf + = vec_len ;
}
return popcnt ;
}
# else /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
/*
* When the SVE version isn ' t available , there ' s no point in using function
* pointers to vary the implementation . We instead just make these actual
* external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined .
* The compiler should be able to inline the Neon versions here .
*/
uint64
pg_popcount_optimized ( const char * buf , int bytes )
{
return pg_popcount_neon ( buf , bytes ) ;
}
uint64
pg_popcount_masked_optimized ( const char * buf , int bytes , bits8 mask )
{
return pg_popcount_masked_neon ( buf , bytes , mask ) ;
}
# endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
/*
* pg_popcount32
* Return number of 1 bits in word
@ -44,11 +313,11 @@ pg_popcount64(uint64 word)
}
/*
* pg_popcount_optimized
* pg_popcount_neon
* Returns number of 1 bits in buf
*/
uint64
pg_popcount_optimized ( const char * buf , int bytes )
static uint64
pg_popcount_neon ( const char * buf , int bytes )
{
uint8x16_t vec ;
uint64x2_t accum1 = vdupq_n_u64 ( 0 ) ,
@ -124,11 +393,11 @@ pg_popcount_optimized(const char *buf, int bytes)
}
/*
* pg_popcount_masked_optimized
* pg_popcount_masked_neon
* Returns number of 1 bits in buf after applying the mask to each byte
*/
uint64
pg_popcount_masked_optimized ( const char * buf , int bytes , bits8 mask )
static uint64
pg_popcount_masked_neon ( const char * buf , int bytes , bits8 mask )
{
uint8x16_t vec ,
maskv = vdupq_n_u8 ( mask ) ;