|
|
|
|
@ -79,6 +79,7 @@ static inline bool vector8_has_le(const Vector8 v, const uint8 c); |
|
|
|
|
static inline bool vector8_is_highbit_set(const Vector8 v); |
|
|
|
|
#ifndef USE_NO_SIMD |
|
|
|
|
static inline bool vector32_is_highbit_set(const Vector32 v); |
|
|
|
|
static inline uint32 vector8_highbit_mask(const Vector8 v); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
/* arithmetic operations */ |
|
|
|
|
@ -96,6 +97,7 @@ static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2); |
|
|
|
|
*/ |
|
|
|
|
#ifndef USE_NO_SIMD |
|
|
|
|
static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2); |
|
|
|
|
static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2); |
|
|
|
|
static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
@ -299,6 +301,36 @@ vector32_is_highbit_set(const Vector32 v) |
|
|
|
|
} |
|
|
|
|
#endif /* ! USE_NO_SIMD */ |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Return a bitmask formed from the high-bit of each element. |
|
|
|
|
*/ |
|
|
|
|
#ifndef USE_NO_SIMD |
|
|
|
|
static inline uint32 |
|
|
|
|
vector8_highbit_mask(const Vector8 v) |
|
|
|
|
{ |
|
|
|
|
#ifdef USE_SSE2 |
|
|
|
|
return (uint32) _mm_movemask_epi8(v); |
|
|
|
|
#elif defined(USE_NEON) |
|
|
|
|
/*
|
|
|
|
|
* Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that |
|
|
|
|
* returns a uint64, making it inconvenient to combine mask values from |
|
|
|
|
* multiple vectors. |
|
|
|
|
*/ |
|
|
|
|
static const uint8 mask[16] = { |
|
|
|
|
1 << 0, 1 << 1, 1 << 2, 1 << 3, |
|
|
|
|
1 << 4, 1 << 5, 1 << 6, 1 << 7, |
|
|
|
|
1 << 0, 1 << 1, 1 << 2, 1 << 3, |
|
|
|
|
1 << 4, 1 << 5, 1 << 6, 1 << 7, |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8(v, 7)); |
|
|
|
|
uint8x16_t maskedhi = vextq_u8(masked, masked, 8); |
|
|
|
|
|
|
|
|
|
return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi)); |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
#endif /* ! USE_NO_SIMD */ |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Return the bitwise OR of the inputs |
|
|
|
|
*/ |
|
|
|
|
@ -372,4 +404,19 @@ vector32_eq(const Vector32 v1, const Vector32 v2) |
|
|
|
|
} |
|
|
|
|
#endif /* ! USE_NO_SIMD */ |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Given two vectors, return a vector with the minimum element of each. |
|
|
|
|
*/ |
|
|
|
|
#ifndef USE_NO_SIMD |
|
|
|
|
static inline Vector8 |
|
|
|
|
vector8_min(const Vector8 v1, const Vector8 v2) |
|
|
|
|
{ |
|
|
|
|
#ifdef USE_SSE2 |
|
|
|
|
return _mm_min_epu8(v1, v2); |
|
|
|
|
#elif defined(USE_NEON) |
|
|
|
|
return vminq_u8(v1, v2); |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
#endif /* ! USE_NO_SIMD */ |
|
|
|
|
|
|
|
|
|
#endif /* SIMD_H */ |
|
|
|
|
|