@@ -79,6 +79,7 @@ static inline bool vector8_has_le(const Vector8 v, const uint8 c);
7979static inline bool vector8_is_highbit_set (const Vector8 v );
8080#ifndef USE_NO_SIMD
8181static inline bool vector32_is_highbit_set (const Vector32 v );
82+ static inline uint32 vector8_highbit_mask (const Vector8 v );
8283#endif
8384
8485/* arithmetic operations */
@@ -96,6 +97,7 @@ static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
9697 */
9798#ifndef USE_NO_SIMD
9899static inline Vector8 vector8_eq (const Vector8 v1 , const Vector8 v2 );
100+ static inline Vector8 vector8_min (const Vector8 v1 , const Vector8 v2 );
99101static inline Vector32 vector32_eq (const Vector32 v1 , const Vector32 v2 );
100102#endif
101103
@@ -299,6 +301,36 @@ vector32_is_highbit_set(const Vector32 v)
299301}
300302#endif /* ! USE_NO_SIMD */
301303
/*
 * Return a bitmask formed from the high-bit of each element, i.e. bit i of
 * the result is the most significant bit of byte lane i of 'v'.
 */
#ifndef USE_NO_SIMD
static inline uint32
vector8_highbit_mask(const Vector8 v)
{
#ifdef USE_SSE2
	/* pmovmskb gathers the high bit of all 16 bytes into bits 0..15 */
	return (uint32) _mm_movemask_epi8(v);
#elif defined(USE_NEON)
	/*
	 * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
	 * returns a uint64, making it inconvenient to combine mask values from
	 * multiple vectors.
	 */
	/* lane i of this table holds the result bit for lane i, 1 << (i % 8) */
	static const uint8 mask[16] = {
		1 << 0, 1 << 1, 1 << 2, 1 << 3,
		1 << 4, 1 << 5, 1 << 6, 1 << 7,
		1 << 0, 1 << 1, 1 << 2, 1 << 3,
		1 << 4, 1 << 5, 1 << 6, 1 << 7,
	};

	/*
	 * vshrq_n_s8(v, 7) arithmetic-shifts each byte so it becomes 0xFF when
	 * its high bit is set and 0x00 otherwise; ANDing with the table leaves
	 * each lane holding either its result bit or zero.
	 */
	uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8(v, 7));
	/* rotate by 8 bytes so the high half lines up with the low half */
	uint8x16_t maskedhi = vextq_u8(masked, masked, 8);

	/*
	 * Interleaving low and high halves pairs lane i with lane i+8; viewed as
	 * uint16 lanes each pair is (low_bit | high_bit << 8), so the horizontal
	 * add ORs-by-sum all bits into one 16-bit mask (each bit appears at most
	 * once, so addition cannot carry).
	 */
	return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
#endif
}
#endif							/* ! USE_NO_SIMD */
333+
302334/*
303335 * Return the bitwise OR of the inputs
304336 */
@@ -372,4 +404,19 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
372404}
373405#endif /* ! USE_NO_SIMD */
374406
/*
 * Given two vectors, return a vector with the minimum element of each.
 * Each byte lane of the result is the unsigned minimum of the corresponding
 * lanes of 'v1' and 'v2'.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_min(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
	/* per-byte unsigned minimum */
	return _mm_min_epu8(v1, v2);
#elif defined(USE_NEON)
	/* per-byte unsigned minimum */
	return vminq_u8(v1, v2);
#endif
}
#endif							/* ! USE_NO_SIMD */
421+
375422#endif /* SIMD_H */
0 commit comments