3131#include <emmintrin.h>
3232#define USE_SSE2
3333typedef __m128i Vector8 ;
34+ typedef __m128i Vector32 ;
3435
3536#else
3637/*
3738 * If no SIMD instructions are available, we can in some cases emulate vector
38- * operations using bitwise operations on unsigned integers.
39+ * operations using bitwise operations on unsigned integers. Note that many
40+ * of the functions in this file presently do not have non-SIMD
41+ * implementations. In particular, none of the functions involving Vector32
42+ * are implemented without SIMD since it's likely not worthwhile to represent
43+ * two 32-bit integers using a uint64.
3944 */
4045#define USE_NO_SIMD
4146typedef uint64 Vector8 ;
4247#endif
4348
44-
4549/* load/store operations */
4650static inline void vector8_load (Vector8 * v , const uint8 * s );
51+ #ifndef USE_NO_SIMD
52+ static inline void vector32_load (Vector32 * v , const uint32 * s );
53+ #endif
4754
4855/* assignment operations */
4956static inline Vector8 vector8_broadcast (const uint8 c );
57+ #ifndef USE_NO_SIMD
58+ static inline Vector32 vector32_broadcast (const uint32 c );
59+ #endif
5060
5161/* element-wise comparisons to a scalar */
5262static inline bool vector8_has (const Vector8 v , const uint8 c );
@@ -56,14 +66,21 @@ static inline bool vector8_is_highbit_set(const Vector8 v);
5666
5767/* arithmetic operations */
5868static inline Vector8 vector8_or (const Vector8 v1 , const Vector8 v2 );
59-
60- /* Different semantics for SIMD architectures. */
6169#ifndef USE_NO_SIMD
70+ static inline Vector32 vector32_or (const Vector32 v1 , const Vector32 v2 );
71+ static inline Vector8 vector8_ssub (const Vector8 v1 , const Vector8 v2 );
72+ #endif
6273
63- /* comparisons between vectors */
74+ /*
75+ * comparisons between vectors
76+ *
77+ * Note: These return a vector rather than boolean, which is why we don't
78+ * have non-SIMD implementations.
79+ */
80+ #ifndef USE_NO_SIMD
6481static inline Vector8 vector8_eq (const Vector8 v1 , const Vector8 v2 );
65-
66- #endif /* ! USE_NO_SIMD */
82+ static inline Vector32 vector32_eq ( const Vector32 v1 , const Vector32 v2 );
83+ #endif
6784
6885/*
6986 * Load a chunk of memory into the given vector.
@@ -78,6 +95,15 @@ vector8_load(Vector8 *v, const uint8 *s)
7895#endif
7996}
8097
98+ #ifndef USE_NO_SIMD
99+ static inline void
100+ vector32_load (Vector32 * v , const uint32 * s )
101+ {
102+ #ifdef USE_SSE2
103+ * v = _mm_loadu_si128 ((const __m128i * ) s );
104+ #endif
105+ }
106+ #endif /* ! USE_NO_SIMD */
81107
82108/*
83109 * Create a vector with all elements set to the same value.
@@ -92,6 +118,16 @@ vector8_broadcast(const uint8 c)
92118#endif
93119}
94120
121+ #ifndef USE_NO_SIMD
122+ static inline Vector32
123+ vector32_broadcast (const uint32 c )
124+ {
125+ #ifdef USE_SSE2
126+ return _mm_set1_epi32 (c );
127+ #endif
128+ }
129+ #endif /* ! USE_NO_SIMD */
130+
95131/*
96132 * Return true if any elements in the vector are equal to the given scalar.
97133 */
@@ -118,7 +154,7 @@ vector8_has(const Vector8 v, const uint8 c)
118154 /* any bytes in v equal to c will evaluate to zero via XOR */
119155 result = vector8_has_zero (v ^ vector8_broadcast (c ));
120156#elif defined(USE_SSE2 )
121- result = _mm_movemask_epi8 ( _mm_cmpeq_epi8 (v , vector8_broadcast (c )));
157+ result = vector8_is_highbit_set ( vector8_eq (v , vector8_broadcast (c )));
122158#endif
123159
124160 Assert (assert_result == result );
@@ -133,8 +169,8 @@ vector8_has_zero(const Vector8 v)
133169{
134170#if defined(USE_NO_SIMD )
135171 /*
136- * We cannot call vector8_has() here, because that would lead to a circular
137- * definition.
172+ * We cannot call vector8_has() here, because that would lead to a
173+ * circular definition.
138174 */
139175 return vector8_has_le (v , 0 );
140176#elif defined(USE_SSE2 )
@@ -150,9 +186,6 @@ static inline bool
150186vector8_has_le (const Vector8 v , const uint8 c )
151187{
152188 bool result = false;
153- #if defined(USE_SSE2 )
154- __m128i sub ;
155- #endif
156189
157190 /* pre-compute the result for assert checking */
158191#ifdef USE_ASSERT_CHECKING
@@ -194,10 +227,10 @@ vector8_has_le(const Vector8 v, const uint8 c)
194227
195228 /*
196229 * Use saturating subtraction to find bytes <= c, which will present as
197- * NUL bytes in 'sub'.
230+ * NUL bytes. This approach is a workaround for the lack of unsigned
231+ * comparison instructions on some architectures.
198232 */
199- sub = _mm_subs_epu8 (v , vector8_broadcast (c ));
200- result = vector8_has_zero (sub );
233+ result = vector8_has_zero (vector8_ssub (v , vector8_broadcast (c )));
201234#endif
202235
203236 Assert (assert_result == result );
@@ -230,22 +263,54 @@ vector8_or(const Vector8 v1, const Vector8 v2)
230263#endif
231264}
232265
266+ #ifndef USE_NO_SIMD
267+ static inline Vector32
268+ vector32_or (const Vector32 v1 , const Vector32 v2 )
269+ {
270+ #ifdef USE_SSE2
271+ return _mm_or_si128 (v1 , v2 );
272+ #endif
273+ }
274+ #endif /* ! USE_NO_SIMD */
233275
234- /* Different semantics for SIMD architectures. */
276+ /*
277+ * Return the result of subtracting the respective elements of the input
278+ * vectors using saturation (i.e., if the operation would yield a value less
279+ * than zero, zero is returned instead). For more information on saturation
280+ * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
281+ */
235282#ifndef USE_NO_SIMD
283+ static inline Vector8
284+ vector8_ssub (const Vector8 v1 , const Vector8 v2 )
285+ {
286+ #ifdef USE_SSE2
287+ return _mm_subs_epu8 (v1 , v2 );
288+ #endif
289+ }
290+ #endif /* ! USE_NO_SIMD */
236291
237292/*
238293 * Return a vector with all bits set in each lane where the corresponding
239294 * lanes in the inputs are equal.
240295 */
296+ #ifndef USE_NO_SIMD
241297static inline Vector8
242298vector8_eq (const Vector8 v1 , const Vector8 v2 )
243299{
244300#ifdef USE_SSE2
245301 return _mm_cmpeq_epi8 (v1 , v2 );
246302#endif
247303}
304+ #endif /* ! USE_NO_SIMD */
248305
306+ #ifndef USE_NO_SIMD
307+ static inline Vector32
308+ vector32_eq (const Vector32 v1 , const Vector32 v2 )
309+ {
310+ #ifdef USE_SSE2
311+ return _mm_cmpeq_epi32 (v1 , v2 );
312+ #endif
313+ }
249314#endif /* ! USE_NO_SIMD */
250315
251316#endif /* SIMD_H */
0 commit comments