3333typedef __m128i Vector8 ;
3434typedef __m128i Vector32 ;
3535
36+ #elif defined(__aarch64__ ) && defined(__ARM_NEON )
37+ /*
38+ * We use the Neon instructions if the compiler provides access to them (as
39+ * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
40+ * technically optional for aarch64, it appears that all available 64-bit
41+ * hardware does have it. Neon exists in some 32-bit hardware too, but we
42+ * could not realistically use it there without a run-time check, which seems
43+ * not worth the trouble for now.
44+ */
45+ #include <arm_neon.h>
46+ #define USE_NEON
47+ typedef uint8x16_t Vector8 ;
48+ typedef uint32x4_t Vector32 ;
49+
3650#else
3751/*
3852 * If no SIMD instructions are available, we can in some cases emulate vector
@@ -90,6 +104,8 @@ vector8_load(Vector8 *v, const uint8 *s)
90104{
91105#if defined(USE_SSE2 )
92106 * v = _mm_loadu_si128 ((const __m128i * ) s );
107+ #elif defined(USE_NEON )
108+ * v = vld1q_u8 (s );
93109#else
94110 memcpy (v , s , sizeof (Vector8 ));
95111#endif
@@ -101,6 +117,8 @@ vector32_load(Vector32 *v, const uint32 *s)
101117{
102118#ifdef USE_SSE2
103119 * v = _mm_loadu_si128 ((const __m128i * ) s );
120+ #elif defined(USE_NEON )
121+ * v = vld1q_u32 (s );
104122#endif
105123}
106124#endif /* ! USE_NO_SIMD */
@@ -113,6 +131,8 @@ vector8_broadcast(const uint8 c)
113131{
114132#if defined(USE_SSE2 )
115133 return _mm_set1_epi8 (c );
134+ #elif defined(USE_NEON )
135+ return vdupq_n_u8 (c );
116136#else
117137 return ~UINT64CONST (0 ) / 0xFF * c ;
118138#endif
@@ -124,6 +144,8 @@ vector32_broadcast(const uint32 c)
124144{
125145#ifdef USE_SSE2
126146 return _mm_set1_epi32 (c );
147+ #elif defined(USE_NEON )
148+ return vdupq_n_u32 (c );
127149#endif
128150}
129151#endif /* ! USE_NO_SIMD */
@@ -153,7 +175,7 @@ vector8_has(const Vector8 v, const uint8 c)
153175#if defined(USE_NO_SIMD )
154176 /* any bytes in v equal to c will evaluate to zero via XOR */
155177 result = vector8_has_zero (v ^ vector8_broadcast (c ));
156- #elif defined( USE_SSE2 )
178+ #else
157179 result = vector8_is_highbit_set (vector8_eq (v , vector8_broadcast (c )));
158180#endif
159181
@@ -173,7 +195,7 @@ vector8_has_zero(const Vector8 v)
173195 * circular definition.
174196 */
175197 return vector8_has_le (v , 0 );
176- #elif defined( USE_SSE2 )
198+ #else
177199 return vector8_has (v , 0 );
178200#endif
179201}
@@ -223,7 +245,7 @@ vector8_has_le(const Vector8 v, const uint8 c)
223245 }
224246 }
225247 }
226- #elif defined( USE_SSE2 )
248+ #else
227249
228250 /*
229251 * Use saturating subtraction to find bytes <= c, which will present as
@@ -245,6 +267,8 @@ vector8_is_highbit_set(const Vector8 v)
245267{
246268#ifdef USE_SSE2
247269 return _mm_movemask_epi8 (v ) != 0 ;
270+ #elif defined(USE_NEON )
271+ return vmaxvq_u8 (v ) > 0x7F ;
248272#else
249273 return v & vector8_broadcast (0x80 );
250274#endif
@@ -258,6 +282,8 @@ vector8_or(const Vector8 v1, const Vector8 v2)
258282{
259283#ifdef USE_SSE2
260284 return _mm_or_si128 (v1 , v2 );
285+ #elif defined(USE_NEON )
286+ return vorrq_u8 (v1 , v2 );
261287#else
262288 return v1 | v2 ;
263289#endif
@@ -269,6 +295,8 @@ vector32_or(const Vector32 v1, const Vector32 v2)
269295{
270296#ifdef USE_SSE2
271297 return _mm_or_si128 (v1 , v2 );
298+ #elif defined(USE_NEON )
299+ return vorrq_u32 (v1 , v2 );
272300#endif
273301}
274302#endif /* ! USE_NO_SIMD */
@@ -285,6 +313,8 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
285313{
286314#ifdef USE_SSE2
287315 return _mm_subs_epu8 (v1 , v2 );
316+ #elif defined(USE_NEON )
317+ return vqsubq_u8 (v1 , v2 );
288318#endif
289319}
290320#endif /* ! USE_NO_SIMD */
@@ -299,6 +329,8 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
299329{
300330#ifdef USE_SSE2
301331 return _mm_cmpeq_epi8 (v1 , v2 );
332+ #elif defined(USE_NEON )
333+ return vceqq_u8 (v1 , v2 );
302334#endif
303335}
304336#endif /* ! USE_NO_SIMD */
@@ -309,6 +341,8 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
309341{
310342#ifdef USE_SSE2
311343 return _mm_cmpeq_epi32 (v1 , v2 );
344+ #elif defined(USE_NEON )
345+ return vceqq_u32 (v1 , v2 );
312346#endif
313347}
314348#endif /* ! USE_NO_SIMD */
0 commit comments