As an exercise, I re-implemented the algorithm with execution performance in mind, which can be seen on godbolt.
I vectorized the code by hand, so included a non-vectorized version (reading only the first 8, not 16 bytes) for portability... and to explain what the vector version is doing.
/// Returns the index of the keyword.
///
/// # Safety
///
/// `line` should be readable for at least 16 characters.
unsigned int IdentifyKeyword(unsigned char const* line) {
#if defined(__SSE2__)
__m128i const bytes = _mm_loadu_si128((__m128i const*) line);
// Mark < 'a'.
__m128i const high_pass_mask = _mm_cmplt_epi8(bytes, _mm_set1_epi8('a'));
unsigned int const high_pass = _mm_movemask_epi8(high_pass_mask);
// Mark > 'z'.
__m128i const low_pass_mask = _mm_cmpgt_epi8(bytes, _mm_set1_epi8('z'));
unsigned int const low_pass = _mm_movemask_epi8(low_pass_mask);
#else
// Mark < 'a'.
unsigned int const high_pass =
(line[0] < 'a' ? (1u << 0) : 0) |
(line[1] < 'a' ? (1u << 1) : 0) |
(line[2] < 'a' ? (1u << 2) : 0) |
(line[3] < 'a' ? (1u << 3) : 0) |
(line[4] < 'a' ? (1u << 4) : 0) |
(line[5] < 'a' ? (1u << 5) : 0) |
(line[6] < 'a' ? (1u << 6) : 0) |
(line[7] < 'a' ? (1u << 7) : 0)
;
// Mark > 'z'.
unsigned int const low_pass =
(line[0] > 'z' ? (1u << 0) : 0) |
(line[1] > 'z' ? (1u << 1) : 0) |
(line[2] > 'z' ? (1u << 2) : 0) |
(line[3] > 'z' ? (1u << 3) : 0) |
(line[4] > 'z' ? (1u << 4) : 0) |
(line[5] > 'z' ? (1u << 5) : 0) |
(line[6] > 'z' ? (1u << 6) : 0) |
(line[7] > 'z' ? (1u << 7) : 0)
;
#endif
// Number of leading lower-alpha characters in the string,
// encoded as the number of trailing zeroes in `high_pass | low_pass`.
size_t const leading = __builtin_ctz(high_pass | low_pass | (1u << 7));
// Read first 8 bytes.
union {
unsigned char bytes[8];
uint64_t u;
} retained;
memcpy(retained.bytes, line, sizeof(retained.bytes));
uint64_t const mask = (((uint64_t) 1) << (leading * 8)) - 1;
uint64_t const candidate = retained.u & mask;
switch (candidate) {
// "if"
case 0x0000000000006669:
return 1;
// "ifdef"
case 0x0000006665646669:
return 2;
// "ifndef"
case 0x00006665646e6669:
return 3;
// "elif"
case 0x0000000066696c65:
return 4;
// "else"
case 0x0000000065736c65:
return 5;
// "endif"
case 0x0000006669646e65:
return 6;
// "define"
case 0x0000656e69666564:
return 7;
default:
return 0;
}
}
The assembly generated is fairly straightforward:
sw::IdentifyKeyword(unsigned char const*):
movdqu xmm1, XMMWORD PTR [rdi] ; load 16 first bytes of line
mov eax, 1633771873 ; load "aaaa" into eax
mov esi, 2054847098 ; load "zzzz" into esi
movabs rdx, 439787742825 ; load "ifdef" into rdx
movd xmm0, eax ; move eax ("aaaa") into xmm0
pshufd xmm0, xmm0, 0 ; fill xmm0 with 16 x 'a'
pcmpgtb xmm0, xmm1 ; xmm0 < xmm1 ('a' < line[i])
pmovmskb eax, xmm0 ; high_pass now in eax
movd xmm0, esi ; move esi ("zzzz") into xmm0
pshufd xmm0, xmm0, 0 ; fill xmm0 with 16 x 'b'
pcmpgtb xmm1, xmm0 ; xmm1 > xmm0 (line[i] > 'z')
pmovmskb ecx, xmm1 ; low_pass now in ecx
or ecx, eax ; high_pass | low_pass in ecx
mov rax, -1
or cl, -128
rep bsf ecx, ecx ; count trailing 0s of ecx
sal rcx, 3
sal rax, cl
not rax
and rax, QWORD PTR [rdi]
cmp rax, rdx
je .L21 ; jump if candidate == "ifdef"
cmp rdx, rax
jb .L20 ; jump if candidate < "ifdef"
mov edx, 5
cmp rax, 1702063205 ; load "else" in rax
je .L18 ; jump if candidate == "else"
mov edx, 4
cmp rax, 1718185061 ; load "elif" in rax
je .L18 ; jump if candidate == "elif"
xor edx, edx
cmp rax, 26217 ; compare candidate with "if"
sete dl ; set to 1 if candidate == "if"
.L18:
mov eax, edx
ret
.L20:
movabs rcx, 111524889126244 ; load "define" in rcx
mov edx, 7
cmp rax, rcx
je .L18 ; jump if candidate == "define"
movabs rcx, 112585662686825 ; load "ifndef" in rcx
mov edx, 3
cmp rax, rcx
je .L18 ; jump if candidate == "ifndef"
movabs rdx, 439854853733 ; load "endif" in rcx
cmp rax, rdx
mov eax, 0
mov edx, 6
cmovne edx, eax ; move 0 in edx if candidate != "endif"
mov eax, edx
ret
.L21:
mov edx, 2
mov eax, edx
ret
Note how the assembly is fully linear -- no loop -- and only includes 6 branches, ever.