Revisions to Arcane algorithm to find a keyword in a lookup table

Added assembly for optimized version.

Source Link

edited Apr 18 at 11:37

6.4k
1
21
34

As an exercise, I re-implemented the algorithm with execution performance in mind, which can be seen on godbolt.

I vectorized the code by hand, so included a non-vectorized version (reading only the first 8, not 16 bytes) for portability... and to explain what the vector version is doing.

    /// Returns the index of the keyword.
    ///
    /// #   Safety
    ///
    /// `line` should be readable for at least 16 characters.
    unsigned int IdentifyKeyword(unsigned char const* line) {
#if defined(__SSE2__)
        __m128i const bytes = _mm_loadu_si128((__m128i const*) line);

        //  Mark < 'a'.
        __m128i const high_pass_mask = _mm_cmplt_epi8(bytes, _mm_set1_epi8('a'));
        unsigned int const high_pass = _mm_movemask_epi8(high_pass_mask);

        //  Mark > 'z'.
        __m128i const low_pass_mask = _mm_cmpgt_epi8(bytes, _mm_set1_epi8('z'));
        unsigned int const low_pass = _mm_movemask_epi8(low_pass_mask);
#else
        //  Mark < 'a'.
        unsigned int const high_pass =
            (line[0] < 'a' ? (1u << 0) : 0) |
            (line[1] < 'a' ? (1u << 1) : 0) |
            (line[2] < 'a' ? (1u << 2) : 0) |
            (line[3] < 'a' ? (1u << 3) : 0) |
            (line[4] < 'a' ? (1u << 4) : 0) |
            (line[5] < 'a' ? (1u << 5) : 0) |
            (line[6] < 'a' ? (1u << 6) : 0) |
            (line[7] < 'a' ? (1u << 7) : 0)
        ;

        //  Mark > 'z'.
        unsigned int const low_pass =
            (line[0] > 'z' ? (1u << 0) : 0) |
            (line[1] > 'z' ? (1u << 1) : 0) |
            (line[2] > 'z' ? (1u << 2) : 0) |
            (line[3] > 'z' ? (1u << 3) : 0) |
            (line[4] > 'z' ? (1u << 4) : 0) |
            (line[5] > 'z' ? (1u << 5) : 0) |
            (line[6] > 'z' ? (1u << 6) : 0) |
            (line[7] > 'z' ? (1u << 7) : 0) 
        ;
#endif

        //  Number of leading lower-alpha characters in the string,
        //  encoded as the number of trailing zeroes in `high_pass | low_pass`.
        size_t const leading = __builtin_ctz(high_pass | low_pass | (1u << 7));

        //  Read first 8 bytes.
        union {
            unsigned char bytes[8];
            uint64_t u;
        } retained;

        memcpy(retained.bytes, line, sizeof(retained.bytes));

        uint64_t const mask = (((uint64_t) 1) << (leading * 8)) - 1;
        uint64_t const candidate = retained.u & mask;

        switch (candidate) {
        //  "if"
        case 0x0000000000006669:
            return 1;
        //  "ifdef"
        case 0x0000006665646669:
            return 2;
        //  "ifndef"
        case 0x00006665646e6669:
            return 3;
        //  "elif"
        case 0x0000000066696c65:
            return 4;
        //  "else"
        case 0x0000000065736c65:
            return 5;
        //  "endif"
        case 0x0000006669646e65:
            return 6;
        //  "define"
        case 0x0000656e69666564:
            return 7;
        default:
            return 0;
        }
    }

The assembly generated is fairly straightforward:

sw::IdentifyKeyword(unsigned char const*):
        movdqu  xmm1, XMMWORD PTR [rdi]   ; load 16 first bytes of line
        mov     eax, 1633771873           ; load "aaaa" into eax
        mov     esi, 2054847098           ; load "zzzz" into esi
        movabs  rdx, 439787742825         ; load "ifdef" into rdx
        movd    xmm0, eax                 ; move eax ("aaaa") into xmm0
        pshufd  xmm0, xmm0, 0             ; fill xmm0 with 16 x 'a'
        pcmpgtb xmm0, xmm1                ; xmm0 < xmm1 ('a' < line[i])
        pmovmskb        eax, xmm0         ; high_pass now in eax
        movd    xmm0, esi                 ; move esi ("zzzz") into xmm0
        pshufd  xmm0, xmm0, 0             ; fill xmm0 with 16 x 'b'
        pcmpgtb xmm1, xmm0                ; xmm1 > xmm0 (line[i] > 'z')
        pmovmskb        ecx, xmm1         ; low_pass now in ecx
        or      ecx, eax                  ; high_pass | low_pass in ecx
        mov     rax, -1
        or      cl, -128
        rep bsf ecx, ecx                  ; count trailing 0s of ecx
        sal     rcx, 3
        sal     rax, cl
        not     rax
        and     rax, QWORD PTR [rdi]
        cmp     rax, rdx
        je      .L21                      ; jump if candidate == "ifdef"
        cmp     rdx, rax
        jb      .L20                      ; jump if candidate < "ifdef"
        mov     edx, 5
        cmp     rax, 1702063205           ; load "else" in rax
        je      .L18                      ; jump if candidate == "else"
        mov     edx, 4
        cmp     rax, 1718185061           ; load "elif" in rax
        je      .L18                      ; jump if candidate == "elif"
        xor     edx, edx
        cmp     rax, 26217                ; compare candidate with "if"
        sete    dl                        ; set to 1 if candidate == "if"
.L18:
        mov     eax, edx
        ret
.L20:
        movabs  rcx, 111524889126244      ; load "define" in rcx
        mov     edx, 7
        cmp     rax, rcx
        je      .L18                      ; jump if candidate == "define"
        movabs  rcx, 112585662686825      ; load "ifndef" in rcx
        mov     edx, 3
        cmp     rax, rcx
        je      .L18                      ; jump if candidate == "ifndef"
        movabs  rdx, 439854853733         ; load "endif" in rcx
        cmp     rax, rdx
        mov     eax, 0
        mov     edx, 6
        cmovne  edx, eax                  ; move 0 in edx if candidate != "endif"
        mov     eax, edx
        ret
.L21:
        mov     edx, 2
        mov     eax, edx
        ret

Note how the assembly is fully linear -- no loop -- and only includes 6 branches, ever.

As an exercise, I re-implemented the algorithm with execution performance in mind, which can be seen on godbolt.

I vectorized the code by hand, so included a non-vectorized version (reading only the first 8, not 16 bytes) for portability... and to explain what the vector version is doing.

    /// Returns the index of the keyword.
    ///
    /// #   Safety
    ///
    /// `line` should be readable for at least 16 characters.
    unsigned int IdentifyKeyword(unsigned char const* line) {
#if defined(__SSE2__)
        __m128i const bytes = _mm_loadu_si128((__m128i const*) line);

        //  Mark < 'a'.
        __m128i const high_pass_mask = _mm_cmplt_epi8(bytes, _mm_set1_epi8('a'));
        unsigned int const high_pass = _mm_movemask_epi8(high_pass_mask);

        //  Mark > 'z'.
        __m128i const low_pass_mask = _mm_cmpgt_epi8(bytes, _mm_set1_epi8('z'));
        unsigned int const low_pass = _mm_movemask_epi8(low_pass_mask);
#else
        //  Mark < 'a'.
        unsigned int const high_pass =
            (line[0] < 'a' ? (1u << 0) : 0) |
            (line[1] < 'a' ? (1u << 1) : 0) |
            (line[2] < 'a' ? (1u << 2) : 0) |
            (line[3] < 'a' ? (1u << 3) : 0) |
            (line[4] < 'a' ? (1u << 4) : 0) |
            (line[5] < 'a' ? (1u << 5) : 0) |
            (line[6] < 'a' ? (1u << 6) : 0) |
            (line[7] < 'a' ? (1u << 7) : 0)
        ;

        //  Mark > 'z'.
        unsigned int const low_pass =
            (line[0] > 'z' ? (1u << 0) : 0) |
            (line[1] > 'z' ? (1u << 1) : 0) |
            (line[2] > 'z' ? (1u << 2) : 0) |
            (line[3] > 'z' ? (1u << 3) : 0) |
            (line[4] > 'z' ? (1u << 4) : 0) |
            (line[5] > 'z' ? (1u << 5) : 0) |
            (line[6] > 'z' ? (1u << 6) : 0) |
            (line[7] > 'z' ? (1u << 7) : 0) 
        ;
#endif

        //  Number of leading lower-alpha characters in the string,
        //  encoded as the number of trailing zeroes in `high_pass | low_pass`.
        size_t const leading = __builtin_ctz(high_pass | low_pass | (1u << 7));

        //  Read first 8 bytes.
        union {
            unsigned char bytes[8];
            uint64_t u;
        } retained;

        memcpy(retained.bytes, line, sizeof(retained.bytes));

        uint64_t const mask = (((uint64_t) 1) << (leading * 8)) - 1;
        uint64_t const candidate = retained.u & mask;

        switch (candidate) {
        //  "if"
        case 0x0000000000006669:
            return 1;
        //  "ifdef"
        case 0x0000006665646669:
            return 2;
        //  "ifndef"
        case 0x00006665646e6669:
            return 3;
        //  "elif"
        case 0x0000000066696c65:
            return 4;
        //  "else"
        case 0x0000000065736c65:
            return 5;
        //  "endif"
        case 0x0000006669646e65:
            return 6;
        //  "define"
        case 0x0000656e69666564:
            return 7;
        default:
            return 0;
        }
    }

The assembly generated is fairly straightforward:

sw::IdentifyKeyword(unsigned char const*):
        movdqu  xmm1, XMMWORD PTR [rdi]   ; load 16 first bytes of line
        mov     eax, 1633771873           ; load "aaaa" into eax
        mov     esi, 2054847098           ; load "zzzz" into esi
        movabs  rdx, 439787742825         ; load "ifdef" into rdx
        movd    xmm0, eax                 ; move eax ("aaaa") into xmm0
        pshufd  xmm0, xmm0, 0             ; fill xmm0 with 16 x 'a'
        pcmpgtb xmm0, xmm1                ; xmm0 < xmm1 ('a' < line[i])
        pmovmskb        eax, xmm0         ; high_pass now in eax
        movd    xmm0, esi                 ; move esi ("zzzz") into xmm0
        pshufd  xmm0, xmm0, 0             ; fill xmm0 with 16 x 'b'
        pcmpgtb xmm1, xmm0                ; xmm1 > xmm0 (line[i] > 'z')
        pmovmskb        ecx, xmm1         ; low_pass now in ecx
        or      ecx, eax                  ; high_pass | low_pass in ecx
        mov     rax, -1
        or      cl, -128
        rep bsf ecx, ecx                  ; count trailing 0s of ecx
        sal     rcx, 3
        sal     rax, cl
        not     rax
        and     rax, QWORD PTR [rdi]
        cmp     rax, rdx
        je      .L21                      ; jump if candidate == "ifdef"
        cmp     rdx, rax
        jb      .L20                      ; jump if candidate < "ifdef"
        mov     edx, 5
        cmp     rax, 1702063205           ; load "else" in rax
        je      .L18                      ; jump if candidate == "else"
        mov     edx, 4
        cmp     rax, 1718185061           ; load "elif" in rax
        je      .L18                      ; jump if candidate == "elif"
        xor     edx, edx
        cmp     rax, 26217                ; compare candidate with "if"
        sete    dl                        ; set to 1 if candidate == "if"
.L18:
        mov     eax, edx
        ret
.L20:
        movabs  rcx, 111524889126244      ; load "define" in rcx
        mov     edx, 7
        cmp     rax, rcx
        je      .L18                      ; jump if candidate == "define"
        movabs  rcx, 112585662686825      ; load "ifndef" in rcx
        mov     edx, 3
        cmp     rax, rcx
        je      .L18                      ; jump if candidate == "ifndef"
        movabs  rdx, 439854853733         ; load "endif" in rcx
        cmp     rax, rdx
        mov     eax, 0
        mov     edx, 6
        cmovne  edx, eax                  ; move 0 in edx if candidate != "endif"
        mov     eax, edx
        ret
.L21:
        mov     edx, 2
        mov     eax, edx
        ret

Note how the assembly is fully linear -- no loop -- and only includes 6 branches, ever.

Expand on what tests should look like.

Source Link

edited Apr 18 at 7:17

Matthieu M.

6.4k
1
21
34

Documentation & Tests

There's no explanation of what the function is supposed to do in the code, typically found as a comment above the function.

For example, the if in "#if 0" is not matched by your code, even though from the description you gave in text, and the name of the function, it seems like it should match.

Tests

For example, the if in "#if 0" is not matched by your codeA simple way to perform unit-tests, even though from the description you gave in textC, and the name ofis the functionfollowing:

static void CheckFindKeyword(unsigned int expected, TCHAR const* line) {
    static size_t const MAX_DISPLAY = 16;

    unsigned int actual = FindKeyword(line);

    if (actual == expected) { return; }

    size_t len = strlen(line);
    TCHAR const* tail = "";

    if (len > MAX_DISPLAY) {
        len = MAX_DISPLAY;
        tail = "...";
    }

    fprintf(stderr, "Error in FindKeyword(\"%.*s%s\"): expected %u, got %u", (int) len, line, tail, expected, actual);

    exit(EXIT_FAILURE);
}

And then, it seemsmain looks like it should match.:

int _tmain(int argc, TCHAR* argv[]) {
    static unsigned int const NOT_FOUND = 0;

    CheckFindKeyword(NOT_FOUND, _T("")));
    CheckFindKeyword(NOT_FOUND, _T(" ")));
    CheckFindKeyword(NOT_FOUND, _T(" aaa")));

    // ...
}

Documentation & Tests

There's no explanation of what the function is supposed to do in the code, typically found as a comment above the function.

For example, the if in "#if 0" is not matched by your code, even though from the description you gave in text, and the name of the function, it seems like it should match.

Documentation

There's no explanation of what the function is supposed to do in the code, typically found as a comment above the function.

For example, the if in "#if 0" is not matched by your code, even though from the description you gave in text, and the name of the function, it seems like it should match.

Tests

A simple way to perform unit-tests, in C, is the following:

static void CheckFindKeyword(unsigned int expected, TCHAR const* line) {
    static size_t const MAX_DISPLAY = 16;

    unsigned int actual = FindKeyword(line);

    if (actual == expected) { return; }

    size_t len = strlen(line);
    TCHAR const* tail = "";

    if (len > MAX_DISPLAY) {
        len = MAX_DISPLAY;
        tail = "...";
    }

    fprintf(stderr, "Error in FindKeyword(\"%.*s%s\"): expected %u, got %u", (int) len, line, tail, expected, actual);

    exit(EXIT_FAILURE);
}

And then, main looks like:

int _tmain(int argc, TCHAR* argv[]) {
    static unsigned int const NOT_FOUND = 0;

    CheckFindKeyword(NOT_FOUND, _T("")));
    CheckFindKeyword(NOT_FOUND, _T(" ")));
    CheckFindKeyword(NOT_FOUND, _T(" aaa")));

    // ...
}

edited body

Source Link

edited Apr 17 at 16:01

Matthieu M.

6.4k
1
21
34

Documentation & Tests

There's no explanation of what the function is supposed to do in the code, typically found as a comment above the function.

There's no test (in your submission). Test-suites are very useful for code-review as they allow visualizing what the developer intended, and potentially what they didn't think of covering.

For example, the if in "#if 0" is not matched by your code, even though from the description you gave in text, and the name of the function, it seems like it should match.

Space?

The choice of space (or less) for termination is weird, a brief look at the ASCII table shows:

at 0x20.
( and ) at 0x28 and 0x29.
0 and 9 at 0x30 and 0x39.
; at 0x3b.
A and Z at 0x41 and 0x5a.
[ and ] at 0x5b and 0x5c.
_ at 0x5f.
a and z at 0x61 and 0x7a.
{ and } at 0x7b and 0x7d.

Note that (, ;, [, and { are all fairly natural "delimiters", for example, it's quite usual to type break;, continue; or return;.

On the other hand, do note that _ is typically allowed for identifiers, and _define shouldn't match the define keyword.

Therefore the choice of < ' ' is odd.

If keywords are limited to lowercase ASCII characters -- a common restriction -- then one can simply check byte < 'a' &&|| byte > 'z' instead for the end of keyword check.

Algorithmic complexity vs Execution performance

The idea of reading every single character once (and only once) or of never backtracking are powerful ideas to guarantee algorithmic complexity.

Unfortunately, execution performance is not strictly tied to algorithmic complexity, and byte-by-byte branching, especially with the high number of branches you have here tends to be painfully slow.

Documentation & Tests

There's no explanation of what the function is supposed to do in the code, typically found as a comment above the function.

There's no test (in your submission). Test-suites are very useful for code-review as they allow visualizing what the developer intended, and potentially what they didn't think of covering.

For example, the if in "#if 0" is not matched by your code, even though from the description you gave in text, and the name of the function, it seems like it should match.

Space?

The choice of space (or less) for termination is weird, a brief look at the ASCII table shows:

at 0x20.
( and ) at 0x28 and 0x29.
0 and 9 at 0x30 and 0x39.
; at 0x3b.
A and Z at 0x41 and 0x5a.
[ and ] at 0x5b and 0x5c.
_ at 0x5f.
a and z at 0x61 and 0x7a.
{ and } at 0x7b and 0x7d.

Note that (, ;, [, and { are all fairly natural "delimiters", for example, it's quite usual to type break;, continue; or return;.

On the other hand, do note that _ is typically allowed for identifiers, and _define shouldn't match the define keyword.

Therefore the choice of < ' ' is odd.

If keywords are limited to lowercase ASCII characters -- a common restriction -- then one can simply check byte < 'a' && byte > 'z' instead for the end of keyword check.

Algorithmic complexity vs Execution performance

The idea of reading every single character once (and only once) or of never backtracking are powerful ideas to guarantee algorithmic complexity.

Unfortunately, execution performance is not strictly tied to algorithmic complexity, and byte-by-byte branching, especially with the high number of branches you have here tends to be painfully slow.

Documentation & Tests

There's no explanation of what the function is supposed to do in the code, typically found as a comment above the function.

There's no test (in your submission). Test-suites are very useful for code-review as they allow visualizing what the developer intended, and potentially what they didn't think of covering.

For example, the if in "#if 0" is not matched by your code, even though from the description you gave in text, and the name of the function, it seems like it should match.

Space?

The choice of space (or less) for termination is weird, a brief look at the ASCII table shows:

at 0x20.
( and ) at 0x28 and 0x29.
0 and 9 at 0x30 and 0x39.
; at 0x3b.
A and Z at 0x41 and 0x5a.
[ and ] at 0x5b and 0x5c.
_ at 0x5f.
a and z at 0x61 and 0x7a.
{ and } at 0x7b and 0x7d.

Note that (, ;, [, and { are all fairly natural "delimiters", for example, it's quite usual to type break;, continue; or return;.

On the other hand, do note that _ is typically allowed for identifiers, and _define shouldn't match the define keyword.

Therefore the choice of < ' ' is odd.

If keywords are limited to lowercase ASCII characters -- a common restriction -- then one can simply check byte < 'a' || byte > 'z' instead for the end of keyword check.

Algorithmic complexity vs Execution performance

The idea of reading every single character once (and only once) or of never backtracking are powerful ideas to guarantee algorithmic complexity.

Unfortunately, execution performance is not strictly tied to algorithmic complexity, and byte-by-byte branching, especially with the high number of branches you have here tends to be painfully slow.

added 529 characters in body

Source Link

edited Apr 17 at 14:11

Matthieu M.

6.4k
1
21
34

Loading

Source Link

answered Apr 17 at 12:05

Matthieu M.

6.4k
1
21
34

Loading

Stack Exchange Network

Return to Answer

Documentation & Tests

Tests

Documentation & Tests

Documentation

Tests

Documentation & Tests

Space?

Algorithmic complexity vs Execution performance

Documentation & Tests

Space?

Algorithmic complexity vs Execution performance

Documentation & Tests

Space?

Algorithmic complexity vs Execution performance