2

I'm trying to edit an existing C++ program to search text for multiple strings rather than a single string. I'm a complete C++ noob so I'm hoping someone can point me to a function that will work and is fast. I tried using regular expressions (see my earlier post) and this works but regular expressions are way too slow for my purpose. Take the following code (courtesy of @WiktorStribiżew):

#include <iostream>
#include <regex>
using namespace std;

int main() {
    std::string pattern("A|D");         // Regex expression
    std::regex rx(pattern);             // Getting the regex object 

    std::string s("ABCDEABCABD");       // Defining the string input
    std::ptrdiff_t number_of_matches = std::distance(  // Count the number of matches inside the iterator
        std::sregex_iterator(s.begin(), s.end(), rx),
        std::sregex_iterator());

    std::cout << number_of_matches << std::endl;  // Displaying results
    return 0;
}

This outputs the wanted answer (5) but is too slow in my purpose (searching every line in ~100+ GB files). How can I make this faster? The input subject string s is read from a file and the query string pattern is supplied on the command line, so it comes as a string (separated by pipes into A|D, for example).

9
  • 2
    "How can I make this faster?" - Multithreading. And for such a simple regex as A|D, just loop through all characters of the string and check for both A and D. Commented Apr 1, 2016 at 15:22
  • @OMGtechy have not tried anything else, not familiar with C or C++ at all so not sure where to start. Commented Apr 1, 2016 at 15:29
  • @Siguza Ideally something that's optimized for speed in the code, it's slightly more complicated than A|D (in fact the nucleotide sequence, TTAGGG|TCAGGG). Already using multiple cores to run several of these at the same time. Commented Apr 1, 2016 at 15:33
  • Can there be more than two patterns? (The subject says multiple) Commented Apr 1, 2016 at 15:44
  • Yea, anywhere from 2 to 4 Commented Apr 1, 2016 at 15:49

1 Answer 1

4

Regex and performance bite each other.
I rewrote your example to not use Regex, and compiled both versions (on OS X, using g++ 5.3.0 with -O3 -fwhole-program):

#include <iostream>
using namespace std;

#define NUM_PATTERNS 2

int main() {
    std::string patterns[] = {std::string("A"), std::string("D")};
    std::string str("ABCDEABCABD");
    unsigned int count = 0;

    for(int i = 0; i < NUM_PATTERNS; ++i)
    {
        for(int pos = 0; ; )
        {
            pos = str.find(patterns[i], pos);
            if(pos == std::string::npos)
            {
                break;
            }
            ++pos;      // in order to not match again
            ++count;
        }
    }

    std::cout << count << std::endl;  // Displaying results
    return 0;
}

The generated assembly was 545 lines:

    .cstring
    .align 3
LC0:
    .ascii "basic_string::_M_construct null not valid\0"
    .section __TEXT,__text_cold,regular,pure_instructions
    .align 1
LCOLDB1:
    .text
LHOTB1:
    .align 1,0x90
    .align 4,0x90
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18:
LFB1645:
    pushq   %r13
LCFI0:
    pushq   %r12
LCFI1:
    leaq    16(%rdi), %r12
    pushq   %rbp
LCFI2:
    pushq   %rbx
LCFI3:
    subq    $24, %rsp
LCFI4:
    testq   %rsi, %rsi
    movq    %r12, (%rdi)
    je  L2
    movq    %rdi, %rbx
    movq    %rsi, %rdi
    movq    %rsi, %r13
    call    _strlen
    cmpq    $15, %rax
    movq    %rax, %rbp
    movq    %rax, 8(%rsp)
    ja  L13
    cmpq    $1, %rax
    je  L14
    testq   %rax, %rax
    jne L15
L6:
    movq    8(%rsp), %rax
    movq    (%rbx), %rdx
    movq    %rax, 8(%rbx)
    movb    $0, (%rdx,%rax)
    addq    $24, %rsp
LCFI5:
    popq    %rbx
LCFI6:
    popq    %rbp
LCFI7:
    popq    %r12
LCFI8:
    popq    %r13
LCFI9:
    ret
L13:
LCFI10:
    leaq    8(%rsp), %rsi
    xorl    %edx, %edx
    movq    %rbx, %rdi
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm
    movq    8(%rsp), %rdx
    movq    %rax, (%rbx)
    movq    %rax, %rdi
    movq    %rdx, 16(%rbx)
L4:
    movq    %rbp, %rdx
    movq    %r13, %rsi
    call    _memcpy
    jmp L6
L14:
    movzbl  0(%r13), %eax
    movb    %al, 16(%rbx)
    jmp L6
L2:
    leaq    LC0(%rip), %rdi
    call    __ZSt19__throw_logic_errorPKc
L15:
    movq    %r12, %rdi
    jmp L4
LFE1645:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE1:
    .text
LHOTE1:
    .cstring
LC2:
    .ascii "A\0"
LC3:
    .ascii "D\0"
LC4:
    .ascii "ABCDEABCABD\0"
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB5:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB5:
    .align 4
    .globl _main
_main:
LFB1425:
    pushq   %r15
LCFI11:
    leaq    LC2(%rip), %rsi
    pushq   %r14
LCFI12:
    pushq   %r13
LCFI13:
    pushq   %r12
LCFI14:
    pushq   %rbp
LCFI15:
    pushq   %rbx
LCFI16:
    subq    $104, %rsp
LCFI17:
    leaq    32(%rsp), %rbp
    movq    %rbp, %rdi
LEHB0:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE0:
    leaq    32(%rbp), %rdi
    leaq    LC3(%rip), %rsi
LEHB1:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE1:
    leaq    LC4(%rip), %rsi
    movq    %rsp, %rdi
    movq    %rsp, %r12
LEHB2:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE2:
    leaq    8(%rbp), %r14
    xorl    %r15d, %r15d
    xorl    %ebx, %ebx
L22:
    leaq    (%r15,%r14), %r13
    xorl    %eax, %eax
    jmp L21
    .align 4
L44:
    addl    $1, %eax
    addl    $1, %ebx
L21:
    movq    0(%rbp,%r15), %rsi
    movslq  %eax, %rdx
    movq    %r12, %rdi
    movq    0(%r13), %rcx
    call    __ZNKSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE4findEPKcmm
    cmpl    $-1, %eax
    jne L44
    addq    $32, %r15
    cmpq    $64, %r15
    jne L22
    movq    __ZSt4cout@GOTPCREL(%rip), %rdi
    movl    %ebx, %esi
LEHB3:
    call    __ZNSo9_M_insertImEERSoT_
    movq    %rax, %rdi
    call    __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
LEHE3:
    movq    (%rsp), %rdi
    addq    $16, %r12
    cmpq    %r12, %rdi
    je  L23
    call    __ZdlPv
L23:
    movq    64(%rsp), %rdi
    leaq    48(%rbp), %rax
    cmpq    %rax, %rdi
    je  L24
    call    __ZdlPv
L24:
    movq    32(%rsp), %rdi
    addq    $16, %rbp
    cmpq    %rbp, %rdi
    je  L39
    call    __ZdlPv
L39:
    addq    $104, %rsp
LCFI18:
    xorl    %eax, %eax
    popq    %rbx
LCFI19:
    popq    %rbp
LCFI20:
    popq    %r12
LCFI21:
    popq    %r13
LCFI22:
    popq    %r14
LCFI23:
    popq    %r15
LCFI24:
    ret
L38:
LCFI25:
    movq    %rax, %r12
    movl    $1, %eax
L19:
    movl    $1, %ebx
    subq    %rax, %rbx
    salq    $5, %rbx
    addq    %rbp, %rbx
L29:
    cmpq    %rbp, %rbx
    je  L27
    subq    $32, %rbx
    movq    (%rbx), %rdi
    leaq    16(%rbx), %rax
    cmpq    %rax, %rdi
    je  L29
    call    __ZdlPv
    jmp L29
L36:
    movq    (%rsp), %rdi
    addq    $16, %r12
    movq    %rax, %rbx
    cmpq    %r12, %rdi
    je  L32
    call    __ZdlPv
L32:
    movq    64(%rsp), %rdi
    leaq    48(%rbp), %rax
    cmpq    %rax, %rdi
    je  L33
    call    __ZdlPv
L33:
    movq    32(%rsp), %rdi
    addq    $16, %rbp
    cmpq    %rbp, %rdi
    je  L34
    call    __ZdlPv
L34:
    movq    %rbx, %rdi
LEHB4:
    call    __Unwind_Resume
L27:
    movq    %r12, %rdi
    call    __Unwind_Resume
LEHE4:
L37:
    movq    %rax, %rbx
    jmp L32
L35:
    movq    %rax, %r12
    xorl    %eax, %eax
    jmp L19
LFE1425:
    .section __DATA,__gcc_except_tab
GCC_except_table0:
LLSDA1425:
    .byte   0xff
    .byte   0xff
    .byte   0x3
    .byte   0x41
    .set L$set$0,LEHB0-LFB1425
    .long L$set$0
    .set L$set$1,LEHE0-LEHB0
    .long L$set$1
    .set L$set$2,L38-LFB1425
    .long L$set$2
    .byte   0
    .set L$set$3,LEHB1-LFB1425
    .long L$set$3
    .set L$set$4,LEHE1-LEHB1
    .long L$set$4
    .set L$set$5,L35-LFB1425
    .long L$set$5
    .byte   0
    .set L$set$6,LEHB2-LFB1425
    .long L$set$6
    .set L$set$7,LEHE2-LEHB2
    .long L$set$7
    .set L$set$8,L37-LFB1425
    .long L$set$8
    .byte   0
    .set L$set$9,LEHB3-LFB1425
    .long L$set$9
    .set L$set$10,LEHE3-LEHB3
    .long L$set$10
    .set L$set$11,L36-LFB1425
    .long L$set$11
    .byte   0
    .set L$set$12,LEHB4-LFB1425
    .long L$set$12
    .set L$set$13,LEHE4-LEHB4
    .long L$set$13
    .long   0
    .byte   0
    .section __TEXT,__text_startup,regular,pure_instructions
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE5:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE5:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB6:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB6:
    .align 4
__GLOBAL__sub_I_test1.cpp:
LFB1626:
    leaq    __ZStL8__ioinit(%rip), %rdi
    subq    $8, %rsp
LCFI26:
    call    __ZNSt8ios_base4InitC1Ev
    movq    __ZNSt8ios_base4InitD1Ev@GOTPCREL(%rip), %rdi
    leaq    ___dso_handle(%rip), %rdx
    addq    $8, %rsp
LCFI27:
    leaq    __ZStL8__ioinit(%rip), %rsi
    jmp ___cxa_atexit
LFE1626:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE6:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE6:
    .static_data
__ZStL8__ioinit:
    .space  1
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
    .set L$set$14,LECIE1-LSCIE1
    .long L$set$14
LSCIE1:
    .long   0
    .byte   0x1
    .ascii "zPLR\0"
    .byte   0x1
    .byte   0x78
    .byte   0x10
    .byte   0x7
    .byte   0x9b
    .long   ___gxx_personality_v0+4@GOTPCREL
    .byte   0x10
    .byte   0x10
    .byte   0xc
    .byte   0x7
    .byte   0x8
    .byte   0x90
    .byte   0x1
    .align 3
LECIE1:
LSFDE1:
    .set L$set$15,LEFDE1-LASFDE1
    .long L$set$15
LASFDE1:
    .long   LASFDE1-EH_frame1
    .quad   LFB1645-.
    .set L$set$16,LFE1645-LFB1645
    .quad L$set$16
    .byte   0x8
    .quad   0
    .byte   0x4
    .set L$set$17,LCFI0-LFB1645
    .long L$set$17
    .byte   0xe
    .byte   0x10
    .byte   0x8d
    .byte   0x2
    .byte   0x4
    .set L$set$18,LCFI1-LCFI0
    .long L$set$18
    .byte   0xe
    .byte   0x18
    .byte   0x8c
    .byte   0x3
    .byte   0x4
    .set L$set$19,LCFI2-LCFI1
    .long L$set$19
    .byte   0xe
    .byte   0x20
    .byte   0x86
    .byte   0x4
    .byte   0x4
    .set L$set$20,LCFI3-LCFI2
    .long L$set$20
    .byte   0xe
    .byte   0x28
    .byte   0x83
    .byte   0x5
    .byte   0x4
    .set L$set$21,LCFI4-LCFI3
    .long L$set$21
    .byte   0xe
    .byte   0x40
    .byte   0x4
    .set L$set$22,LCFI5-LCFI4
    .long L$set$22
    .byte   0xa
    .byte   0xe
    .byte   0x28
    .byte   0x4
    .set L$set$23,LCFI6-LCFI5
    .long L$set$23
    .byte   0xe
    .byte   0x20
    .byte   0x4
    .set L$set$24,LCFI7-LCFI6
    .long L$set$24
    .byte   0xe
    .byte   0x18
    .byte   0x4
    .set L$set$25,LCFI8-LCFI7
    .long L$set$25
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$26,LCFI9-LCFI8
    .long L$set$26
    .byte   0xe
    .byte   0x8
    .byte   0x4
    .set L$set$27,LCFI10-LCFI9
    .long L$set$27
    .byte   0xb
    .align 3
LEFDE1:
LSFDE3:
    .set L$set$28,LEFDE3-LASFDE3
    .long L$set$28
LASFDE3:
    .long   LASFDE3-EH_frame1
    .quad   LFB1425-.
    .set L$set$29,LFE1425-LFB1425
    .quad L$set$29
    .byte   0x8
    .quad   LLSDA1425-.
    .byte   0x4
    .set L$set$30,LCFI11-LFB1425
    .long L$set$30
    .byte   0xe
    .byte   0x10
    .byte   0x8f
    .byte   0x2
    .byte   0x4
    .set L$set$31,LCFI12-LCFI11
    .long L$set$31
    .byte   0xe
    .byte   0x18
    .byte   0x8e
    .byte   0x3
    .byte   0x4
    .set L$set$32,LCFI13-LCFI12
    .long L$set$32
    .byte   0xe
    .byte   0x20
    .byte   0x8d
    .byte   0x4
    .byte   0x4
    .set L$set$33,LCFI14-LCFI13
    .long L$set$33
    .byte   0xe
    .byte   0x28
    .byte   0x8c
    .byte   0x5
    .byte   0x4
    .set L$set$34,LCFI15-LCFI14
    .long L$set$34
    .byte   0xe
    .byte   0x30
    .byte   0x86
    .byte   0x6
    .byte   0x4
    .set L$set$35,LCFI16-LCFI15
    .long L$set$35
    .byte   0xe
    .byte   0x38
    .byte   0x83
    .byte   0x7
    .byte   0x4
    .set L$set$36,LCFI17-LCFI16
    .long L$set$36
    .byte   0xe
    .byte   0xa0,0x1
    .byte   0x4
    .set L$set$37,LCFI18-LCFI17
    .long L$set$37
    .byte   0xa
    .byte   0xe
    .byte   0x38
    .byte   0x4
    .set L$set$38,LCFI19-LCFI18
    .long L$set$38
    .byte   0xe
    .byte   0x30
    .byte   0x4
    .set L$set$39,LCFI20-LCFI19
    .long L$set$39
    .byte   0xe
    .byte   0x28
    .byte   0x4
    .set L$set$40,LCFI21-LCFI20
    .long L$set$40
    .byte   0xe
    .byte   0x20
    .byte   0x4
    .set L$set$41,LCFI22-LCFI21
    .long L$set$41
    .byte   0xe
    .byte   0x18
    .byte   0x4
    .set L$set$42,LCFI23-LCFI22
    .long L$set$42
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$43,LCFI24-LCFI23
    .long L$set$43
    .byte   0xe
    .byte   0x8
    .byte   0x4
    .set L$set$44,LCFI25-LCFI24
    .long L$set$44
    .byte   0xb
    .align 3
LEFDE3:
LSFDE5:
    .set L$set$45,LEFDE5-LASFDE5
    .long L$set$45
LASFDE5:
    .long   LASFDE5-EH_frame1
    .quad   LFB1626-.
    .set L$set$46,LFE1626-LFB1626
    .quad L$set$46
    .byte   0x8
    .quad   0
    .byte   0x4
    .set L$set$47,LCFI26-LFB1626
    .long L$set$47
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$48,LCFI27-LCFI26
    .long L$set$48
    .byte   0xe
    .byte   0x8
    .align 3
LEFDE5:
    .mod_init_func
    .align 3
    .quad   __GLOBAL__sub_I_test1.cpp
    .constructor
    .destructor
    .align 1
    .subsections_via_symbols

The assembly from your example is 31'559 lines, and trying to post it here or on Pastebin crashed my browser tab.

Still, I also wrote a C version of the above program:

#include <stdio.h>
#include <string.h>

#define NUM_PATTERNS 2

int main()
{
    const char *patterns[] = {"A", "D"};
    char *str = "ABCDEABCABD";
    unsigned int count = 0;

    for(int i = 0; i < NUM_PATTERNS; ++i)
    {
        for(char *s = str; ; )
        {
            s = strstr(s, patterns[i]);
            if(s == NULL)
            {
                break;
            }
            ++s;        // in order to not match again
            ++count;
        }
    }
    printf("%u\n", count);
    return 0;
}

Which compiles to a mere 95 lines of assembly:

    .cstring
LC0:
    .ascii "ABCDEABCABD\0"
LC1:
    .ascii "%u\12\0"
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB2:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB2:
    .align 4
    .globl _main
_main:
LFB1:
    pushq   %rbx
LCFI0:
    leaq    LC0(%rip), %rdi
    xorl    %ebx, %ebx
    jmp L3
    .align 4
L8:
    leaq    1(%rax), %rdi
    addl    $1, %ebx
L3:
    movl    $65, %esi
    call    _strchr
    testq   %rax, %rax
    jne L8
    leaq    LC0(%rip), %rdi
    jmp L2
    .align 4
L9:
    leaq    1(%rax), %rdi
    addl    $1, %ebx
L2:
    movl    $68, %esi
    call    _strchr
    testq   %rax, %rax
    jne L9
    leaq    LC1(%rip), %rdi
    movl    %ebx, %esi
    xorl    %eax, %eax
    call    _printf
    xorl    %eax, %eax
    popq    %rbx
LCFI1:
    ret
LFE1:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE2:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE2:
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
    .set L$set$0,LECIE1-LSCIE1
    .long L$set$0
LSCIE1:
    .long   0
    .byte   0x1
    .ascii "zR\0"
    .byte   0x1
    .byte   0x78
    .byte   0x10
    .byte   0x1
    .byte   0x10
    .byte   0xc
    .byte   0x7
    .byte   0x8
    .byte   0x90
    .byte   0x1
    .align 3
LECIE1:
LSFDE1:
    .set L$set$1,LEFDE1-LASFDE1
    .long L$set$1
LASFDE1:
    .long   LASFDE1-EH_frame1
    .quad   LFB1-.
    .set L$set$2,LFE1-LFB1
    .quad L$set$2
    .byte   0
    .byte   0x4
    .set L$set$3,LCFI0-LFB1
    .long L$set$3
    .byte   0xe
    .byte   0x10
    .byte   0x83
    .byte   0x2
    .byte   0x4
    .set L$set$4,LCFI1-LCFI0
    .long L$set$4
    .byte   0xe
    .byte   0x8
    .align 3
LEFDE1:
    .subsections_via_symbols

I suggest you transition at least that portion of the code to C. If you have a surrounding C++ code base, embedding it shouldn't be too much of a problem.

Either way, you can take one of the above examples, change str to be read from a file, change the patterns array to as many patterns as needed (make sure to set NUM_PATTERNS to the same amount!), and if that's still not enough of a performance improvement, implement the loop to be split across multiple threads (this is too broad of a topic though to cover for both C and C++ in this answer).

You can of course also parse the command line arguments into a pattern array:

#include <stdlib.h>
#include <string.h>

int numPatterns(char *str)
{
    int num = 1;
    for(int i = 0, len = strlen(str); i < len; ++i) // loop through all characters of the string
    {
        if(str[i] == '|')                           // increase the counter each time we hit a |
        {
            ++num;
        }
    }
    return num;
}

void splitPatterns(char *str, int num, char** dst)
{
    char *raw = (char*)(dst + num * sizeof(char*)); // this is the region after the pointers
    strcpy(raw, str);                               // first copy the string (note, this function is dangerous if used on anything other than strings! (google buffer overflow))
    dst[0] = raw;
    for(int i = 0, len = strlen(str), n = 1; i < len; ++i)
    {
        if(raw[i] == '|')
        {
            raw[i] = '\0';                          // replace all | by '\0' in order to split
            dst[n++] = &raw[i + 1];                 // and let the next string start at the next character
        }
    }
}

// let's say your program is called as ./program 'input_file' 'list|of|patterns'
int main(int argc, char **argv)
{
    /* check that argc >= 3 and blah */

    int num = numPatterns(argv[2]);
    // In order to malloc() and free() only once, I'm stashing things together here.
    // First comes an array of num pointers, hence num * sizeof(char*), followed by
    // the strings which take up the same space as argv[2] (the | just gets
    // replaced by \0), which equals strlen(argv[2]) + 1 for the terminating null byte.
    char **patterns = (char**)malloc(num * sizeof(char*) + strlen(argv[2]) + 1);
    splitPatterns(argv[2], num, patterns);

    /* work with patterns here */

    free(patterns);
}

If you compile the C samples here as C++, make sure to change all include directives from <*.h> to <c*>, i.e. <stdio.h> to <cstdio> etc.

Sign up to request clarification or add additional context in comments.

3 Comments

Thanks for the elaborate answer. I will study it carefully and try to implement it. Two questions (1) if the user supplies pattern as a pipe-seperated string in command line, how can I turn it into the *patterns[] array (sorry I'm very new at C/C++...) and (2) you're correct there is a lot of surrounding code. However, the code snip here is called through a function with one argument for the string s and one for the pattern. Could I have just this function in C and keep the rest in C++?
(2) sure, that works. Just make sure to change <stdio.h> and <string.h> to <cstdio> and <cstring> if you compile it as C++. (1) Updated my answer with an annotated example.
Got this working, thanks for your answer. Also used this post (stackoverflow.com/questions/16850992/…) to integrate C and C++.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.