5

Im modding an open-source project(powder toy) and always compiling with /O2 (maximize speed) option, SSE2 code generation enabled and just inspected this:

void membwand(void * destv, void * srcv, size_t destsize, size_t srcsize)
{
    size_t i;
    unsigned char * dest = (unsigned char*)destv;
    unsigned char * src = (unsigned char*)srcv;
    for(i = 0; i < destsize; i++){
        dest[i] = dest[i] & src[i%srcsize];
    }
}

Here I replaced below lines

membwand(gravy, gravmask, size_dst, size_src);

membwand(gravx, gravmask, size_dst, size_src);

    // gravy, gravx and gravmask are 1MB each

with

membwand2(gravy,gravx, gravmask, size_dst, size_src);   
    // sizes are same, why not put all in a single function?

which is implemented as:

void membwand2(void * destv1, void * destv2,void * srcv, size_t destsize, size_t srcsize)
{
    if(destsize!=srcsize)
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;
        unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        for(i = 0; i < destsize; i++)
        {
            dest1[i] = dest1[i] & src[i%srcsize];
            dest2[i] = dest2[i] & src[i%srcsize];
        }
    }
    else
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        unsigned char tmpChar0=0;unsigned char tmpChar1=0;unsigned char tmpChar2=0;
        unsigned char tmpChar3=0;unsigned char tmpChar4=0;unsigned char tmpChar5=0;
        unsigned char tmpChar6=0;unsigned char tmpChar7=0;unsigned char tmpChar8=0;
        unsigned char tmpChar9=0;unsigned char tmpChar10=0;unsigned char tmpChar11=0;
        unsigned char tmpChar12=0;unsigned char tmpChar13=0;unsigned char tmpChar14=0;
        unsigned char tmpChar15=0;unsigned char tmpChar16=0;unsigned char tmpChar17=0;
        unsigned char tmpChar18=0;unsigned char tmpChar19=0;unsigned char tmpChar20=0;
        unsigned char tmpChar21=0;unsigned char tmpChar22=0;unsigned char tmpChar23=0;
        unsigned char tmpChar24=0;unsigned char tmpChar25=0;unsigned char tmpChar26=0;
        unsigned char tmpChar27=0;unsigned char tmpChar28=0;unsigned char tmpChar29=0;
        unsigned char tmpChar30=0;unsigned char tmpChar31=0;
        for(i = 0; i < destsize; i+=32)
        {
            tmpChar0=src[i];tmpChar1=src[i+1];tmpChar2=src[i+2];tmpChar3=src[i+3];
            tmpChar4=src[i+4];tmpChar5=src[i+5];tmpChar6=src[i+6];tmpChar7=src[i+7];

            tmpChar8=src[i+8];tmpChar9=src[i+9];tmpChar10=src[i+10];tmpChar11=src[i+11];
            tmpChar12=src[i+12];tmpChar13=src[i+13];tmpChar14=src[i+14];tmpChar15=src[i+15];

            tmpChar16=src[i+16];tmpChar17=src[i+17];tmpChar18=src[i+18];tmpChar19=src[i+19];
            tmpChar20=src[i+20];tmpChar21=src[i+21];tmpChar22=src[i+22];tmpChar23=src[i+23];

            tmpChar24=src[i+24];tmpChar25=src[i+25];tmpChar26=src[i+26];tmpChar27=src[i+27];
            tmpChar28=src[i+28];tmpChar29=src[i+29];tmpChar30=src[i+30];tmpChar31=src[i+31];


            dest1[i] = dest1[i] & tmpChar0;
            dest1[i+1] = dest1[i+1] & tmpChar1;
            dest1[i+2] = dest1[i+2] & tmpChar2;
            dest1[i+3] = dest1[i+3] & tmpChar3;
            dest1[i+4] = dest1[i+4] & tmpChar4;
            dest1[i+5] = dest1[i+5] & tmpChar5;
            dest1[i+6] = dest1[i+6] & tmpChar6;
            dest1[i+7] = dest1[i+7] & tmpChar7;
            dest1[i+8] = dest1[i+8] & tmpChar8;
            dest1[i+9] = dest1[i+9] & tmpChar9;
            dest1[i+10] = dest1[i+10] & tmpChar10;
            dest1[i+11] = dest1[i+11] & tmpChar11;
            dest1[i+12] = dest1[i+12] & tmpChar12;
            dest1[i+13] = dest1[i+13] & tmpChar13;
            dest1[i+14] = dest1[i+14] & tmpChar14;
            dest1[i+15] = dest1[i+15] & tmpChar15;
            dest1[i+16] = dest1[i+16] & tmpChar16;
            dest1[i+17] = dest1[i+17] & tmpChar17;
            dest1[i+18] = dest1[i+18] & tmpChar18;
            dest1[i+19] = dest1[i+19] & tmpChar19;
            dest1[i+20] = dest1[i+20] & tmpChar20;
            dest1[i+21] = dest1[i+21] & tmpChar21;
            dest1[i+22] = dest1[i+22] & tmpChar22;
            dest1[i+23] = dest1[i+23] & tmpChar23;
            dest1[i+24] = dest1[i+24] & tmpChar24;
            dest1[i+25] = dest1[i+25] & tmpChar25;
            dest1[i+26] = dest1[i+26] & tmpChar26;
            dest1[i+27] = dest1[i+27] & tmpChar27;
            dest1[i+28] = dest1[i+28] & tmpChar28;
            dest1[i+29] = dest1[i+29] & tmpChar29;
            dest1[i+30] = dest1[i+30] & tmpChar30;
            dest1[i+31] = dest1[i+31] & tmpChar31;

            dest2[i] = dest2[i] & tmpChar0;
            dest2[i+1] = dest2[i+1] & tmpChar1;
            dest2[i+2] = dest2[i+2] & tmpChar2;
            dest2[i+3] = dest2[i+3] & tmpChar3;
            dest2[i+4] = dest2[i+4] & tmpChar4;
            dest2[i+5] = dest2[i+5] & tmpChar5;
            dest2[i+6] = dest2[i+6] & tmpChar6;
            dest2[i+7] = dest2[i+7] & tmpChar7;
            dest2[i+8] = dest2[i+8] & tmpChar8;
            dest2[i+9] = dest2[i+9] & tmpChar9;
            dest2[i+10] = dest2[i+10] & tmpChar10;
            dest2[i+11] = dest2[i+11] & tmpChar11;
            dest2[i+12] = dest2[i+12] & tmpChar12;
            dest2[i+13] = dest2[i+13] & tmpChar13;
            dest2[i+14] = dest2[i+14] & tmpChar14;
            dest2[i+15] = dest2[i+15] & tmpChar15;
            dest2[i+16] = dest2[i+16] & tmpChar16;
            dest2[i+17] = dest2[i+17] & tmpChar17;
            dest2[i+18] = dest2[i+18] & tmpChar18;
            dest2[i+19] = dest2[i+19] & tmpChar19;
            dest2[i+20] = dest2[i+20] & tmpChar20;
            dest2[i+21] = dest2[i+21] & tmpChar21;
            dest2[i+22] = dest2[i+22] & tmpChar22;
            dest2[i+23] = dest2[i+23] & tmpChar23;
            dest2[i+24] = dest2[i+24] & tmpChar24;
            dest2[i+25] = dest2[i+25] & tmpChar25;
            dest2[i+26] = dest2[i+26] & tmpChar26;
            dest2[i+27] = dest2[i+27] & tmpChar27;
            dest2[i+28] = dest2[i+28] & tmpChar28;
            dest2[i+29] = dest2[i+29] & tmpChar29;
            dest2[i+30] = dest2[i+30] & tmpChar30;
            dest2[i+31] = dest2[i+31] & tmpChar31;


        }
    }
}

Omp wall clock timings were 17ms for two-lines version and 1.5ms for one-line version. (after several thousand iterations)

Question: What could have cause the 9x speed-up? Data re-use of src[] or cache-utilization? Maybe code was not fit for vectorizing before, and unrolling enabled it, could it be the missing modulus operation for indexing? Should I add #pragma omp on top of loop?

Edit: #pragma omp parallel for num_threads(4) made it worse somehow. Maybe just 1MB is not hiding multithread overhead?

Edit2: ommiting the modulus operator made it 2.5 ms but the performance increase from 2.5ms to 1.5ms (~%60 increase) must have come from unrolling/cache/...

Note: enabling "favor fast code", enabling whole program optimization, full optimization(/Ox) and enabling intrinsics did not change the speed(maybe enabling SSE2 and /O2 was enough)

CPU: FX8150 IDE: Msvc

15
  • 1
    You can check auto vectorization with /Qvec-report:1 or /Qvec-report:2. Commented Feb 13, 2014 at 12:12
  • 1
    You are not comparing apples to apples. To fairly compare the two functions the two line version should also have a branch when destsize==srcsize. Commented Feb 13, 2014 at 12:18
  • 1
    @huseyintugrulbuyukisik Fair enough, but without knowing the structure definition of the data you are moving its hard to tell. But I expect that your compiler has already done its utmost to align the data for fastest possible access. But if you are delving into micro optimizations then you should be considering that as well. Commented Feb 13, 2014 at 12:50
  • 1
    I hope you are comparing apple to apple, meaning the time it takes for both function calls for membwand (....), compared to membwand2(...) because you need to check the time it takes to move the same amount of data Commented Feb 13, 2014 at 12:51
  • 1
    [OT] As destsize seems to be a multiple of 32, if alignment allows it, you may work with uint32_t (or uint64_t) directly... Commented Feb 13, 2014 at 12:58

2 Answers 2

3

I think loop unrolling is the booster.You may google it for detailed info.

Of course modulus operation can be slow..You may test it by replacing

src[i%srcsize] 

with

src[i] 

for the sake of timing tests.

Sign up to request clarification or add additional context in comments.

1 Comment

Just omitting modulus made it 2.5ms for wall timing. So the 2.5ms-1.5ms difference must have caused by unrolling and other things. Thanks.
2

there are several thing to consider here when comparing the two functions; from a high level perspective it looks as though the 2 line version should run faster than the 1 line version, however:

- the 1 line version has better instruction cache locatity
- the 1 line version puts less pressure on the registers( this is important 
 because if the compiler runs out of registers to use it may start using the 
 stack to move parameters around for the instructions it needs - which causes 
 an overhead)

As a optimization tip:

- you can try and use the "restrict"   keyword: **void membwand(void * _restrict_
destv, void *  _restrict_ srcv, size_t destsize, size_t srcsize);** This tells the
compiler that the two pointers are not aliasing - they are will never point at the
same memory location so the read/write instruction can be executed in the same cycle.

2 Comments

Tried _ _ restrict affter void * and it did not change performance. Maybe I need to use int rather than char?
no, data type has nothing to do with it; it's only a manner of passing extra info to the compiler

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.