Search code examples
c++ intel-ipp loop-unrolling

Unrolling nested loops c++


I'm trying to unroll a nested loop that stores data in a dynamically allocated 2D buffer in C++, but I'm not quite sure how to do it. Here is my original loop before unrolling:

int steps[1]; 
Ipp32f* vectx = ippiMalloc_32f_C1(size0, size1, &(steps[0])); 

for (int i = 0; i < size0; i++){
    for (int j = 0; j < size1; j++){
        Ipp32f* pointer = (Ipp32f*)((Ipp8u*)vectx + steps[0]*j + sizeof(Ipp32f)*i); 
        *pointer = datax[i]; 
    }
}

datax is an array of values; in my program size0 = 30 and size1 = 10000. I tried the following, but unfortunately the values stored at each position do not match the ones written by the original loop. Could someone help me?

// Attempted 4x4 unroll of the fill loop: the outer loop advances i by 4 and
// handles i, i+1, i+2 and i+3 in four separate inner loops, each of which
// advances j by 4 and issues four stores per iteration.
// NOTE(review): the outer condition only tests i < size0, so the i+1..i+3
// accesses below are not bounds-checked when size0 is not a multiple of 4.
for (int i = 0; i < size0; i+=4) {
     // Stores for index i.
     // `steps[0] * j + k` parses as (steps[0] * j) + k: the constants 0..3 are
     // added to the Ipp8u* offset after scaling by steps[0], not to j itself.
     for (int j = 0; j < size1; j+=4) {
        *((Ipp32f*)((Ipp8u*)vectx+ (steps[0] * j +0)+ (sizeof(Ipp32f) * i ))) = datax[i];
        *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i ))) = datax[i ];
        *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i ))) = datax[i ];
        *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i ))) = datax[i ];
     }
     // Stores for datax[i+1].
     // `sizeof(Ipp32f) * i+1` parses as (sizeof(Ipp32f) * i) + 1: the 1 is added
     // to the Ipp8u* offset after scaling by sizeof(Ipp32f), not to i.
     for (int j = 0; j < size1; j += 4) {
        *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
        *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
        *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
        *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
     }

     // Stores for datax[i+2]; the offsets parse the same way, as (sizeof(Ipp32f) * i) + 2.
     for (int j = 0; j < size1; j += 4) {
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i + 2))) = datax[i + 2];
    }
    // Stores for datax[i+3]; the offsets parse the same way, as (sizeof(Ipp32f) * i) + 3.
    for (int j = 0; j < size1; j += 4) {
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 0) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 2) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
         *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 3) + (sizeof(Ipp32f) * i + 3))) = datax[i + 3];
    }

} 

Solution

  • You are not taking operator precedence into account:

    // As written, `steps[0] * j + 1` is (steps[0] * j) + 1 and `sizeof(Ipp32f) * i+1` is (sizeof(Ipp32f) * i) + 1.
    *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * j + 1) + (sizeof(Ipp32f) * i+1))) = datax[i+1];
                                            ^^^^^^--here               ^^^--and here
    

    You should add parentheses so that the offsets are added to the indices before the multiplication:

    // Parenthesized so the unroll offset is added to j and to i before scaling by steps[0] and sizeof(Ipp32f).
    *((Ipp32f*)((Ipp8u*)vectx + (steps[0] * (j + 1)) + (sizeof(Ipp32f) * (i+1)))) = datax[i+1];
                                            ^^^^^^                        ^^^
    

    You should apply the same change to every unrolled statement, not just this one.

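    Also, since size0 = 30, unrolling the outer loop by 4 goes out of bounds during its last iteration: at i = 28 the code accesses indices i + 2 = 30 and i + 3 = 31, which are past size0. Use an unroll factor that divides size0 evenly, such as 5 or 6, or handle the leftover iterations in a separate remainder loop.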