Search code examples
cfor-looploop-unrolling

Nested Loop Unrolling in C


I want to optimize my code by using unrolling loop. I tried to apply unrolling but I think I cannot do it and I cannot see my problem. I want to apply unrolling loop to outer loop.

This loops do transpose of matrix.

This is my loop to apply unrolling loop:

void transpose(int dim, int *src, int *dst) {
    for (i = 0; i < dim; i++)
        for (j = 0; j < dim; j++)
            dst[j * dim + i] = src[i * dim + j];
}

This is my unrolling loop:

void transpose(int dim, int *src, int *dst) {
    int i = 0, j = 0, dimi = 0, dimj = 0, tempi = 0;

    for (i = 0; i < dim; i += 8) {
        for (j = 0; j < dim; j++) {
            dimj = j * dim + i;
            dimi = i * dim + j;
            dst[dimj] = src[dimi];

            tempi = i + 1;
            if (tempi < dim) {
                dimj = j * dim + tempi;
                dimi = tempi * dim + j;
                dst[dimj] = src[dimi];

                tempi += 1;
                if (tempi < dim) {
                    dimj = j * dim + tempi;
                    dimi = tempi * dim + j;
                    dst[dimj] = src[dimi];

                    tempi += 1;
                    if (tempi < dim) {
                        dimj = j * dim + tempi;
                        dimi = tempi * dim + j;
                        dst[dimj] = src[dimi];

                        tempi += 1;
                        if (tempi < dim) {
                            dimj = j * dim + tempi;
                            dimi = tempi * dim + j;
                            dst[dimj] = src[dimi];

                            tempi += 1;
                            if (tempi < dim) {
                                dimj = j * dim + tempi;
                                dimi = tempi * dim + j;
                                dst[dimj] = src[dimi];

                                tempi += 1;
                                if (tempi < dim) {
                                    dimj = j * dim + tempi;
                                    dimi = tempi * dim + j;
                                    dst[dimj] = src[dimi];

                                    tempi += 1;
                                    if (tempi < dim) {
                                        dimj = j * dim + tempi;
                                        dimi = tempi * dim + j;
                                        dst[dimj] = src[dimi];
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

Solution

  • I'm not sure what the error in your current code is but here is another approach.

    void transpose(int dim, int *src, int *dst) {
        int i, j;
    
        for (i = 0; i <= dim-8; i += 8)
        {
            for (j = 0; j < dim; j++)
            {
                    dst[j * dim + (i+0)] = src[(i+0) * dim + j];
                    dst[j * dim + (i+1)] = src[(i+1) * dim + j];
                    dst[j * dim + (i+2)] = src[(i+2) * dim + j];
                    dst[j * dim + (i+3)] = src[(i+3) * dim + j];
                    dst[j * dim + (i+4)] = src[(i+4) * dim + j];
                    dst[j * dim + (i+5)] = src[(i+5) * dim + j];
                    dst[j * dim + (i+6)] = src[(i+6) * dim + j];
                    dst[j * dim + (i+7)] = src[(i+7) * dim + j];
            }
        }
    
        // Use the normal loop for any remaining elements   
        for (; i < dim; i++)
            for (j = 0; j < dim; j++)
                dst[j * dim + i] = src[i * dim + j];
    }
    

    Notice: The number of multiplication can be reduced by introducing a variable like:

    int jdim = j * dim + i;
    dst[jdim + 0] = ...
    dst[jdim + 1] = ...
    ...
    dst[jdim + 7] = ...
    

    and likewise for the RHS.