I want to optimize my code by using unrolling loop. I tried to apply unrolling but I think I cannot do it and I cannot see my problem. I want to apply unrolling loop to outer loop.
This loops do transpose of matrix.
This is my loop to apply unrolling loop:
void transpose(int dim, int *src, int *dst) {
for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++)
dst[j * dim + i] = src[i * dim + j];
}
This is my unrolling loop:
void transpose(int dim, int *src, int *dst) {
int i = 0, j = 0, dimi = 0, dimj = 0, tempi = 0;
for (i = 0; i < dim; i += 8) {
for (j = 0; j < dim; j++) {
dimj = j * dim + i;
dimi = i * dim + j;
dst[dimj] = src[dimi];
tempi = i + 1;
if (tempi < dim) {
dimj = j * dim + tempi;
dimi = tempi * dim + j;
dst[dimj] = src[dimi];
tempi += 1;
if (tempi < dim) {
dimj = j * dim + tempi;
dimi = tempi * dim + j;
dst[dimj] = src[dimi];
tempi += 1;
if (tempi < dim) {
dimj = j * dim + tempi;
dimi = tempi * dim + j;
dst[dimj] = src[dimi];
tempi += 1;
if (tempi < dim) {
dimj = j * dim + tempi;
dimi = tempi * dim + j;
dst[dimj] = src[dimi];
tempi += 1;
if (tempi < dim) {
dimj = j * dim + tempi;
dimi = tempi * dim + j;
dst[dimj] = src[dimi];
tempi += 1;
if (tempi < dim) {
dimj = j * dim + tempi;
dimi = tempi * dim + j;
dst[dimj] = src[dimi];
tempi += 1;
if (tempi < dim) {
dimj = j * dim + tempi;
dimi = tempi * dim + j;
dst[dimj] = src[dimi];
}
}
}
}
}
}
}
}
}
}
I'm not sure what the error in your current code is but here is another approach.
void transpose(int dim, int *src, int *dst) {
int i, j;
for (i = 0; i <= dim-8; i += 8)
{
for (j = 0; j < dim; j++)
{
dst[j * dim + (i+0)] = src[(i+0) * dim + j];
dst[j * dim + (i+1)] = src[(i+1) * dim + j];
dst[j * dim + (i+2)] = src[(i+2) * dim + j];
dst[j * dim + (i+3)] = src[(i+3) * dim + j];
dst[j * dim + (i+4)] = src[(i+4) * dim + j];
dst[j * dim + (i+5)] = src[(i+5) * dim + j];
dst[j * dim + (i+6)] = src[(i+6) * dim + j];
dst[j * dim + (i+7)] = src[(i+7) * dim + j];
}
}
// Use the normal loop for any remaining elements
for (; i < dim; i++)
for (j = 0; j < dim; j++)
dst[j * dim + i] = src[i * dim + j];
}
Notice: The number of multiplication can be reduced by introducing a variable like:
int jdim = j * dim + i;
dst[jdim + 0] = ...
dst[jdim + 1] = ...
...
dst[jdim + 7] = ...
and likewise for the RHS.