I have implemented a simple n x n matrix multiplication to test some performance tunings in C with OpenMP. My initial code is the following:
#pragma omp parallel for shared(a,b,c) private(h,i,j,k)
for( i = 0; i < n; i++ )
{
    for( j = 0; j < n; j++ )
    {
        for( k = 0; k < n; k++ )
        {
            a[i*n+j] += b[i*n+k] * c[k*n+j];
        }
    }
}
The compiler interchanges the j and k loops, so I get the ikj algorithm. The first thing I wanted to implement was padding, to align each row to 64 bytes for aligned cache access. Therefore I calculated the additional size needed for each row. With a row length of 5900, the new padded length is 5904 (reference: ISBN 9780124104143). My new code is the following:
#pragma omp parallel for shared(a,b,c) private(h,i,j,k)
for( i = 0; i < n; i++ )
{
    for( k = 0; k < n; k++ )
    {
        #pragma simd
        #pragma unroll(4)
        for( j = 0; j < n; j++ )
        {
            a[i*pn+j] += b[i*pn+k] * c[k*pn+j];
        }
    }
}
where pn is the new, padded row length. I had to permute the loops manually because the compiler refused to do it. Running this code on a regular Xeon processor I get nearly the same performance as before, maybe a little better, which is about what I expected. But when I run the code on a Xeon Phi, it reaches only about 1/10 of the performance of the initial code.
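For reference, this is how the padded row length is computed. It is just a minimal standalone sketch of the padding arithmetic, assuming double elements and 64-byte cache lines; the helper name padded_row_length is only for illustration, the complete code further down uses an equivalent macro:

#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64  /* bytes per cache line */

/* Round a row of n doubles up to the next multiple of a cache line
   and return the padded length in elements. */
static size_t padded_row_length(size_t n)
{
    size_t row_bytes    = n * sizeof(double);
    size_t padded_bytes = ((row_bytes + CACHE_LINE - 1) / CACHE_LINE) * CACHE_LINE;
    return padded_bytes / sizeof(double);
}

int main(void)
{
    printf("%zu\n", padded_row_length(5900)); /* prints 5904 */
    return 0;
}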
After looking further at the compiler report, I noticed that the inner loop no longer gets unrolled and vectorized. So I added the following #pragmas:
#pragma simd
#pragma unroll
The vectorization now works fine, but the remainder loop doesn't get unrolled. The performance is much better, but still only about 1/2 of the normal version.
Here is the compiler (-O3) output of the normal code:
LOOP BEGIN at mm_par.c(75,3)
remark #25444: Loopnest Interchanged: ( 1 2 3 ) --> ( 1 3 2 )
LOOP BEGIN at mm_par.c(79,5)
remark #25460: No loop optimizations reported
LOOP BEGIN at mm_par.c(77,4)
remark #15301: PERMUTED LOOP WAS VECTORIZED
LOOP END
LOOP BEGIN at mm_par.c(77,4)
<Remainder>
remark #25436: completely unrolled by 4
LOOP END
LOOP END
LOOP END
And here is the output for the padded version with the simd and unroll pragmas:
LOOP BEGIN at mm_ali.c(76,3)
remark #25460: No loop optimizations reported
LOOP BEGIN at mm_ali.c(78,4)
remark #25460: No loop optimizations reported
LOOP BEGIN at mm_ali.c(82,10)
remark #15301: SIMD LOOP WAS VECTORIZED
LOOP END
LOOP END
LOOP END
So the unrolling gets ignored. Is there a way to force it? I also wonder whether that is the only reason for the poor performance.
Edit: The assembly for the fast matrix multiplication without padding looks like this:
vmovapd c(%r15,%rbx,8), %zmm1 #81.28 c1
vprefetche1 2048+a(%r11,%rbx,8) #81.6 c5
vmovapd 64+c(%r15,%rbx,8), %zmm3 #81.28 c9
vprefetch0 768+a(%r11,%rbx,8) #81.6 c13
vmovapd 128+c(%r15,%rbx,8), %zmm4 #81.28 c17
vprefetch1 2048+c(%r15,%rbx,8) #81.28 c21
vmovapd 192+c(%r15,%rbx,8), %zmm5 #81.28 c25
vprefetch0 768+c(%r15,%rbx,8) #81.28 c29
vfmadd213pd a(%r11,%rbx,8), %zmm0, %zmm1 #81.6 c33
vprefetche1 2112+a(%r11,%rbx,8) #81.6 c37
vfmadd213pd 64+a(%r11,%rbx,8), %zmm0, %zmm3 #81.6 c41
vprefetch0 832+a(%r11,%rbx,8) #81.6 c45
vfmadd213pd 128+a(%r11,%rbx,8), %zmm0, %zmm4 #81.6 c49
vprefetch1 2112+c(%r15,%rbx,8) #81.28 c53
vfmadd213pd 192+a(%r11,%rbx,8), %zmm0, %zmm5 #81.6 c57
vprefetch0 832+c(%r15,%rbx,8) #81.28 c61
vmovaps %zmm1, a(%r11,%rbx,8) #81.6 c65
vprefetche1 2176+a(%r11,%rbx,8) #81.6 c69
vmovaps %zmm3, 64+a(%r11,%rbx,8) #81.6 c73
vprefetch0 896+a(%r11,%rbx,8) #81.6 c77
vmovaps %zmm4, 128+a(%r11,%rbx,8) #81.6 c81
vprefetch1 2176+c(%r15,%rbx,8) #81.28 c85
vmovaps %zmm5, 192+a(%r11,%rbx,8) #81.6 c89
vprefetch0 896+c(%r15,%rbx,8) #81.28 c93
vprefetche1 2240+a(%r11,%rbx,8) #81.6 c97
vprefetch0 960+a(%r11,%rbx,8) #81.6 c101
vprefetch1 2240+c(%r15,%rbx,8) #81.28 c105
vprefetch0 960+c(%r15,%rbx,8) #81.28 c109
addq $32, %rbx #77.4 c113
cmpq %rsi, %rbx #77.4 c117
jb ..B1.51 # Prob 99% #77.4 c117
The one for the slow multiplication with padding looks like this:
vloadunpackld (%rbx), %zmm0 #83.6 c1
addl $32, %r15d #81.10 c1
vprefetch1 2048+c(%rcx) #83.30 c5
vloadunpackhd 64(%rbx), %zmm0 #83.6 c9
addq $256, %rbx #81.10 c9
vprefetch0 512+c(%rcx) #83.30 c13
vbroadcastsd b(%r12,%r13,8), %zmm2 #83.18 c17
vprefetch1 2112+c(%rcx) #83.30 c21
vfmadd132pd c(%rcx), %zmm0, %zmm2 #83.6 c25
vprefetch0 576+c(%rcx) #83.30 c29
vpackstoreld %zmm2, (%rsi) #83.6 c33
vprefetch1 2176+c(%rcx) #83.30 c37
vpackstorehd %zmm2, 64(%rsi) #83.6 c41
addq $256, %rsi #81.10 c41
vprefetch0 640+c(%rcx) #83.30 c45
vloadunpackld (%rdi), %zmm3 #83.6 c49
vprefetch1 2240+c(%rcx) #83.30 c53
vloadunpackhd 64(%rdi), %zmm3 #83.6 c57
addq $256, %rdi #81.10 c57
vprefetch0 704+c(%rcx) #83.30 c61
vbroadcastsd b(%r12,%r13,8), %zmm4 #83.18 c65
vfmadd132pd 64+c(%rcx), %zmm3, %zmm4 #83.6 c69
nop #83.6 c73
vpackstoreld %zmm4, (%r8) #83.6 c77
vpackstorehd %zmm4, 64(%r8) #83.6 c81
addq $256, %r8 #81.10 c81
vloadunpackld (%r9), %zmm5 #83.6 c85
vloadunpackhd 64(%r9), %zmm5 #83.6 c89
addq $256, %r9 #81.10 c89
vbroadcastsd b(%r12,%r13,8), %zmm6 #83.18 c93
vfmadd132pd 128+c(%rcx), %zmm5, %zmm6 #83.6 c97
nop #83.6 c101
vpackstoreld %zmm6, (%r10) #83.6 c105
vpackstorehd %zmm6, 64(%r10) #83.6 c109
addq $256, %r10 #81.10 c109
vloadunpackld (%r11), %zmm7 #83.6 c113
vloadunpackhd 64(%r11), %zmm7 #83.6 c117
addq $256, %r11 #81.10 c117
vbroadcastsd b(%r12,%r13,8), %zmm8 #83.18 c121
vfmadd132pd 192+c(%rcx), %zmm7, %zmm8 #83.6 c125
addq $256, %rcx #81.10 c129
movb %al, %al #83.6 c129
vpackstoreld %zmm8, (%rdx) #83.6 c133
vpackstorehd %zmm8, 64(%rdx) #83.6 c137
addq $256, %rdx #81.10 c137
cmpl $5888, %r15d #81.10 c141
jb ..B1.42 # Prob 99% #81.10 c141
Here is the complete code of my solution. Again, if I replace pn with n, the performance is more than twice as fast.
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
#define n 5900
#define pn ((((n*sizeof(double))+63)/64)*(64/sizeof(double))) //padding
#define threadNum 144
#define loops 1
double dtime()
{
    double tseconds = 0.0;
    struct timeval mytime;
    gettimeofday(&mytime, (struct timezone*)0);
    tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6);
    return tseconds;
}
double a[n*pn] __attribute__((aligned(64)));
double b[n*pn] __attribute__((aligned(64)));
double c[n*pn] __attribute__((aligned(64)));
int main(int argc, char **argv)
{
    int threadNumber, loopNumber;
    if(argc == 3)
    {
        threadNumber = atoi(argv[1]);
        loopNumber = atoi(argv[2]);
    } else
    {
        threadNumber = threadNum;
        loopNumber = loops;
    }
    double tstart, tstop, ttime;
    int i,j,k,h;
    // initialize matrices
    #pragma omp parallel for
    for(i = 0; i < pn*n; i++)
    {
        a[i] = 0.0;
        b[i] = 2.0;
        c[i] = 2.0;
    }
    omp_set_num_threads(threadNumber);
    tstart = dtime();
    // parallelize via OpenMP on MIC
    for(h = 0; h < loopNumber; h++){
        #pragma omp parallel for shared(a,b,c) private(h,i,j,k)
        for( i = 0; i < n; i++ )
        {
            for( k = 0; k < n; k++ )
            {
                #pragma omp simd aligned( a, b, c: 64 )
                for( j = 0; j < n; j++ )
                {
                    a[i*pn+j] += b[i*pn+k] * c[k*pn+j];
                }
            }
        }
    }
    tstop = dtime();
    double elapsed = tstop - tstart;
    double mFlops = ((double)n)*n*n*2.0*loopNumber/elapsed*1.0e-06;
    #pragma omp parallel
    #pragma omp master
    printf("%d %.3f\n", omp_get_num_threads(), mFlops);
    return 0;
}
The code snippet you gave being too small to allow for proper testing, I can only suggest a solution without testing it properly, so it might fail miserably. Anyway, here goes nothing.
You said that you did pad the data to align the rows, but, AFAICS, this information is never transmitted to the compiler, which therefore cannot exploit it. For this you have various solutions, but since you have OpenMP directives already for parallelisation, using some more for vectorisation seems like the obvious choice.
So that would give you something like this:
#pragma omp parallel for private( i, j, k )
for( i = 0; i < n; i++ ) {
double *aa = &a[i * pn];
double *bb = &b[i * pn];
for( k = 0; k < n; k++ ) {
double *cc = &c[k * pn];
#pragma omp simd aligned( aa, bb, cc: 64 )
for( j = 0; j < n; j++) {
aa[j] += bb[k] * cc[j];
}
}
}
Here I assumed your arrays were double, so you might need to change that if they weren't. Aside from this, I think the idea is there. I don't know if it will work for you, but it is definitely worth a try.
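If you are using the Intel compiler (which your pragmas and vectorisation reports suggest), another variant worth trying, purely as an untested sketch along the same lines as the code above, is to assert the alignment of the row pointers with the Intel-specific __assume_aligned intrinsic instead of the OpenMP aligned clause:

#pragma omp parallel for private( i, j, k )
for( i = 0; i < n; i++ ) {
    double *aa = &a[i * pn];
    double *bb = &b[i * pn];
    for( k = 0; k < n; k++ ) {
        double *cc = &c[k * pn];
        /* Intel-specific: promise the compiler these row pointers are
           64-byte aligned (bb[k] is a scalar, so its alignment does not
           matter for vectorisation). gcc/clang would use
           __builtin_assume_aligned instead. */
        __assume_aligned(aa, 64);
        __assume_aligned(cc, 64);
        #pragma simd
        for( j = 0; j < n; j++ ) {
            aa[j] += bb[k] * cc[j];
        }
    }
}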