I have implemented a simple n x n matrix multiplication to test some performance tunings in C with OpenMP. My initial code is the following:
#pragma omp parallel for shared(a,b,c) private(h,i,j,k)
for( i = 0; i < n; i++ )
{
    for( j = 0; j < n; j++ )
    {
        for( k = 0; k < n; k++ )
        {
            a[i*n+j] += b[i*n+k] * c[k*n+j];
        }
    }
}
The compiler interchanges the j and k loops, so I get the ikj algorithm. The first thing I wanted to implement was padding, to align each row to 64 bytes for aligned cache access. Therefore I calculated the additional size needed for each row. With a row length of 5900, the new padded length is 5904 (reference: ISBN 9780124104143). My new code is the following:
#pragma omp parallel for shared(a,b,c) private(h,i,j,k)
for( i = 0; i < n; i++ )
{
    for( k = 0; k < n; k++ )
    {
        #pragma simd
        #pragma unroll(4)
        for( j = 0; j < n; j++ )
        {
            a[i*pn+j] += b[i*pn+k] * c[k*pn+j];
        }
    }
}
where pn is the new, padded row length. I had to permute the loops manually because the compiler refused to do it. Running this code on a regular Xeon processor I get nearly the same performance as before, maybe a little better, which is about what I expected. But when I run the code on a Xeon Phi, it reaches only about 1/10 of the performance of the initial code.
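For reference, this is how the padded row length is computed. It is just a minimal standalone sketch of the padding arithmetic, assuming double elements and 64-byte cache lines; the helper name padded_row_length is only for illustration, the complete code further down uses an equivalent macro:

#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64  /* bytes per cache line */

/* Round a row of n doubles up to the next multiple of a cache line
   and return the padded length in elements. */
static size_t padded_row_length(size_t n)
{
    size_t row_bytes    = n * sizeof(double);
    size_t padded_bytes = ((row_bytes + CACHE_LINE - 1) / CACHE_LINE) * CACHE_LINE;
    return padded_bytes / sizeof(double);
}

int main(void)
{
    printf("%zu\n", padded_row_length(5900)); /* prints 5904 */
    return 0;
}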
After looking further at the compiler report, I noticed that the inner loop no longer gets unrolled and vectorized. So I added the following #pragmas:
#pragma simd
#pragma unroll
The vectorization now works fine, but the remainder loop doesn't get unrolled. The performance is much better, but still only about 1/2 of the normal version.
Here is the compiler (-O3) output of the normal code:
LOOP BEGIN at mm_par.c(75,3)
remark #25444: Loopnest Interchanged: ( 1 2 3 ) --> ( 1 3 2 )
LOOP BEGIN at mm_par.c(79,5)
remark #25460: No loop optimizations reported
LOOP BEGIN at mm_par.c(77,4)
remark #15301: PERMUTED LOOP WAS VECTORIZED
LOOP END
LOOP BEGIN at mm_par.c(77,4)
<Remainder>
remark #25436: completely unrolled by 4
LOOP END
LOOP END
LOOP END
And here is the output for the padded version with the simd and unroll pragmas:
LOOP BEGIN at mm_ali.c(76,3)
remark #25460: No loop optimizations reported
LOOP BEGIN at mm_ali.c(78,4)
remark #25460: No loop optimizations reported
LOOP BEGIN at mm_ali.c(82,10)
remark #15301: SIMD LOOP WAS VECTORIZED
LOOP END
LOOP END
LOOP END
So the unrolling gets ignored. Is there a way to force it? I also wonder whether that is the only reason for the poor performance.
Edit: The assembly for the fast matrix multiplication without padding looks like this:
vmovapd c(%r15,%rbx,8), %zmm1 #81.28 c1
vprefetche1 2048+a(%r11,%rbx,8) #81.6 c5
vmovapd 64+c(%r15,%rbx,8), %zmm3 #81.28 c9
vprefetch0 768+a(%r11,%rbx,8) #81.6 c13
vmovapd 128+c(%r15,%rbx,8), %zmm4 #81.28 c17
vprefetch1 2048+c(%r15,%rbx,8) #81.28 c21
vmovapd 192+c(%r15,%rbx,8), %zmm5 #81.28 c25
vprefetch0 768+c(%r15,%rbx,8) #81.28 c29
vfmadd213pd a(%r11,%rbx,8), %zmm0, %zmm1 #81.6 c33
vprefetche1 2112+a(%r11,%rbx,8) #81.6 c37
vfmadd213pd 64+a(%r11,%rbx,8), %zmm0, %zmm3 #81.6 c41
vprefetch0 832+a(%r11,%rbx,8) #81.6 c45
vfmadd213pd 128+a(%r11,%rbx,8), %zmm0, %zmm4 #81.6 c49
vprefetch1 2112+c(%r15,%rbx,8) #81.28 c53
vfmadd213pd 192+a(%r11,%rbx,8), %zmm0, %zmm5 #81.6 c57
vprefetch0 832+c(%r15,%rbx,8) #81.28 c61
vmovaps %zmm1, a(%r11,%rbx,8) #81.6 c65
vprefetche1 2176+a(%r11,%rbx,8) #81.6 c69
vmovaps %zmm3, 64+a(%r11,%rbx,8) #81.6 c73
vprefetch0 896+a(%r11,%rbx,8) #81.6 c77
vmovaps %zmm4, 128+a(%r11,%rbx,8) #81.6 c81
vprefetch1 2176+c(%r15,%rbx,8) #81.28 c85
vmovaps %zmm5, 192+a(%r11,%rbx,8) #81.6 c89
vprefetch0 896+c(%r15,%rbx,8) #81.28 c93
vprefetche1 2240+a(%r11,%rbx,8) #81.6 c97
vprefetch0 960+a(%r11,%rbx,8) #81.6 c101
vprefetch1 2240+c(%r15,%rbx,8) #81.28 c105
vprefetch0 960+c(%r15,%rbx,8) #81.28 c109
addq $32, %rbx #77.4 c113
cmpq %rsi, %rbx #77.4 c117
jb ..B1.51 # Prob 99% #77.4 c117
The one for the slow multiplication with padding looks like this:
vloadunpackld (%rbx), %zmm0 #83.6 c1
addl $32, %r15d #81.10 c1
vprefetch1 2048+c(%rcx) #83.30 c5
vloadunpackhd 64(%rbx), %zmm0 #83.6 c9
addq $256, %rbx #81.10 c9
vprefetch0 512+c(%rcx) #83.30 c13
vbroadcastsd b(%r12,%r13,8), %zmm2 #83.18 c17
vprefetch1 2112+c(%rcx) #83.30 c21
vfmadd132pd c(%rcx), %zmm0, %zmm2 #83.6 c25
vprefetch0 576+c(%rcx) #83.30 c29
vpackstoreld %zmm2, (%rsi) #83.6 c33
vprefetch1 2176+c(%rcx) #83.30 c37
vpackstorehd %zmm2, 64(%rsi) #83.6 c41
addq $256, %rsi #81.10 c41
vprefetch0 640+c(%rcx) #83.30 c45
vloadunpackld (%rdi), %zmm3 #83.6 c49
vprefetch1 2240+c(%rcx) #83.30 c53
vloadunpackhd 64(%rdi), %zmm3 #83.6 c57
addq $256, %rdi #81.10 c57
vprefetch0 704+c(%rcx) #83.30 c61
vbroadcastsd b(%r12,%r13,8), %zmm4 #83.18 c65
vfmadd132pd 64+c(%rcx), %zmm3, %zmm4 #83.6 c69
nop #83.6 c73
vpackstoreld %zmm4, (%r8) #83.6 c77
vpackstorehd %zmm4, 64(%r8) #83.6 c81
addq $256, %r8 #81.10 c81
vloadunpackld (%r9), %zmm5 #83.6 c85
vloadunpackhd 64(%r9), %zmm5 #83.6 c89
addq $256, %r9 #81.10 c89
vbroadcastsd b(%r12,%r13,8), %zmm6 #83.18 c93
vfmadd132pd 128+c(%rcx), %zmm5, %zmm6 #83.6 c97
nop #83.6 c101
vpackstoreld %zmm6, (%r10) #83.6 c105
vpackstorehd %zmm6, 64(%r10) #83.6 c109
addq $256, %r10 #81.10 c109
vloadunpackld (%r11), %zmm7 #83.6 c113
vloadunpackhd 64(%r11), %zmm7 #83.6 c117
addq $256, %r11 #81.10 c117
vbroadcastsd b(%r12,%r13,8), %zmm8 #83.18 c121
vfmadd132pd 192+c(%rcx), %zmm7, %zmm8 #83.6 c125
addq $256, %rcx #81.10 c129
movb %al, %al #83.6 c129
vpackstoreld %zmm8, (%rdx) #83.6 c133
vpackstorehd %zmm8, 64(%rdx) #83.6 c137
addq $256, %rdx #81.10 c137
cmpl $5888, %r15d #81.10 c141
jb ..B1.42 # Prob 99% #81.10 c141
Here is the complete code of my solution. Again, if I replace pn with n, the performance is more than twice as fast.
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
#define n 5900
#define pn ((((n*sizeof(double))+63)/64)*(64/sizeof(double))) //padding
#define threadNum 144
#define loops 1
double dtime()
{
    double tseconds = 0.0;
    struct timeval mytime;
    gettimeofday(&mytime, (struct timezone*)0);
    tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6);
    return tseconds;
}
double a[n*pn] __attribute__((aligned(64)));
double b[n*pn] __attribute__((aligned(64)));
double c[n*pn] __attribute__((aligned(64)));
int main(int argc, char **argv)
{
    int threadNumber, loopNumber;
    if(argc == 3)
    {
        threadNumber = atoi(argv[1]);
        loopNumber = atoi(argv[2]);
    } else
    {
        threadNumber = threadNum;
        loopNumber = loops;
    }
    double tstart, tstop, ttime;
    int i,j,k,h;
    // initialize matrices
    #pragma omp parallel for
    for(i = 0; i < pn*n; i++)
    {
        a[i] = 0.0;
        b[i] = 2.0;
        c[i] = 2.0;
    }
    omp_set_num_threads(threadNumber);
    tstart = dtime();
    // parallelize via OpenMP on MIC
    for(h = 0; h < loopNumber; h++){
        #pragma omp parallel for shared(a,b,c) private(h,i,j,k)
        for( i = 0; i < n; i++ )
        {
            for( k = 0; k < n; k++ )
            {
                #pragma omp simd aligned( a, b, c: 64 )
                for( j = 0; j < n; j++ )
                {
                    a[i*pn+j] += b[i*pn+k] * c[k*pn+j];
                }
            }
        }
    }
    tstop = dtime();
    double elapsed = tstop - tstart;
    double mFlops = ((double)n)*n*n*2.0*loopNumber/elapsed*1.0e-06;
    #pragma omp parallel
    #pragma omp master
    printf("%d %.3f\n", omp_get_num_threads(), mFlops);
    return 0;
}
The code snippet you gave being too small to allow for proper testing, I can only suggest a solution without testing it properly, so it might fail miserably. Anyway, here goes nothing.
You said that you did pad the data to align the rows, but, AFAICS, this information is never transmitted to the compiler, which therefore cannot exploit it. For this you have various solutions, but since you have OpenMP directives already for parallelisation, using some more for vectorisation seems like the obvious choice.
So that would give you something like this:
#pragma omp parallel for private( i, j, k )
for( i = 0; i < n; i++ ) {
double *aa = &a[i * pn];
double *bb = &b[i * pn];
for( k = 0; k < n; k++ ) {
double *cc = &c[k * pn];
#pragma omp simd aligned( aa, bb, cc: 64 )
for( j = 0; j < n; j++) {
aa[j] += bb[k] * cc[j];
}
}
}
Here I assumed your arrays were double, so you might need to change that if they weren't. Aside from this, I think the idea is there. I don't know if it will work for you, but it is definitely worth a try.
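If you are using the Intel compiler (which your pragmas and vectorisation reports suggest), another variant worth trying, purely as an untested sketch along the same lines as the code above, is to assert the alignment of the row pointers with the Intel-specific __assume_aligned intrinsic instead of the OpenMP aligned clause:

#pragma omp parallel for private( i, j, k )
for( i = 0; i < n; i++ ) {
    double *aa = &a[i * pn];
    double *bb = &b[i * pn];
    for( k = 0; k < n; k++ ) {
        double *cc = &c[k * pn];
        /* Intel-specific: promise the compiler these row pointers are
           64-byte aligned (bb[k] is a scalar, so its alignment does not
           matter for vectorisation). gcc/clang would use
           __builtin_assume_aligned instead. */
        __assume_aligned(aa, 64);
        __assume_aligned(cc, 64);
        #pragma simd
        for( j = 0; j < n; j++ ) {
            aa[j] += bb[k] * cc[j];
        }
    }
}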