Search code examples
copenaccpgipgi-accelerator

OpenACC: having a private array for every GPU thread


I am bringing a code to the GPU. This code has a kernel that makes use of a private array. This means that the array is declared inside the kernel loop.

When I port the code to OpenACC I get buggy results. To me, it looks like the array is shared between GPU vector threads and this cause several race conditions.

I organized the following example with also external calls because that's the way my original code looks like.

header.h:

#define N 100000
#define K 16
#pragma acc routine
void assign_i_to_privj(int * priv, int j, int i);
#pragma acc routinetnumpy
void add_privi_to_sum(int * priv, int i, int *sum);

main.c:

#include "header.h"
int main(void){
int A[N];
#pragma acc data copy(A)
{
#pragma acc parallel loop
     for(int i=0; i<N;i++){
       int priv[K];
       int sum=0;
       int j=0;
       while(1){
       if(j>=K) break;
           assign_i_to_privj(priv, j, i);
           j++;
       }
       j=0;
       while(1){
           if(j>=K) break;
           add_privi_to_sum(priv, j, &sum);
           j++;
       }
       sum/=K; // now sum == i;
       A[i]=sum;
     }
   }
   //now A[i] == i
   for(int i=0; i<123; i++) printf("A[%d]=%d ",i, A[i]);
   printf("\n");
   return 0;
}

f.c:

#include "header.h"
void assign_i_to_privj(int *priv, int j, int i){
       priv[j]=i;
}
void add_privi_to_sum(int *priv, int j, int *sum){
      (*sum)+=priv[j];
}

I can see the compiler version with cc -v that returns Export PGI=/opt/pgi/17.5.0.

To compile:

cc -g -lnvToolsExt -O2  -acc  -ta=tesla:cc60  -c11 -mp -Minfo -Mlarge_arrays -c main.c &&
cc -g -lnvToolsExt -O2 -acc  -ta=tesla:cc60  -c11 -mp -Minfo -Mlarge_arrays -c f.c &&
cc -g -lnvToolsExt -O2 -acc  -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays f.o main.o -o acc.exe &&
srun  -n 1 acc.exe

The code should set all A[i] elements equal to i. When I run this code with OpenACC support, I get completely wrong results. My guess is a race condition. The version without openacccompile and runs correctly. At the end of the run A[i]==i

So, my question is: how can I make a small array to be private to all GPU threads with OpenACC?


Solution

  • The declaration of "priv" is getting hoisted out of the loop thus making it shared between the threads. The work around is to declare "priv" before the loop and then use the "private" clause to privatize it. You'll also want to schedule the loop as "gang vector" to prevent the compiler from automatically parallelizing the two inner loops.

    For example:

    % cat main.c
    #include "header.h"
    int main(void){
    int A[N];
    int priv[K];
    #pragma acc data copy(A)
    {
    #pragma acc parallel loop gang vector private(priv)
         for(int i=0; i<N;i++){
           int sum=0;
           int j=0;
           while(1){
           if(j>=K) break;
               assign_i_to_privj(priv, j, i);
               j++;
           }
           j=0;
           while(1){
               if(j>=K) break;
               add_privi_to_sum(priv, j, &sum);
               j++;
           }
           sum/=K; // now sum == i;
           A[i]=sum;
         }
       }
       //now A[i] == i
       for(int i=0; i<123; i++) printf("A[%d]=%d ",i, A[i]);
       printf("\n");
       return 0;
    }
    % pgcc f.c main.c -Minfo=acc -ta=tesla:cc60 -fast -V17.10
    f.c:
    assign_i_to_privj:
          2, Generating acc routine seq
             Generating Tesla code
    add_privi_to_sum:
          5, Generating acc routine seq
             Generating Tesla code
    main.c:
    main:
          5, Generating copy(A[:])
          7, Accelerator kernel generated
             Generating Tesla code
              8, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             11, #pragma acc loop seq
             17, #pragma acc loop seq
          7, Local memory used for priv
         11, Loop is parallelizable
         17, Loop is parallelizable
    % a.out
    A[0]=0 A[1]=1 A[2]=2 A[3]=3 A[4]=4 A[5]=5 A[6]=6 A[7]=7 A[8]=8 A[9]=9 A[10]=10 A[11]=11 A[12]=12 A[13]=13 A[14]=14 A[15]=15 A[16]=16 A[17]=17 A[18]=18 A[19]=19 A[20]=20 A[21]=21 A[22]=22 A[23]=23 A[24]=24 A[25]=25 A[26]=26 A[27]=27 A[28]=28 A[29]=29 A[30]=30 A[31]=31 A[32]=32 A[33]=33 A[34]=34 A[35]=35 A[36]=36 A[37]=37 A[38]=38 A[39]=39 A[40]=40 A[41]=41 A[42]=42 A[43]=43 A[44]=44 A[45]=45 A[46]=46 A[47]=47 A[48]=48 A[49]=49 A[50]=50 A[51]=51 A[52]=52 A[53]=53 A[54]=54 A[55]=55 A[56]=56 A[57]=57 A[58]=58 A[59]=59 A[60]=60 A[61]=61 A[62]=62 A[63]=63 A[64]=64 A[65]=65 A[66]=66 A[67]=67 A[68]=68 A[69]=69 A[70]=70 A[71]=71 A[72]=72 A[73]=73 A[74]=74 A[75]=75 A[76]=76 A[77]=77 A[78]=78 A[79]=79 A[80]=80 A[81]=81 A[82]=82 A[83]=83 A[84]=84 A[85]=85 A[86]=86 A[87]=87 A[88]=88 A[89]=89 A[90]=90 A[91]=91 A[92]=92 A[93]=93 A[94]=94 A[95]=95 A[96]=96 A[97]=97 A[98]=98 A[99]=99 A[100]=100 A[101]=101 A[102]=102 A[103]=103 A[104]=104 A[105]=105 A[106]=106 A[107]=107 A[108]=108 A[109]=109 A[110]=110 A[111]=111 A[112]=112 A[113]=113 A[114]=114 A[115]=115 A[116]=116 A[117]=117 A[118]=118 A[119]=119 A[120]=120 A[121]=121 A[122]=122