I am bringing a code to the GPU. This code has a kernel that makes use of a private array. This means that the array is declared inside the kernel loop.
When I port the code to OpenACC I get buggy results. To me, it looks like the array is shared between GPU vector threads and this cause several race conditions.
I organized the following example with also external calls because that's the way my original code looks like.
header.h:
#define N 100000
#define K 16
#pragma acc routine
void assign_i_to_privj(int * priv, int j, int i);
#pragma acc routinetnumpy
void add_privi_to_sum(int * priv, int i, int *sum);
main.c:
#include "header.h"
int main(void){
int A[N];
#pragma acc data copy(A)
{
#pragma acc parallel loop
for(int i=0; i<N;i++){
int priv[K];
int sum=0;
int j=0;
while(1){
if(j>=K) break;
assign_i_to_privj(priv, j, i);
j++;
}
j=0;
while(1){
if(j>=K) break;
add_privi_to_sum(priv, j, &sum);
j++;
}
sum/=K; // now sum == i;
A[i]=sum;
}
}
//now A[i] == i
for(int i=0; i<123; i++) printf("A[%d]=%d ",i, A[i]);
printf("\n");
return 0;
}
f.c:
#include "header.h"
void assign_i_to_privj(int *priv, int j, int i){
priv[j]=i;
}
void add_privi_to_sum(int *priv, int j, int *sum){
(*sum)+=priv[j];
}
I can see the compiler version with cc -v
that returns Export PGI=/opt/pgi/17.5.0
.
To compile:
cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays -c main.c &&
cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays -c f.c &&
cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays f.o main.o -o acc.exe &&
srun -n 1 acc.exe
The code should set all A[i]
elements equal to i
. When I run this code with OpenACC support, I get completely wrong results. My guess is a race condition.
The version without openacc
compile and runs correctly. At the end of the run A[i]==i
So, my question is: how can I make a small array to be private to all GPU threads with OpenACC?
The declaration of "priv" is getting hoisted out of the loop thus making it shared between the threads. The work around is to declare "priv" before the loop and then use the "private" clause to privatize it. You'll also want to schedule the loop as "gang vector" to prevent the compiler from automatically parallelizing the two inner loops.
For example:
% cat main.c
#include "header.h"
int main(void){
int A[N];
int priv[K];
#pragma acc data copy(A)
{
#pragma acc parallel loop gang vector private(priv)
for(int i=0; i<N;i++){
int sum=0;
int j=0;
while(1){
if(j>=K) break;
assign_i_to_privj(priv, j, i);
j++;
}
j=0;
while(1){
if(j>=K) break;
add_privi_to_sum(priv, j, &sum);
j++;
}
sum/=K; // now sum == i;
A[i]=sum;
}
}
//now A[i] == i
for(int i=0; i<123; i++) printf("A[%d]=%d ",i, A[i]);
printf("\n");
return 0;
}
% pgcc f.c main.c -Minfo=acc -ta=tesla:cc60 -fast -V17.10
f.c:
assign_i_to_privj:
2, Generating acc routine seq
Generating Tesla code
add_privi_to_sum:
5, Generating acc routine seq
Generating Tesla code
main.c:
main:
5, Generating copy(A[:])
7, Accelerator kernel generated
Generating Tesla code
8, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
11, #pragma acc loop seq
17, #pragma acc loop seq
7, Local memory used for priv
11, Loop is parallelizable
17, Loop is parallelizable
% a.out
A[0]=0 A[1]=1 A[2]=2 A[3]=3 A[4]=4 A[5]=5 A[6]=6 A[7]=7 A[8]=8 A[9]=9 A[10]=10 A[11]=11 A[12]=12 A[13]=13 A[14]=14 A[15]=15 A[16]=16 A[17]=17 A[18]=18 A[19]=19 A[20]=20 A[21]=21 A[22]=22 A[23]=23 A[24]=24 A[25]=25 A[26]=26 A[27]=27 A[28]=28 A[29]=29 A[30]=30 A[31]=31 A[32]=32 A[33]=33 A[34]=34 A[35]=35 A[36]=36 A[37]=37 A[38]=38 A[39]=39 A[40]=40 A[41]=41 A[42]=42 A[43]=43 A[44]=44 A[45]=45 A[46]=46 A[47]=47 A[48]=48 A[49]=49 A[50]=50 A[51]=51 A[52]=52 A[53]=53 A[54]=54 A[55]=55 A[56]=56 A[57]=57 A[58]=58 A[59]=59 A[60]=60 A[61]=61 A[62]=62 A[63]=63 A[64]=64 A[65]=65 A[66]=66 A[67]=67 A[68]=68 A[69]=69 A[70]=70 A[71]=71 A[72]=72 A[73]=73 A[74]=74 A[75]=75 A[76]=76 A[77]=77 A[78]=78 A[79]=79 A[80]=80 A[81]=81 A[82]=82 A[83]=83 A[84]=84 A[85]=85 A[86]=86 A[87]=87 A[88]=88 A[89]=89 A[90]=90 A[91]=91 A[92]=92 A[93]=93 A[94]=94 A[95]=95 A[96]=96 A[97]=97 A[98]=98 A[99]=99 A[100]=100 A[101]=101 A[102]=102 A[103]=103 A[104]=104 A[105]=105 A[106]=106 A[107]=107 A[108]=108 A[109]=109 A[110]=110 A[111]=111 A[112]=112 A[113]=113 A[114]=114 A[115]=115 A[116]=116 A[117]=117 A[118]=118 A[119]=119 A[120]=120 A[121]=121 A[122]=122