How to declare a global dynamic array with C/OpenACC with PGI compiler

I am trying to run a simple test case where a dynamically allocated array A is defined extern and is uploaded to the GPU using OpenACC. All with the PGI compiler.

My header.h file:

     extern int *A;
     #pragma acc declare create(A)

Then, my header.c implementation:

    int *A;
    #pragma acc declare copyin(A)

Then, in main.c I have

#include "header.h"
int main(int argc, char* argv[]){
        printf("main() start\n");
        int sum=0;
        int N=0;
        if(argc==1){ 
          printf("usage: ./main.exe N");
        }else{
          N=atoi(argv[1]);  
        }
        printf("N =%d\n", N);
        A=(int*)malloc(N*sizeof(int));
        for(int i=0;i<N;i++){A[i]=i;}
        printf("almost data region\n");
        #pragma acc data copy(sum)
        {
             printf("inside data region\n");
             #pragma acc update device(A[0:N])
             #pragma acc parallel loop reduction(+:sum)
             for(int i=0;i<N;i++){
                sum+=A[i];
             }
        }
        printf("sum = %d\n",sum);
    }

I compile the code with the following commands:

$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays   -c  -o header.o header.c
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays   -c  -o main.o main.c
PGC-W-0155-Pointer value created from a nonlong integral type  (main.c: 12)
main:
     13, Generated 2 alternate versions of the loop
         Generated vector simd code for the loop
     17, Generating copy(sum)
     21, Generating update device(A[:N])
         Accelerator kernel generated
         Generating Tesla code
         21, Generating reduction(+:sum)
         22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
PGC/x86-64 Linux 17.5-0: compilation completed with warnings
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays   header.o main.o -o main.exe

My PGI compiler version is:

$ cc -v
Export PGI=/opt/pgi/17.5.0

To execute the code:

$ ACC_NOTIFY=3 srun cuda-memcheck --show-backtrace yes main.exe 10000
upload CUDA data  file=/scratch/snx3000/ragagnin/2017/prova/main.c function=main line=17 device=0 threadid=1 variable=A bytes=8
upload CUDA data  file=/scratch/snx3000/ragagnin/2017/prova/main.c function=main line=17 device=0 threadid=1 variable=sum bytes=4
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 6.0, threadid=1
host:0x606780 device:0x10216200000 size:8 presentcount:0+1 line:-1 name:A
host:0x7fffffff67ac device:0x1021a400000 size:4 presentcount:1+0 line:17 name:sum
allocated block device:0x1021a400000 size:512 thread:1
FATAL ERROR: data in update device clause was not found on device 1: name=A
 file:/scratch/snx3000/ragagnin/2017/prova/main.c main line:21
main() start
N =10000
almost data region
inside data region
========= CUDA-MEMCHECK
========= Program hit CUDA_ERROR_INVALID_DEVICE (error 101) due to "invalid device ordinal" on CUDA API call to cuDevicePrimaryCtxRetain. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/opt/cray/nvidia/default/lib64/libcuda.so (cuDevicePrimaryCtxRetain + 0x15c) [0x1e497c]
=========     Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccnmp.so (__pgi_uacc_cuda_initdev + 0x962) [0x140e1]
=========     Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_enumerate + 0x173) [0x12e31]
=========     Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_initialize + 0x9b) [0x1340d]
=========     Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_dataenterstart + 0x50) [0x9de1]
=========     Host Frame:main.exe [0x16a5]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x206e5]
=========     Host Frame:main.exe [0x11c9]
=========
========= ERROR SUMMARY: 1 error
srun: error: nid03948: task 0: Exited with exit code 1
srun: Terminating job step 4066800.15

I think the problem is that the PGI compiler sends variable=A bytes=8, thus ignoring my request of sending A[0:N].

So, how to declare a global dynamic array with C/OpenACC with PGI compiler?

Solution

When you use "declare" with a pointer, you are creating a global device pointer but not the array that pointer points to. Hence when you try and update the array it doesn't exist and why the runtime errors.

To fix, you also need to add the array to a data region such as the "enter data" directive as I show below. When you put the array in a data region, besides creating space for the array, the runtime will then go back and "attach" it to "A", i.e. fill in the device copy of "A" with correct device pointer value.

You'll also want to tell the compiler that "A" is already present on device by putting a "present(A)" on the compute regions.

Note that the second "declare copyin" is not needed. Also, with "create" the device data is uninitialized while "copyin" will initialize the variable with the host value. But since the host value is a host pointer, it's still garbage on the device. So not necessarily wrong, just not needed.

% cat header.h

#include <stdio.h>
#include <stdlib.h>

extern int *A;
#pragma acc declare create(A)

% cat header.c
#include <header.h>
int *A;

% cat test.c
#include "header.h"
int main(int argc, char* argv[]){
        printf("main() start\n");
        int sum=0;
        int N=0;
        if(argc==1){
          printf("usage: ./main.exe N");
        }else{
          N=atoi(argv[1]);
        }
        printf("N =%d\n", N);
        A=(int*)malloc(N*sizeof(int));
        #pragma acc enter data create(A[0:N])

        for(int i=0;i<N;i++){A[i]=i;}
        printf("almost data region\n");
        #pragma acc data copy(sum)
        {
             printf("inside data region\n");
             #pragma acc update device(A[0:N])
             #pragma acc parallel loop present(A) reduction(+:sum)
             for(int i=0;i<N;i++){
                sum+=A[i];
             }
        }
        printf("sum = %d\n",sum);
        #pragma acc exit data delete(A)
        free(A);
        exit(0);
    }
% pgcc -I./ test.c header.c -ta=tesla:cc60 -Minfo=accel
test.c:
main:
     13, Generating enter data create(A[:N])
     17, Generating copy(sum)
     21, Generating update device(A[:N])
         Accelerator kernel generated
         Generating Tesla code
         21, Generating reduction(+:sum)
         22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     27, Generating exit data delete(A[:1])
header.c:
% setenv PGI_ACC_TIME 1
% a.out 1024
main() start
N =1024
almost data region
inside data region
sum = 523776

Accelerator Kernel Timing data
test.c
  main  NVIDIA  devicenum=0
    time(us): 124
    13: upload reached 1 time
        13: data copyin transfers: 1
             device time(us): total=33 max=33 min=33 avg=33
    13: data region reached 1 time
        13: data copyin transfers: 1
             device time(us): total=9 max=9 min=9 avg=9
    17: data region reached 2 times
        17: data copyin transfers: 1
             device time(us): total=33 max=33 min=33 avg=33
        26: data copyout transfers: 1
             device time(us): total=22 max=22 min=22 avg=22
    21: update directive reached 1 time
        21: data copyin transfers: 1
             device time(us): total=10 max=10 min=10 avg=10
    21: compute region reached 1 time
        21: kernel launched 1 time
            grid: [8]  block: [128]
             device time(us): total=4 max=4 min=4 avg=4
            elapsed time(us): total=589 max=589 min=589 avg=589
        21: reduction kernel launched 1 time
            grid: [1]  block: [256]
             device time(us): total=4 max=4 min=4 avg=4
            elapsed time(us): total=27 max=27 min=27 avg=27
    27: data region reached 1 time
        27: data copyin transfers: 1
             device time(us): total=9 max=9 min=9 avg=9