Search code examples
structgnu-parallelopenacc

OpenACC How can I keep a data between differetn calls of a function?


I'm trying to optimize an application with OpenACC. In the main, I have an iteration loop of this type:

while(t<tstop){

 add(&data, nx);

}

Where data is a variable of type Data, defined by this Structure

typedef struct Data_{   
  double *x;    
}Data;

The function I'm calling in the while loop is parallelizable, but what I don't manage to do is to maintain the array x[] in the device memory between the different calls of the function.

void add(Data *data, int n){

  #pragma acc data pcopy(data[0:1])
  #pragma acc data pcopy(data->x[0:n])

  #pragma acc parallel loop
  for(int i=0; i < n ; i++){
    data->x[i] += 1.;
  }
  #pragma acc exit data copyout(data->x[0:n])
  #pragma acc exit data copyout(data[0:1])
}

I know the program seems to be no sense but I just wrote something to reproduce the problem I have in the real code.

I tryied to use unstructured data region:

#pragma acc enter data copyin(data[0:1])
#pragma acc enter data copyin(data->x[0:n])

#pragma acc data present(data[:1], data->x[:n])
#pragma acc parallel loop
  for(int i=0; i < n ; i++){
    data->x[i] += 1.;
  }

#pragma acc exit data copyout(data->x[0:n])
#pragma acc exit data copyout(data[0:1])

but for some reason I get an error of this type:

FATAL ERROR: variable in data clause is partially present on the device: name=data


Solution

  • I'm not able to reproduce the partially present error from the code snip-it provided so it's unclear why this error is occurring. In general, the error occurs when the size of the variable in the present table differs from the size being used in the data clause. If you can provide a reproducing example, I can take a look and determine why it's happening here.

    To answer the topic question, device variables can be accessed anywhere within the scope of the data region they are in, even across subroutines. For unstructured data regions (i.e. enter data/exit data), the scope is defined at runtime between the enter and exit calls. For structured data regions, the scope is defined by the structured block.

    Here's an example using the structure you define above (though I've included the size of x as part of the struct).

    % cat test.c
    #include <stdio.h>
    #include <stdlib.h>
    
    
    typedef struct Data_{
      double *x;
      int n;
    }Data;
    
    void add(Data *data){
    
    #pragma acc parallel loop present(data)
      for(int i=0; i < data->n ; i++){
        data->x[i] += 1.;
      }
    }
    
    int main () {
    
       Data *data;
       data = (Data*) malloc(sizeof(Data));
       data->n = 64;
       data->x = (double *) malloc(sizeof(double)*data->n);
       for(int i=0; i < data->n ; i++){
          data->x[i] = (double) i;
       }
    
    #pragma acc enter data copyin(data[0:1])
    #pragma acc enter data copyin(data->x[0:data->n])
       add(data);
    #pragma acc exit data copyout(data->x[0:data->n])
    #pragma acc exit data delete(data)
    
       for(int i=0; i < data->n ; i++){
          printf("%d:%f\n",i,data->x[i]);
       }
       free(data->x);
       free(data);
    }
    % pgcc test.c -ta=tesla -Minfo=accel; a.out
    add:
         12, Generating present(data[:])
             Generating Tesla code
             13, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    main:
         28, Generating enter data copyin(data[:1])
         29, Generating enter data copyin(data->x[:data->n])
         31, Generating exit data copyout(data->x[:data->n])
         32, Generating exit data delete(data[:1])
    0:1.000000
    1:2.000000
    2:3.000000
    3:4.000000
    4:5.000000
    5:6.000000
    6:7.000000
    7:8.000000
    8:9.000000
    9:10.000000
    10:11.000000
    11:12.000000
    12:13.000000
    13:14.000000
    14:15.000000
    15:16.000000
    16:17.000000
    17:18.000000
    18:19.000000
    19:20.000000
    20:21.000000
    21:22.000000
    22:23.000000
    23:24.000000
    24:25.000000
    25:26.000000
    26:27.000000
    27:28.000000
    28:29.000000
    29:30.000000
    30:31.000000
    31:32.000000
    32:33.000000
    33:34.000000
    34:35.000000
    35:36.000000
    36:37.000000
    37:38.000000
    38:39.000000
    39:40.000000
    40:41.000000
    41:42.000000
    42:43.000000
    43:44.000000
    44:45.000000
    45:46.000000
    46:47.000000
    47:48.000000
    48:49.000000
    49:50.000000
    50:51.000000
    51:52.000000
    52:53.000000
    53:54.000000
    54:55.000000
    55:56.000000
    56:57.000000
    57:58.000000
    58:59.000000
    59:60.000000
    60:61.000000
    61:62.000000
    62:63.000000
    63:64.000000
    

    Also, here's a second example, but now with "data" being an array where the size of each "x" can be different.

    % cat test2.c
    #include <stdio.h>
    #include <stdlib.h>
    
    #define M 16
    
    typedef struct Data_{
      double *x;
      int n;
    }Data;
    
    void add(Data *data){
    
    #pragma acc parallel loop present(data)
      for(int i=0; i < data->n ; i++){
        data->x[i] += 1.;
      }
    }
    
    int main () {
    
       Data *data;
       data = (Data*) malloc(sizeof(Data)*M);
    #pragma acc enter data create(data[0:M])
       for (int i =0; i < M; ++i) {
          data[i].n = i+1;
          data[i].x = (double *) malloc(sizeof(double)*data[i].n);
          for(int j=0; j < data[i].n ; j++){
             data[i].x[j] = (double)((i*data[i].n) + j);
          }
    #pragma acc update device(data[i].n)
    #pragma acc enter data copyin(data[i].x[0:data[i].n])
       }
    
       for (int i =0; i < M; ++i) {
         add(&data[i]);
       }
    
       for (int i =0; i < M; ++i) {
    #pragma acc update self(data[i].x[:data[i].n])
         for(int j=0; j < data[i].n ; j++){
          printf("%d:%d:%f\n",i,j,data[i].x[j]);
       }}
    
       for (int i =0; i < M; ++i) {
    #pragma acc exit data delete(data[i].x)
          free(data[i].x);
       }
    #pragma acc exit data delete(data)
       free(data);
    
    }
    % pgcc test2.c -ta=tesla -Minfo=accel; a.out
    add:
         11, Generating present(data[:1])
             Generating Tesla code
             14, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    main:
         22, Generating enter data create(data[:16])
         32, Generating update device(data->n)
             Generating enter data copyin(data->x[:data->n])
         38, Generating update self(data->x[:data->n])
         46, Generating exit data delete(data->x[:1])
         49, Generating exit data delete(data[:1])
    0:0:1.000000
    1:0:3.000000
    1:1:4.000000
    2:0:7.000000
    2:1:8.000000
    2:2:9.000000
    3:0:13.000000
    3:1:14.000000
    3:2:15.000000
    3:3:16.000000
    4:0:21.000000
    4:1:22.000000
    4:2:23.000000
    4:3:24.000000
    4:4:25.000000
    5:0:31.000000
    5:1:32.000000
    5:2:33.000000
    5:3:34.000000
    5:4:35.000000
    5:5:36.000000
    6:0:43.000000
    6:1:44.000000
    6:2:45.000000
    6:3:46.000000
    6:4:47.000000
    6:5:48.000000
    6:6:49.000000
    7:0:57.000000
    7:1:58.000000
    7:2:59.000000
    7:3:60.000000
    7:4:61.000000
    7:5:62.000000
    7:6:63.000000
    7:7:64.000000
    8:0:73.000000
    8:1:74.000000
    8:2:75.000000
    8:3:76.000000
    8:4:77.000000
    8:5:78.000000
    8:6:79.000000
    8:7:80.000000
    8:8:81.000000
    9:0:91.000000
    9:1:92.000000
    9:2:93.000000
    9:3:94.000000
    9:4:95.000000
    9:5:96.000000
    9:6:97.000000
    9:7:98.000000
    9:8:99.000000
    9:9:100.000000
    10:0:111.000000
    10:1:112.000000
    10:2:113.000000
    10:3:114.000000
    10:4:115.000000
    10:5:116.000000
    10:6:117.000000
    10:7:118.000000
    10:8:119.000000
    10:9:120.000000
    10:10:121.000000
    11:0:133.000000
    11:1:134.000000
    11:2:135.000000
    11:3:136.000000
    11:4:137.000000
    11:5:138.000000
    11:6:139.000000
    11:7:140.000000
    11:8:141.000000
    11:9:142.000000
    11:10:143.000000
    11:11:144.000000
    12:0:157.000000
    12:1:158.000000
    12:2:159.000000
    12:3:160.000000
    12:4:161.000000
    12:5:162.000000
    12:6:163.000000
    12:7:164.000000
    12:8:165.000000
    12:9:166.000000
    12:10:167.000000
    12:11:168.000000
    12:12:169.000000
    13:0:183.000000
    13:1:184.000000
    13:2:185.000000
    13:3:186.000000
    13:4:187.000000
    13:5:188.000000
    13:6:189.000000
    13:7:190.000000
    13:8:191.000000
    13:9:192.000000
    13:10:193.000000
    13:11:194.000000
    13:12:195.000000
    13:13:196.000000
    14:0:211.000000
    14:1:212.000000
    14:2:213.000000
    14:3:214.000000
    14:4:215.000000
    14:5:216.000000
    14:6:217.000000
    14:7:218.000000
    14:8:219.000000
    14:9:220.000000
    14:10:221.000000
    14:11:222.000000
    14:12:223.000000
    14:13:224.000000
    14:14:225.000000
    15:0:241.000000
    15:1:242.000000
    15:2:243.000000
    15:3:244.000000
    15:4:245.000000
    15:5:246.000000
    15:6:247.000000
    15:7:248.000000
    15:8:249.000000
    15:9:250.000000
    15:10:251.000000
    15:11:252.000000
    15:12:253.000000
    15:13:254.000000
    15:14:255.000000
    15:15:256.000000
    

    Note, be careful about copying structs with dynamic data members. Copying the struct itself, i.e. like you have above "#pragma acc exit data copyout(data[0:1])", will overwrite the host address of "x" with the device address. Instead, copy only "data->x" and delete "data".