I'm trying to optimize an application with OpenACC. In the main, I have an iteration loop of this type:
while(t<tstop){
add(&data, nx);
}
Where data is a variable of type Data, defined by this Structure
typedef struct Data_{
double *x;
}Data;
The function I'm calling in the while loop is parallelizable, but what I don't manage to do is to maintain the array x[] in the device memory between the different calls of the function.
void add(Data *data, int n){
#pragma acc data pcopy(data[0:1])
#pragma acc data pcopy(data->x[0:n])
#pragma acc parallel loop
for(int i=0; i < n ; i++){
data->x[i] += 1.;
}
#pragma acc exit data copyout(data->x[0:n])
#pragma acc exit data copyout(data[0:1])
}
I know the program seems to be no sense but I just wrote something to reproduce the problem I have in the real code.
I tryied to use unstructured data region:
#pragma acc enter data copyin(data[0:1])
#pragma acc enter data copyin(data->x[0:n])
#pragma acc data present(data[:1], data->x[:n])
#pragma acc parallel loop
for(int i=0; i < n ; i++){
data->x[i] += 1.;
}
#pragma acc exit data copyout(data->x[0:n])
#pragma acc exit data copyout(data[0:1])
but for some reason I get an error of this type:
FATAL ERROR: variable in data clause is partially present on the device: name=data
I'm not able to reproduce the partially present error from the code snip-it provided so it's unclear why this error is occurring. In general, the error occurs when the size of the variable in the present table differs from the size being used in the data clause. If you can provide a reproducing example, I can take a look and determine why it's happening here.
To answer the topic question, device variables can be accessed anywhere within the scope of the data region they are in, even across subroutines. For unstructured data regions (i.e. enter data/exit data), the scope is defined at runtime between the enter and exit calls. For structured data regions, the scope is defined by the structured block.
Here's an example using the structure you define above (though I've included the size of x as part of the struct).
% cat test.c
#include <stdio.h>
#include <stdlib.h>
typedef struct Data_{
double *x;
int n;
}Data;
void add(Data *data){
#pragma acc parallel loop present(data)
for(int i=0; i < data->n ; i++){
data->x[i] += 1.;
}
}
int main () {
Data *data;
data = (Data*) malloc(sizeof(Data));
data->n = 64;
data->x = (double *) malloc(sizeof(double)*data->n);
for(int i=0; i < data->n ; i++){
data->x[i] = (double) i;
}
#pragma acc enter data copyin(data[0:1])
#pragma acc enter data copyin(data->x[0:data->n])
add(data);
#pragma acc exit data copyout(data->x[0:data->n])
#pragma acc exit data delete(data)
for(int i=0; i < data->n ; i++){
printf("%d:%f\n",i,data->x[i]);
}
free(data->x);
free(data);
}
% pgcc test.c -ta=tesla -Minfo=accel; a.out
add:
12, Generating present(data[:])
Generating Tesla code
13, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
28, Generating enter data copyin(data[:1])
29, Generating enter data copyin(data->x[:data->n])
31, Generating exit data copyout(data->x[:data->n])
32, Generating exit data delete(data[:1])
0:1.000000
1:2.000000
2:3.000000
3:4.000000
4:5.000000
5:6.000000
6:7.000000
7:8.000000
8:9.000000
9:10.000000
10:11.000000
11:12.000000
12:13.000000
13:14.000000
14:15.000000
15:16.000000
16:17.000000
17:18.000000
18:19.000000
19:20.000000
20:21.000000
21:22.000000
22:23.000000
23:24.000000
24:25.000000
25:26.000000
26:27.000000
27:28.000000
28:29.000000
29:30.000000
30:31.000000
31:32.000000
32:33.000000
33:34.000000
34:35.000000
35:36.000000
36:37.000000
37:38.000000
38:39.000000
39:40.000000
40:41.000000
41:42.000000
42:43.000000
43:44.000000
44:45.000000
45:46.000000
46:47.000000
47:48.000000
48:49.000000
49:50.000000
50:51.000000
51:52.000000
52:53.000000
53:54.000000
54:55.000000
55:56.000000
56:57.000000
57:58.000000
58:59.000000
59:60.000000
60:61.000000
61:62.000000
62:63.000000
63:64.000000
Also, here's a second example, but now with "data" being an array where the size of each "x" can be different.
% cat test2.c
#include <stdio.h>
#include <stdlib.h>
#define M 16
typedef struct Data_{
double *x;
int n;
}Data;
void add(Data *data){
#pragma acc parallel loop present(data)
for(int i=0; i < data->n ; i++){
data->x[i] += 1.;
}
}
int main () {
Data *data;
data = (Data*) malloc(sizeof(Data)*M);
#pragma acc enter data create(data[0:M])
for (int i =0; i < M; ++i) {
data[i].n = i+1;
data[i].x = (double *) malloc(sizeof(double)*data[i].n);
for(int j=0; j < data[i].n ; j++){
data[i].x[j] = (double)((i*data[i].n) + j);
}
#pragma acc update device(data[i].n)
#pragma acc enter data copyin(data[i].x[0:data[i].n])
}
for (int i =0; i < M; ++i) {
add(&data[i]);
}
for (int i =0; i < M; ++i) {
#pragma acc update self(data[i].x[:data[i].n])
for(int j=0; j < data[i].n ; j++){
printf("%d:%d:%f\n",i,j,data[i].x[j]);
}}
for (int i =0; i < M; ++i) {
#pragma acc exit data delete(data[i].x)
free(data[i].x);
}
#pragma acc exit data delete(data)
free(data);
}
% pgcc test2.c -ta=tesla -Minfo=accel; a.out
add:
11, Generating present(data[:1])
Generating Tesla code
14, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
22, Generating enter data create(data[:16])
32, Generating update device(data->n)
Generating enter data copyin(data->x[:data->n])
38, Generating update self(data->x[:data->n])
46, Generating exit data delete(data->x[:1])
49, Generating exit data delete(data[:1])
0:0:1.000000
1:0:3.000000
1:1:4.000000
2:0:7.000000
2:1:8.000000
2:2:9.000000
3:0:13.000000
3:1:14.000000
3:2:15.000000
3:3:16.000000
4:0:21.000000
4:1:22.000000
4:2:23.000000
4:3:24.000000
4:4:25.000000
5:0:31.000000
5:1:32.000000
5:2:33.000000
5:3:34.000000
5:4:35.000000
5:5:36.000000
6:0:43.000000
6:1:44.000000
6:2:45.000000
6:3:46.000000
6:4:47.000000
6:5:48.000000
6:6:49.000000
7:0:57.000000
7:1:58.000000
7:2:59.000000
7:3:60.000000
7:4:61.000000
7:5:62.000000
7:6:63.000000
7:7:64.000000
8:0:73.000000
8:1:74.000000
8:2:75.000000
8:3:76.000000
8:4:77.000000
8:5:78.000000
8:6:79.000000
8:7:80.000000
8:8:81.000000
9:0:91.000000
9:1:92.000000
9:2:93.000000
9:3:94.000000
9:4:95.000000
9:5:96.000000
9:6:97.000000
9:7:98.000000
9:8:99.000000
9:9:100.000000
10:0:111.000000
10:1:112.000000
10:2:113.000000
10:3:114.000000
10:4:115.000000
10:5:116.000000
10:6:117.000000
10:7:118.000000
10:8:119.000000
10:9:120.000000
10:10:121.000000
11:0:133.000000
11:1:134.000000
11:2:135.000000
11:3:136.000000
11:4:137.000000
11:5:138.000000
11:6:139.000000
11:7:140.000000
11:8:141.000000
11:9:142.000000
11:10:143.000000
11:11:144.000000
12:0:157.000000
12:1:158.000000
12:2:159.000000
12:3:160.000000
12:4:161.000000
12:5:162.000000
12:6:163.000000
12:7:164.000000
12:8:165.000000
12:9:166.000000
12:10:167.000000
12:11:168.000000
12:12:169.000000
13:0:183.000000
13:1:184.000000
13:2:185.000000
13:3:186.000000
13:4:187.000000
13:5:188.000000
13:6:189.000000
13:7:190.000000
13:8:191.000000
13:9:192.000000
13:10:193.000000
13:11:194.000000
13:12:195.000000
13:13:196.000000
14:0:211.000000
14:1:212.000000
14:2:213.000000
14:3:214.000000
14:4:215.000000
14:5:216.000000
14:6:217.000000
14:7:218.000000
14:8:219.000000
14:9:220.000000
14:10:221.000000
14:11:222.000000
14:12:223.000000
14:13:224.000000
14:14:225.000000
15:0:241.000000
15:1:242.000000
15:2:243.000000
15:3:244.000000
15:4:245.000000
15:5:246.000000
15:6:247.000000
15:7:248.000000
15:8:249.000000
15:9:250.000000
15:10:251.000000
15:11:252.000000
15:12:253.000000
15:13:254.000000
15:14:255.000000
15:15:256.000000
Note, be careful about copying structs with dynamic data members. Copying the struct itself, i.e. like you have above "#pragma acc exit data copyout(data[0:1])", will overwrite the host address of "x" with the device address. Instead, copy only "data->x" and delete "data".