Struct OpenACC如何在函数的不同调用之间保存数据?

Struct OpenACC如何在函数的不同调用之间保存数据?,struct,gnu-parallel,openacc,Struct,Gnu Parallel,Openacc,我正试图用OpenACC优化一个应用程序。大体上,我有一个这种类型的迭代循环: while(t<tstop){ add(&data, nx); } 我在while循环中调用的函数是可并行的,但我没有做到的是在函数的不同调用之间在设备内存中维护数组x[] void add(Data *data, int n){ #pragma acc data pcopy(data[0:1]) #pragma acc data pcopy(data->x[0:n]) #

我正试图用OpenACC优化一个应用程序。大体上,我有一个这种类型的迭代循环:

while(t<tstop){

 add(&data, nx);

}
我在while循环中调用的函数是可并行的,但我没有做到的是在函数的不同调用之间在设备内存中维护数组x[]

void add(Data *data, int n){

  #pragma acc data pcopy(data[0:1])
  #pragma acc data pcopy(data->x[0:n])

  #pragma acc parallel loop
  for(int i=0; i < n ; i++){
    data->x[i] += 1.;
  }
  #pragma acc exit data copyout(data->x[0:n])
  #pragma acc exit data copyout(data[0:1])
}
void添加(数据*数据,整数n){
#pragma acc数据pcopy(数据[0:1])
#pragma acc数据pcopy(数据->x[0:n])
#pragma-acc并行环路
对于(int i=0;ix[i]+=1。;
}
#pragma acc退出数据复制输出(数据->x[0:n])
#pragma acc退出数据复制输出(数据[0:1])
}
我知道这个程序似乎毫无意义,但我只是写了一些东西来重现我在真实代码中遇到的问题

我尝试使用非结构化数据区域:

#pragma acc enter data copyin(data[0:1])
#pragma acc enter data copyin(data->x[0:n])

#pragma acc data present(data[:1], data->x[:n])
#pragma acc parallel loop
  for(int i=0; i < n ; i++){
    data->x[i] += 1.;
  }

#pragma acc exit data copyout(data->x[0:n])
#pragma acc exit data copyout(data[0:1])
#pragma acc输入数据copyin(数据[0:1])
#pragma acc输入数据copyin(数据->x[0:n])
#存在pragma acc数据(数据[:1],数据->x[:n])
#pragma-acc并行环路
对于(int i=0;ix[i]+=1。;
}
#pragma acc退出数据复制输出(数据->x[0:n])
#pragma acc退出数据复制输出(数据[0:1])
但由于某种原因,我得到了一个这种类型的错误:

while(t<tstop){

 add(&data, nx);

}

致命错误:data子句中的变量在设备上部分存在:name=data

我无法从它提供的代码剪报中重现部分存在的错误,因此不清楚发生此错误的原因。通常,当当前表中变量的大小与data子句中使用的大小不同时,会发生错误。如果你能提供一个复制的例子,我可以看一看并确定为什么会发生在这里

为了回答主题问题,可以在设备变量所在的数据区域范围内的任何位置访问设备变量,甚至可以跨子程序访问。对于非结构化数据区域(即输入数据/退出数据),范围在运行时在输入和退出调用之间定义。对于结构化数据区域,范围由结构化块定义

下面是一个使用上面定义的结构的示例(尽管我已经将x的大小作为结构的一部分)

%cat test.c
#包括
#包括
typedef结构数据_{
双*x;
int n;
}数据;
无效添加(数据*数据){
#存在pragma acc并行环路(数据)
对于(int i=0;in;i++){
数据->x[i]+=1。;
}
}
int main(){
数据*数据;
数据=(数据*)malloc(sizeof(数据));
数据->n=64;
数据->x=(双*)malloc(sizeof(双)*数据->n);
对于(int i=0;in;i++){
数据->x[i]=(双)i;
}
#pragma acc输入数据copyin(数据[0:1])
#pragma acc输入数据copyin(数据->x[0:数据->n])
添加(数据);
#pragma acc退出数据复制输出(数据->x[0:数据->n])
#pragma acc退出数据删除(数据)
对于(int i=0;in;i++){
printf(“%d:%f\n”,i,data->x[i]);
}
免费(数据->x);
免费(数据);
}
%pgcc试验。c-ta=特斯拉-Minfo=加速度;a、 出去
加:
12,生成当前(数据[:])
生成特斯拉码
13,#pragma acc循环组,向量(128)/*blockIdx.x threadIdx.x*/
主要内容:
28,生成输入数据copyin(数据[:1])
29,生成输入数据copyin(数据->x[:数据->n])
31,生成退出数据复制输出(数据->x[:数据->n])
32,生成退出数据删除(数据[:1])
0:1.000000
1:2.000000
2:3.000000
3:4.000000
4:5.000000
5:6.000000
6:7.000000
7:8.000000
8:9.000000
9:10.000000
10:11.000000
11:12.000000
12:13.000000
13:14.000000
14:15.000000
15:16.000000
16:17.000000
17:18.000000
18:19.000000
19:20.000000
20:21.000000
21:22.000000
22:23.000000
23:24.000000
24:25.000000
25:26.000000
26:27.000000
27:28.000000
28:29.000000
29:30.000000
30:31.000000
31:32.000000
32:33.000000
33:34.000000
34:35.000000
35:36.000000
36:37.000000
37:38.000000
38:39.000000
39:40.000000
40:41.000000
41:42.000000
42:43.000000
43:44.000000
44:45.000000
45:46.000000
46:47.000000
47:48.000000
48:49.000000
49:50.000000
50:51.000000
51:52.000000
52:53.000000
53:54.000000
54:55.000000
55:56.000000
56:57.000000
57:58.000000
58:59.000000
59:60.000000
60:61.000000
61:62.000000
62:63.000000
63:64.000000
还有,这里是第二个示例,但现在“data”是一个数组,其中每个“x”的大小可能不同

% cat test2.c
#include <stdio.h>
#include <stdlib.h>

#define M 16

typedef struct Data_{
  double *x;
  int n;
}Data;

void add(Data *data){

#pragma acc parallel loop present(data)
  for(int i=0; i < data->n ; i++){
    data->x[i] += 1.;
  }
}

int main () {

   Data *data;
   data = (Data*) malloc(sizeof(Data)*M);
#pragma acc enter data create(data[0:M])
   for (int i =0; i < M; ++i) {
      data[i].n = i+1;
      data[i].x = (double *) malloc(sizeof(double)*data[i].n);
      for(int j=0; j < data[i].n ; j++){
         data[i].x[j] = (double)((i*data[i].n) + j);
      }
#pragma acc update device(data[i].n)
#pragma acc enter data copyin(data[i].x[0:data[i].n])
   }

   for (int i =0; i < M; ++i) {
     add(&data[i]);
   }

   for (int i =0; i < M; ++i) {
#pragma acc update self(data[i].x[:data[i].n])
     for(int j=0; j < data[i].n ; j++){
      printf("%d:%d:%f\n",i,j,data[i].x[j]);
   }}

   for (int i =0; i < M; ++i) {
#pragma acc exit data delete(data[i].x)
      free(data[i].x);
   }
#pragma acc exit data delete(data)
   free(data);

}
% pgcc test2.c -ta=tesla -Minfo=accel; a.out
add:
     11, Generating present(data[:1])
         Generating Tesla code
         14, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
     22, Generating enter data create(data[:16])
     32, Generating update device(data->n)
         Generating enter data copyin(data->x[:data->n])
     38, Generating update self(data->x[:data->n])
     46, Generating exit data delete(data->x[:1])
     49, Generating exit data delete(data[:1])
0:0:1.000000
1:0:3.000000
1:1:4.000000
2:0:7.000000
2:1:8.000000
2:2:9.000000
3:0:13.000000
3:1:14.000000
3:2:15.000000
3:3:16.000000
4:0:21.000000
4:1:22.000000
4:2:23.000000
4:3:24.000000
4:4:25.000000
5:0:31.000000
5:1:32.000000
5:2:33.000000
5:3:34.000000
5:4:35.000000
5:5:36.000000
6:0:43.000000
6:1:44.000000
6:2:45.000000
6:3:46.000000
6:4:47.000000
6:5:48.000000
6:6:49.000000
7:0:57.000000
7:1:58.000000
7:2:59.000000
7:3:60.000000
7:4:61.000000
7:5:62.000000
7:6:63.000000
7:7:64.000000
8:0:73.000000
8:1:74.000000
8:2:75.000000
8:3:76.000000
8:4:77.000000
8:5:78.000000
8:6:79.000000
8:7:80.000000
8:8:81.000000
9:0:91.000000
9:1:92.000000
9:2:93.000000
9:3:94.000000
9:4:95.000000
9:5:96.000000
9:6:97.000000
9:7:98.000000
9:8:99.000000
9:9:100.000000
10:0:111.000000
10:1:112.000000
10:2:113.000000
10:3:114.000000
10:4:115.000000
10:5:116.000000
10:6:117.000000
10:7:118.000000
10:8:119.000000
10:9:120.000000
10:10:121.000000
11:0:133.000000
11:1:134.000000
11:2:135.000000
11:3:136.000000
11:4:137.000000
11:5:138.000000
11:6:139.000000
11:7:140.000000
11:8:141.000000
11:9:142.000000
11:10:143.000000
11:11:144.000000
12:0:157.000000
12:1:158.000000
12:2:159.000000
12:3:160.000000
12:4:161.000000
12:5:162.000000
12:6:163.000000
12:7:164.000000
12:8:165.000000
12:9:166.000000
12:10:167.000000
12:11:168.000000
12:12:169.000000
13:0:183.000000
13:1:184.000000
13:2:185.000000
13:3:186.000000
13:4:187.000000
13:5:188.000000
13:6:189.000000
13:7:190.000000
13:8:191.000000
13:9:192.000000
13:10:193.000000
13:11:194.000000
13:12:195.000000
13:13:196.000000
14:0:211.000000
14:1:212.000000
14:2:213.000000
14:3:214.000000
14:4:215.000000
14:5:216.000000
14:6:217.000000
14:7:218.000000
14:8:219.000000
14:9:220.000000
14:10:221.000000
14:11:222.000000
14:12:223.000000
14:13:224.000000
14:14:225.000000
15:0:241.000000
15:1:242.000000
15:2:243.000000
15:3:244.000000
15:4:245.000000
15:5:246.000000
15:6:247.000000
15:7:248.000000
15:8:249.000000
15:9:250.000000
15:10:251.000000
15:11:252.000000
15:12:253.000000
15:13:254.000000
15:14:255.000000
15:15:256.000000
%cat test2.c
#包括
#包括
#定义M16
typedef结构数据_{
双*x;
int n;
}数据;
无效添加(数据*数据){
#存在pragma acc并行环路(数据)
对于(int i=0;in;i++){
数据->x[i]+=1。;
}
}
int main(){
数据*数据;
数据=(数据*)malloc(sizeof(数据)*M);
#pragma acc输入数据创建(数据[0:M])
对于(int i=0;in)
正在生成输入数据复制(数据->x[:数据->n])
38,生成更新自我(数据->x[:数据->n])
46,生成退出数据删除(数据->x[:1])
49,生成退出数据删除(数据[:1])
0:0:1.000000
1:0:3.000000
1:1:4.000000
2:0:7.000000
2:1:8.000000
2:2:9.000000
3:0:13.000000
3:1:14.000000
3:2:15.000000
3:3:16.000000
4:0:21.000000
4:1:22.000000
4:2:23.000000
4:3:24.000000
4:4:25.000000
5:0:31.000000
5:1:32.000000
5:2:33.
% cat test2.c
#include <stdio.h>
#include <stdlib.h>

#define M 16

typedef struct Data_{
  double *x;
  int n;
}Data;

void add(Data *data){

#pragma acc parallel loop present(data)
  for(int i=0; i < data->n ; i++){
    data->x[i] += 1.;
  }
}

int main () {

   Data *data;
   data = (Data*) malloc(sizeof(Data)*M);
#pragma acc enter data create(data[0:M])
   for (int i =0; i < M; ++i) {
      data[i].n = i+1;
      data[i].x = (double *) malloc(sizeof(double)*data[i].n);
      for(int j=0; j < data[i].n ; j++){
         data[i].x[j] = (double)((i*data[i].n) + j);
      }
#pragma acc update device(data[i].n)
#pragma acc enter data copyin(data[i].x[0:data[i].n])
   }

   for (int i =0; i < M; ++i) {
     add(&data[i]);
   }

   for (int i =0; i < M; ++i) {
#pragma acc update self(data[i].x[:data[i].n])
     for(int j=0; j < data[i].n ; j++){
      printf("%d:%d:%f\n",i,j,data[i].x[j]);
   }}

   for (int i =0; i < M; ++i) {
#pragma acc exit data delete(data[i].x)
      free(data[i].x);
   }
#pragma acc exit data delete(data)
   free(data);

}
% pgcc test2.c -ta=tesla -Minfo=accel; a.out
add:
     11, Generating present(data[:1])
         Generating Tesla code
         14, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
     22, Generating enter data create(data[:16])
     32, Generating update device(data->n)
         Generating enter data copyin(data->x[:data->n])
     38, Generating update self(data->x[:data->n])
     46, Generating exit data delete(data->x[:1])
     49, Generating exit data delete(data[:1])
0:0:1.000000
1:0:3.000000
1:1:4.000000
2:0:7.000000
2:1:8.000000
2:2:9.000000
3:0:13.000000
3:1:14.000000
3:2:15.000000
3:3:16.000000
4:0:21.000000
4:1:22.000000
4:2:23.000000
4:3:24.000000
4:4:25.000000
5:0:31.000000
5:1:32.000000
5:2:33.000000
5:3:34.000000
5:4:35.000000
5:5:36.000000
6:0:43.000000
6:1:44.000000
6:2:45.000000
6:3:46.000000
6:4:47.000000
6:5:48.000000
6:6:49.000000
7:0:57.000000
7:1:58.000000
7:2:59.000000
7:3:60.000000
7:4:61.000000
7:5:62.000000
7:6:63.000000
7:7:64.000000
8:0:73.000000
8:1:74.000000
8:2:75.000000
8:3:76.000000
8:4:77.000000
8:5:78.000000
8:6:79.000000
8:7:80.000000
8:8:81.000000
9:0:91.000000
9:1:92.000000
9:2:93.000000
9:3:94.000000
9:4:95.000000
9:5:96.000000
9:6:97.000000
9:7:98.000000
9:8:99.000000
9:9:100.000000
10:0:111.000000
10:1:112.000000
10:2:113.000000
10:3:114.000000
10:4:115.000000
10:5:116.000000
10:6:117.000000
10:7:118.000000
10:8:119.000000
10:9:120.000000
10:10:121.000000
11:0:133.000000
11:1:134.000000
11:2:135.000000
11:3:136.000000
11:4:137.000000
11:5:138.000000
11:6:139.000000
11:7:140.000000
11:8:141.000000
11:9:142.000000
11:10:143.000000
11:11:144.000000
12:0:157.000000
12:1:158.000000
12:2:159.000000
12:3:160.000000
12:4:161.000000
12:5:162.000000
12:6:163.000000
12:7:164.000000
12:8:165.000000
12:9:166.000000
12:10:167.000000
12:11:168.000000
12:12:169.000000
13:0:183.000000
13:1:184.000000
13:2:185.000000
13:3:186.000000
13:4:187.000000
13:5:188.000000
13:6:189.000000
13:7:190.000000
13:8:191.000000
13:9:192.000000
13:10:193.000000
13:11:194.000000
13:12:195.000000
13:13:196.000000
14:0:211.000000
14:1:212.000000
14:2:213.000000
14:3:214.000000
14:4:215.000000
14:5:216.000000
14:6:217.000000
14:7:218.000000
14:8:219.000000
14:9:220.000000
14:10:221.000000
14:11:222.000000
14:12:223.000000
14:13:224.000000
14:14:225.000000
15:0:241.000000
15:1:242.000000
15:2:243.000000
15:3:244.000000
15:4:245.000000
15:5:246.000000
15:6:247.000000
15:7:248.000000
15:8:249.000000
15:9:250.000000
15:10:251.000000
15:11:252.000000
15:12:253.000000
15:13:254.000000
15:14:255.000000
15:15:256.000000