在Radeon上使用OpenCL（Aparapi）进行简单归约_Opencl_Aparapi

在Radeon上使用OpenCL（Aparapi）进行简单归约

opencl

在Radeon上使用OpenCL（Aparapi）进行简单归约,opencl,aparapi,Opencl,Aparapi,我试图在OpenCL中对一个很大的double数组编写一个简单的归约（在本例中是求和）。我看过一些在线教程，最终得到的基本上就是下面这段代码： #pragma OPENCL EXTENSION cl_khr_fp64 : enable typedef struct This_s{ __global double *nums; int nums__javaArrayLength; __local double *buffer; __global double *res; int p

我试图在 OpenCL 中对一个很大的 double 数组编写一个简单的归约（在本例中是求和）。我看过一些在线教程，最终得到的基本上就是下面这段代码：

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/*
 * Per-work-item bundle of the kernel arguments.
 * The `run` kernel fills one instance from its parameters before doing any
 * work, so helpers such as get_pass_id() can take a single `This *`.
 */
typedef struct This_s{
   __global double *nums;       /* input values that `run` sums */
   int nums__javaArrayLength;   /* element count used as the loop bound over nums */
   __local double *buffer;      /* local scratch, indexed by get_local_id(0) in `run` */
   __global double *res;        /* output, indexed by get_group_id(0) in `run` */
   int passid;                  /* value returned by get_pass_id() */
}This;
/* Returns the pass id stored in the given kernel-state struct. */
int get_pass_id(This *this){
   return (*this).passid;
}
/*
 * Sum reduction of nums, producing one partial sum per work-group.
 *
 * Phase 1: every work-item accumulates a private sum over the elements
 *          i, i + gridSize, i + 2*gridSize, ... (< nums__javaArrayLength),
 *          where gridSize = get_local_size(0) * get_num_groups(0).
 * Phase 2: the private sums are stored in `buffer` (one slot per work-item)
 *          and folded pairwise in local memory until buffer[0] holds the
 *          total for the work-group.
 * Phase 3: work-item 0 writes that total to res[get_group_id(0)].
 *
 * Arguments:
 *   nums                   input values
 *   nums__javaArrayLength  number of elements of nums to sum
 *   buffer                 local scratch; needs at least get_local_size(0) doubles
 *   res                    output; needs at least get_num_groups(0) doubles
 *   passid                 copied into the This struct; not otherwise used here
 *
 * The fold works for any work-group size. The previous hand-unrolled
 * 32/16/8/4/2/1 sequence was only correct for a size of exactly 64: smaller
 * groups read buffer slots no work-item had written, larger groups dropped
 * every partial sum at index 64 and above. For a size of 64 this loop performs
 * the same additions in the same order as the unrolled version.
 */
__kernel void run(
   __global double *nums, 
   int nums__javaArrayLength, 
   __local double *buffer, 
   __global double *res, 
   int passid
){
   /* Aparapi-style state struct; helpers such as get_pass_id() take a This*. */
   This thisStruct;
   This* this=&thisStruct;
   this->nums = nums;
   this->nums__javaArrayLength = nums__javaArrayLength;
   this->buffer = buffer;
   this->res = res;
   this->passid = passid;
   {
      int tid = get_local_id(0);
      int i = (get_group_id(0) * get_local_size(0)) + get_local_id(0);
      int gridSize = get_local_size(0) * get_num_groups(0);
      int n = this->nums__javaArrayLength;

      /* Phase 1: private partial sum over this work-item's strided elements. */
      double cur = 0.0;
      for (; i<n; i = i + gridSize){
         cur = cur + this->nums[i];
      }
      this->buffer[tid]  = cur;
      /* One barrier is enough to make every slot visible to the group. */
      barrier(CLK_LOCAL_MEM_FENCE);

      /*
       * Phase 2: fold the upper part of the live range onto the lower part.
       * `width` is the number of live slots; `half` is rounded up so that an
       * odd width keeps its middle slot untouched for the next round.
       * Writers touch indices < half and readers touch indices >= half, so
       * there is no read/write overlap within a round. `width` is the same
       * for all work-items, so every work-item reaches every barrier.
       */
      int width = get_local_size(0);
      while (width > 1){
         int half = (width + 1) >> 1;
         if (tid + half < width){
            this->buffer[tid]  = this->buffer[tid] + this->buffer[(tid + half)];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
         width = half;
      }

      /* Phase 3: publish the work-group total. */
      if (tid==0){
         this->res[get_group_id(0)]  = this->buffer[0];
      }
      return;
   }
}

#pragma OPENCL扩展cl_khr_fp64:启用
typedef结构这个{
__全球双*nums；
int nums__javaarraylelength；
__本地双*缓冲区；
__全球双*res；
int passid；
}这,；
int get_pass_id（This*This）{
返回此->密码ID；
}
__内核无效运行(
__全球双*nums，
int nums__javaArrayLength，
__本地双*缓冲区，
__全球双*res，
整数密码
){
这个结构；
This*This=&thisStruct；
这->nums=nums；
这->nums\uuu javaArrayLength=nums\uu javaArrayLength；
这个->缓冲区=缓冲区；
这->res=res；
此->passid=passid；
{
int tid=获取本地id（0）；
int i=（获取组id（0）*获取本地大小（0））+获取本地id（0）；
int gridSize=get_local_size（0）*get_num_groups（0）；
int n=此->nums\uuu javaArrayLength；
双电流=0.0；
对于（；单位[i]；
}
此->缓冲区[tid]=cur；
屏障（CLK_本地_MEM_围栏）；
屏障（CLK_本地_MEM_围栏）；
如果（tidbuffer[tid]=this->buffer[tid]+this->buffer[（tid+32）]；
}
屏障（CLK_本地_MEM_围栏）；
如果（tidbuffer[tid]=this->buffer[tid]+this->buffer[（tid+16）]；
}
屏障（CLK_本地_MEM_围栏）；
如果（tidbuffer[tid]=this->buffer[tid]+this->buffer[（tid+8）]；
}
屏障（CLK_本地_MEM_围栏）；
如果（tidbuffer[tid]=this->buffer[tid]+this->buffer[（tid+4）]；
}
屏障（CLK_本地_MEM_围栏）；
如果（tidbuffer[tid]=this->buffer[tid]+this->buffer[（tid+2）]；
}
屏障（CLK_本地_MEM_围栏）；
如果（tidbuffer[tid]=this->buffer[tid]+this->buffer[（tid+1）]；
}
屏障（CLK_本地_MEM_围栏）；
如果（tid==0）{
this->res[get_group_id（0）]=this->buffer[0]；
}
回来
}
}

如果您对代码中那个奇怪的 `this` 感到疑惑：那是 Aparapi 的一个（很遗憾但又必要的）产物——我用 Aparapi 将 Java 翻译成 OpenCL。
我的内核能产生正确的结果，在相当强劲的 Nvidia 硬件上，它比 Java 中的顺序求和快约 10 倍。但在 Radeon R9 280 上，它的性能只与朴素的 Java 代码相当。
我已经用CodeXL分析了内核。它告诉我MemUnitBusy只有6%。为什么这么低？
结果表明，问题并不（直接）出在 OpenCL 上，而是出在 Aparapi 的缓冲区管理上。
我在不使用 Aparapi 的情况下尝试了完全相同的内核，性能很好。当我使用 `CL_MEM_USE_HOST_PTR` 时，性能就会变差，而这是使用 Aparapi 时唯一的选项。看起来在使用该选项时，AMD 不会将主机内存复制到设备上，即使经过几次“预热”运行也是如此。

您可能需要考虑迁移到更活跃的项目。它包含了对上面链接的旧库的 bug 修复，以及许多额外的特性和性能增强。它在 Maven Central 中也已发布了十几个版本，因此使用起来更容易。新的……

我为此开了一个工单（ticket）并添加了悬赏（bounty），见这里：