OpenCL：将本地原子_inc插入到还原内核_Opencl

OpenCL：在归约（reduction）内核中插入局部 atomic_inc

opencl

OpenCL：将本地原子_inc插入到还原内核,opencl,Opencl,我试图在一个可工作的精简内核中包含一个类似于暗零描述的局部原子。内核在一组点中找到一个最大值；局部原子的目的是允许我将选定的点号过滤到输出数组中，而不留任何间隙目前，当我使用局部原子来增加对局部数组的加法时，内核运行，但产生了一个错误的总体最高点。如果原子行被注释掉，则返回正确的结果这里发生了什么？我该如何修复它简化内核代码： __kernel void reduce(__global const float4* dataSet, __global const int* input, co

我试图在一个已经可以正常工作的归约（reduction）内核中加入一个局部原子操作（atomic_inc），做法类似于 DarkZeros 所描述的那样。内核在一组点中找出最大值；局部原子操作的目的是让我把选中的点编号过滤到输出数组中，并且不留任何空隙。

目前，当我用局部原子操作的返回值作为下标、递增地写入局部数组时，内核可以运行，但得到的总体最高点是错误的。如果把原子操作那一行注释掉，则返回正确的结果。

这里发生了什么？我该如何修复它？

简化内核代码：

// reduce: per-work-group search for the point with the largest z component.
//
// Each work-item walks the index list `input`, loads pairs of points from
// `dataSet`, and keeps the higher-z point in its own slot of `shared`.
// Work-item 0 finally writes one int per work-group to `output`.
//
// Parameters:
//   dataSet      - point data; indexed with (input[...] - 1), i.e. the indices
//                  stored in `input` are treated as 1-based.
//   input        - indices into dataSet.
//   items        - used as group_size (the offset between the two points of a pair).
//                  NOTE(review): presumably equal to get_local_size(0) - confirm with the host code.
//   output       - one result per work-group: (int)shared[0].w + 1.
//   shared       - local scratch buffer, one float4 per work-item.
//   n            - upper bound for the loop index i.
//   filtered, tri_input, global_count
//                - not referenced in this simplified version of the kernel.
//   pass         - the local counter/filter code only runs when pass == 0.
//
// NOTE(review): filter_local has 64 elements, but the index returned by
// atomic_inc(&local_count_set_1) grows by one for every work-item on every
// loop iteration and is never reset or bounds-checked inside the loop. Once
// it reaches 64 the write lands outside filter_local; this may well be what
// corrupts the reduction result - confirm.
__kernel void reduce(__global const float4* dataSet, __global const int* input, const unsigned int items,                                   //points and index
                    __global int* output, __local float4* shared, const unsigned int n,                                                 //finding highest
                        __global int* filtered, __global const float2* tri_input, const unsigned int pass,                              //finding filtered
                            __global int* global_count                                                                                  //global count
                                ){
//set everything up

const unsigned int group_id = get_global_id(0) / get_local_size(0);
const unsigned int local_id = get_local_id(0);
const unsigned int group_size = items;
const unsigned int group_stride = 2 * group_size;
// NOTE(review): the per-iteration stride is group_stride * group_size (2 * items * items),
// not group_stride * number-of-groups - confirm this is intended.
const int local_stride = group_stride * group_size;

// clear this work-item's slot so the first comparison in the loop is against z == 0
__local float4 *zeroIt = &shared[local_id];
zeroIt->x = 0; zeroIt->y = 0; zeroIt->z = 0; zeroIt->w = 0;

volatile __local int local_count_set_1;   // counter incremented with atomic_inc below
volatile __local int global_val_set_1;    // initialised to -1 below, otherwise unused here
volatile __local int filter_local[64];    // fixed capacity of 64 entries

// work-item 0 initialises the local variables once; the barrier makes the
// writes visible to the whole work-group before the loop starts
if(local_id==0){
    local_count_set_1 = 0;
    global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);

int i = group_id * group_stride + local_id;

while (i < n){
//load up a pair of points using the index to locate them within a massive dataSet
    int ia = input[i];
    float4 a = dataSet[ia-1];

    // NOTE(review): i + group_size is not checked against n before this read - confirm `input` is padded
    int ib = input[i + group_size];
    float4 b = dataSet[ib-1];

    //on the first pass kernel increment a local count
    if(pass == 0){
        // atomic_inc returns the counter's previous value, which is used as the write index
        filter_local[atomic_inc(&local_count_set_1)] = 1;  //including this line causes an erroneous highest point result
        //filter_local[local_id] = 1; //but including this line does not
        //atomic_inc(&local_count_set_1); //and neither does this one
    }

    //find the highest of the pair
    float4 result;
    if(a.z>b.z) result = a;
    else result = b;

    //load up the previous highest result locally
    float4 s = shared[local_id];

    //if the previous highest beat this, stick, else twist
    if(s.z>result.z){ result = s; }
    shared[local_id] = result;
    i += local_stride;
}

// wait until every work-item has stored its running maximum in shared
barrier(CLK_LOCAL_MEM_FENCE);
// first step of the in-group reduction: the lower 256 slots absorb the upper 256
if (group_size >= 512){
    if (local_id < 256) {
        __local float4 *a = &shared[local_id];
        __local float4 *b = &shared[local_id+256];
        if(b->z>a->z){  shared[local_id] = shared[local_id+256]; }
    }}

//repeat barrier ops in increments down to group_size>=2 - this filters the highest result in shared
//finally, return the filtered highest result of shared to the global level

barrier(CLK_LOCAL_MEM_FENCE);
    if(local_id == 0){
        // NOTE(review): w presumably carries the point's index; it is truncated to int here - confirm
        __local float4 *v = &shared[0];
        int send = v->w ;
        output[group_id] = send+1;
    }}

\uuuuu内核void reduce（\uuuu全局常量float4*数据集，\uuuu全局常量int*输入，常量无符号int项，//点和索引
__全局int*输出，\ uu本地float4*共享，常量unsigned int n，//查找最高值
__全局整型*已筛选，\全局常量浮点2*三输入，常量无符号整型传递，//查找已筛选
__全局整数*全局计数//全局计数
){
//安排一切
const unsigned int group_id=get_global_id（0）/get_local_size（0）；
const unsigned int local_id=get_local_id（0）；
常量无符号整数组大小=项；
常量无符号整数组步长=2*组大小；
const int local_stride=组步数*组步数大小；
__本地float4*zeroIt=&shared[local_id]；
零位->x=0；零位->y=0；零位->z=0；零位->w=0；
volatile uu local int local u count u set 1；
volatile uu local int global u val u set_1；
volatile uu local int filter_local[64]；
如果（本地_id==0）{
本地计数集=0；
全局值集1=-1；
}
屏障（CLK_本地_MEM_围栏）；
int i=组id*组步幅+本地id；
而（ib.z）结果=a；
其他结果=b；
//本地加载上一个最高的结果
float4 s=共享[本地_id]；
//如果前一个最高点击败了这个，坚持，否则扭转
如果（s.z>result.z）{result=s；}
共享的[local_id]=结果；
i+=局部步幅；
}
屏障（CLK_本地_MEM_围栏）；
如果（组大小>=512）{
如果（本地_id<256）{
__本地float4*a=&shared[local_id]；
__本地float4*b=&shared[local_id+256]；
如果（b->z>a->z）{shared[local_id]=shared[local_id+256]；}
}}
//以增量重复屏障操作，直到组大小>=2-这将过滤共享中的最高结果
//最后，将筛选出的最高共享结果返回到全局级别
屏障（CLK_本地_MEM_围栏）；
如果（本地_id==0）{
__本地float4*v=&shared[0]；
int send=v->w；
输出[组id]=发送+1；
}}

[更新]：当包含 atomic_inc 那一行时，“错误”的最高点结果始终是测试数据集末尾附近的一个点。我猜这意味着 atomic_inc 影响了后面的比较，但我不确定具体影响了什么、发生在哪里。

[更新]：编辑代码以简化/澄清/更新调试调整。仍然不起作用，这让我发疯。

真是让人捂脸（facepalm）的时刻。在内核的初始化阶段，有以下几行：

// Work-item 0 initialises both local variables once, before the main loop;
// the barrier makes the writes visible to the rest of the work-group.
if(local_id==0){
   local_count_set_1 = 0;
   global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);

把这两个赋值拆开，并把 local_count_set_1 的重置放到 while 循环内部之后，就不再出错了，即：

// Only global_val_set_1 is still initialised once, before the loop.
if(local_id==0) global_val_set_1 = -1;
barrier(CLK_LOCAL_MEM_FENCE);

while (i < n){
    // Reset the local counter at the top of every iteration, so the index
    // returned by atomic_inc starts from 0 again on each pass through the loop.
    // NOTE(review): a barrier inside a loop must be reached by every work-item
    // of the group on every iteration - confirm all work-items run the same
    // number of iterations.
    if(local_id==0) local_count_set_1 = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    ....
    // comparison (==), not assignment: `pass` is a const kernel argument
    if(pass == 0){
        filter_local[atomic_inc(&local_count_set_1)] = 1;
    }
    ....

if（local_id==0）global_val_set_1=-1；
屏障（CLK_本地_MEM_围栏）；
而（i


我希望这能解决问题；如果不能，我会再来更新。
唉，这一个周末的时间是再也回不来了。
试着将 filter_local_1
数组声明为 volatile。这就是 OpenCL 原子函数所期望的：__local volatile int filter_local_1[GROUP_SIZE];
不，仍然存在同样的问题。进一步的调查表明，使用这一行时：filter_local_1[get_local_id(0)] = &aW;
给出正常结果，而这一行：filter_local_1[atomic_inc(&local_count_set_1)] = &aW;
给出的则是错误的结果。从错误的结果来看，最高点选择器返回的似乎是最后一个最高点结果，而不是总体最高点……这可能是 atomic_inc 究竟产生了什么影响的线索，但我还是想不明白。