Cumulative array summation using OpenCL


I am calculating the Euclidean distance between n-dimensional points using OpenCL. I receive two lists of n-dimensional points and I have to return an array that contains just the distance from every point in the first table to every point in the second table.

My approach is to do the regular double loop (for every point in Table1 { for every point in Table2 { … } }) and then do the calculation for each pair of points in parallel.

The Euclidean distance is then split into 3 parts:
1. take the difference between corresponding dimensions of the two points
2. square that difference (still per dimension)
3. add up all the values obtained in 2
4. take the square root of the value obtained in 3 (this step is omitted in this example)
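For reference, this is a minimal sequential sketch of the same computation in plain C (an illustration only, not code from the post; it assumes each list is stored as a flat array of num_elements * length floats):

#include <stddef.h>

/* Sequential reference: squared Euclidean distance (steps 1-3, no square root)
 * from every point of list A to every point of list B.
 * A holds num_a points and B holds num_b points of `length` floats each;
 * C receives num_a * num_b results, laid out row-major over (a, b). */
static void compare_descriptors_ref(float *C,
                                    const float *A, size_t num_a,
                                    const float *B, size_t num_b,
                                    size_t length)
{
    for (size_t a = 0; a < num_a; ++a) {           /* every point in Table1 */
        for (size_t b = 0; b < num_b; ++b) {       /* every point in Table2 */
            float acc = 0.0f;
            for (size_t d = 0; d < length; ++d) {  /* steps 1 and 2, per dimension */
                float dif = A[a * length + d] - B[b * length + d];
                acc += dif * dif;                  /* step 3: accumulate */
            }
            C[a * num_b + b] = acc;                /* step 4 (sqrt) omitted */
        }
    }
}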

Everything works like a charm until I try to accumulate the sum of all the differences (i.e., step 3 of the procedure described above — the accumulation into C[loop] in the code below, line 49 of my original listing).

As test data I am using two DescriptorLists with 2 points of 128 dimensions each:

DescriptorList 1: p1 = (001, 002, 003, …, 127, 128); p2 = (129, 130, 131, …, 255, 256)

DescriptorList 2: p1 = (000, 001, 002, …, 126, 127); p2 = (128, 129, 130, …, 254, 255)

So the resulting vector should contain the values 128, 2064512, 2130048, 128 (the per-dimension differences are 1, -127, 129 and 1, so the sums are 128·1², 128·127², 128·129² and 128·1²). Right now I am getting random numbers that vary with every run.
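The expected values can be double-checked with a small self-contained C program (again just an illustration, not part of the original post):

#include <stdio.h>

#define LENGTH 128   /* dimensions per point */
#define POINTS 2     /* points per descriptor list */

int main(void)
{
    float A[POINTS][LENGTH], B[POINTS][LENGTH];

    /* DescriptorList 1: p1 = 1..128,  p2 = 129..256
     * DescriptorList 2: p1 = 0..127,  p2 = 128..255 */
    for (int d = 0; d < LENGTH; ++d) {
        A[0][d] = (float)(d + 1);   A[1][d] = (float)(d + 129);
        B[0][d] = (float)(d);       B[1][d] = (float)(d + 128);
    }

    /* squared distance from every point of list 1 to every point of list 2 */
    for (int a = 0; a < POINTS; ++a) {
        for (int b = 0; b < POINTS; ++b) {
            float acc = 0.0f;
            for (int d = 0; d < LENGTH; ++d) {
                float dif = A[a][d] - B[b][d];
                acc += dif * dif;
            }
            printf("%.0f\n", acc);   /* prints 128, 2064512, 2130048, 128 */
        }
    }
    return 0;
}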

I would appreciate any help or clue about what I am doing wrong. Hopefully the scenario I am working in is clear enough.

#define BLOCK_SIZE 128

typedef struct
{
    //How large each point is
    int length;
    //How many points in every list
    int num_elements;
    //Pointer to the elements of the descriptor (stored as a raw array)
    __global float *elements;
} DescriptorList;

__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float As[BLOCK_SIZE])
{

    int gpidA = get_global_id(0);

    int featA = get_local_id(0);

    //temporary array  to store the difference between each dimension of 2 points
    float dif_acum[BLOCK_SIZE];

    //counter to track the iterations of the inner loop
    int loop = 0;

    //loop over all descriptors in A
    for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){

        //take the i-th descriptor. Returns a DescriptorList with just the i-th
        //descriptor in DescriptorList A
        DescriptorList tmpA = GetDescriptor(A, i);

        //copy the current descriptor to local memory.
        //returns one element of the only descriptor in DescriptorList tmpA
        //and index featA
        As[featA] = GetElement(tmpA, 0, featA);
        //wait for all the threads to finish copying before continuing
        barrier(CLK_LOCAL_MEM_FENCE);

        //loop over all the descriptors in B
        for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
            //take the difference of both current points
            dif_acum[featA] = As[featA]-B.elements[k*BLOCK_SIZE + featA];
            //wait again
            barrier(CLK_LOCAL_MEM_FENCE);
            //square value of the difference in dif_acum and store in C
            //which is where the results should be stored at the end.
            C[loop] = 0;
            C[loop] += dif_acum[featA]*dif_acum[featA];
            loop += 1;
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }
}
Your problem lies in these lines of code:

C[loop] = 0;
C[loop] += dif_acum[featA]*dif_acum[featA];
All threads in your work group (actually all of your threads, but let's get to that later) are trying to modify this array location concurrently, without any synchronization whatsoever. Several factors make this a real problem:

  • Work groups are not guaranteed to work completely in parallel, meaning that for some threads C[loop] = 0 can be called after other threads have already executed the next line.

  • The threads that do execute in parallel all read the same value from C[loop], modify it with their increment and try to write back to the same address. I am not completely sure what the result of that write-back is (I think one of the threads succeeds in writing back while the others fail, but I am not entirely sure), but either way it is wrong.

Now let's fix this: while we could implement this using atomics on global memory, it would not be fast, so let's accumulate in local memory instead:

    local float* accum;
    ...
    accum[featA] = dif_acum[featA]*dif_acum[featA];
    barrier(CLK_LOCAL_MEM_FENCE);
    for(unsigned int i = 1; i < BLOCK_SIZE; i *= 2)
    {
        if ((featA % (2*i)) == 0)
            accum[featA] += accum[featA + i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(featA == 0)
        C[loop] = accum[0];
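For contrast, the "atomics in global memory" route mentioned above would need an emulated floating-point atomic add, since standard OpenCL has no built-in atomic add for float. A rough sketch of such a helper (my illustration, not part of the answer):

    // Hypothetical helper: emulate atomic float addition with a compare-and-swap
    // loop on the value's bit pattern. atomic_cmpxchg on 32-bit integers is a
    // core built-in from OpenCL 1.1 onwards.
    inline void atomic_add_float(volatile __global float *addr, float val)
    {
        uint old_bits, new_bits;
        do {
            old_bits = as_uint(*addr);
            new_bits = as_uint(as_float(old_bits) + val);
        } while (atomic_cmpxchg((volatile __global uint *)addr,
                                old_bits, new_bits) != old_bits);
    }

    // Each work-item could then do (assuming C[loop] is zeroed beforehand):
    //     atomic_add_float(&C[loop], dif_acum[featA] * dif_acum[featA]);
    // That is correct, but it serializes every work-item on one address, which is
    // why the local-memory reduction above is preferable.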
    
This would make your kernel look like this (it does not take the first two items into account):

    __kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE])
    {
       int gpidA = get_global_id(0);
       int featA = get_local_id(0);
       int loop = 0;
       for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
           DescriptorList tmpA = GetDescriptor(A, i);
           float As = GetElement(tmpA, 0, featA);
           for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
               float dif_acum = As - B.elements[k*BLOCK_SIZE + featA];

               //square the per-dimension difference and reduce it in local memory
               accum[featA] = dif_acum * dif_acum;
               barrier(CLK_LOCAL_MEM_FENCE);
               for (unsigned int j = 1; j < BLOCK_SIZE; j *= 2)
               {
                   if ((featA % (2*j)) == 0)
                       accum[featA] += accum[featA + j];
                   barrier(CLK_LOCAL_MEM_FENCE);
               }
               //only one thread writes the accumulated sum for this pair of points
               if (featA == 0)
                   C[loop] = accum[0];
               barrier(CLK_LOCAL_MEM_FENCE);

               loop += 1;
           }
       }
    }
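One practical note (my addition, not part of the answer): this kernel relies on the work-group size being exactly BLOCK_SIZE, and the __local buffer must be passed from the host by size only. A hedged host-side sketch, assuming `kernel` and `queue` already exist and BLOCK_SIZE is also defined on the host:

    /* Hypothetical launch code (illustration only, error checking omitted).
     * The __local argument is set with a size and a NULL pointer, and the
     * local work size must equal BLOCK_SIZE so that featA covers 0..127.
     * Because the kernel loops over all descriptors itself, a single
     * work-group already produces every entry of C. */
    size_t local_size  = BLOCK_SIZE;   /* 128 work-items per group */
    size_t global_size = BLOCK_SIZE;   /* one work-group */

    clSetKernelArg(kernel, 4, BLOCK_SIZE * sizeof(cl_float), NULL);  /* __local accum */
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                           &global_size, &local_size, 0, NULL, NULL);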
Thanks to Grizzly, I now have a working kernel. Based on Grizzly's answer I needed to modify a few things:

I added an IF statement at the beginning of the routine to discard all threads that are not going to reference any valid position in the arrays I am using (first snippet below).

When copying the current descriptor to local (shared) memory (i.e. to Bs), the index has to be specified, because the function GetElement returns just one element per call (I skipped that in my question; second snippet).

The scan loop then needed some tweaking, since the buffer is overwritten after every iteration and there is no control over which thread accesses which element first; that is why a second buffer (dif_acum) keeps a copy of accum during the scan (third snippet).

The scan loop also gets a boundary check so that only valid pairs of elements are added together (fourth snippet).

Finally, the loop counter is incremented inside the last IF block, so that only one thread updates it (last snippet).

The snippets follow in that order, and after them the complete working kernel:
    
    if(featA > BLOCK_SIZE){return;}
    
    Bs[featA] = GetElement(tmpA, 0, featA);
    
    dif_acum[featA] = accum[featA];
    
    if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){
    
    if(featA == 0){
        C[loop] = accum[BLOCK_SIZE-1];
        loop += 1;
    }
    
    __kernel void CompareDescriptors(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE], __local float Bs[BLOCK_SIZE])
    {
    
        int gpidA = get_global_id(0);
        int featA = get_local_id(0);
    
        //global counter to store final differences
        int loop = 0;
    
        //auxiliary buffer to store temporary data
        local float dif_acum[BLOCK_SIZE];
    
        //discard the threads that are not going to be used.
        if(featA > BLOCK_SIZE){
            return;
        }
    
        //loop over all descriptors in A
        for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
    
            //take the gpidA-th descriptor
            DescriptorList tmpA = GetDescriptor(A, i);
    
            //copy the current descriptor to local memory
            Bs[featA] = GetElement(tmpA, 0, featA);
    
            //loop over all the descriptors in B
            for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
                //take the difference of both current descriptors
                dif_acum[featA] = Bs[featA]-B.elements[k*BLOCK_SIZE + featA];
    
                //square the values in dif_acum
                accum[featA] = dif_acum[featA]*dif_acum[featA];
                barrier(CLK_LOCAL_MEM_FENCE);
    
                //copy the values of accum to keep consistency once the scan procedure starts.
                //Mostly important for the first element. Two buffers are necessary because the
                //scan procedure would overwrite values that still need to be read if only one
                //buffer were used.
                dif_acum[featA] = accum[featA];
    
                //Compute the accumulated sum (a.k.a. scan)
                for(int j = 1; j < BLOCK_SIZE; j *= 2){
                    int next_addend = featA-(j/2);
                    if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){
                        dif_acum[featA] = accum[featA] + accum[next_addend];
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
    
                    //copy As to accum
                    accum[featA] = GetElementArray(dif_acum, BLOCK_SIZE, featA); 
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
    
                //tell one of the threads to write the result of the scan in the array containing the results.
                if(featA == 0){
                    C[loop] = accum[BLOCK_SIZE-1];
                    loop += 1;
                }
                barrier(CLK_LOCAL_MEM_FENCE);
    
            }
        }
    }
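To tie the result back to the test data from the question, the host can read the output buffer back and compare it against the reference values (a sketch only; `queue` and `c_buffer` stand for the command queue and the cl_mem object bound to C, and <stdio.h> is assumed to be included):

    /* Hypothetical verification: 2 descriptors in A x 2 descriptors in B
     * give 4 results, expected to match the sequential reference values. */
    float results[4];
    const float expected[4] = { 128.0f, 2064512.0f, 2130048.0f, 128.0f };

    clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, sizeof(results), results,
                        0, NULL, NULL);

    for (int i = 0; i < 4; ++i) {
        if (results[i] != expected[i])
            printf("mismatch at %d: got %f, expected %f\n", i, results[i], expected[i]);
    }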