Cuda编程直方图_Cuda_Gpu - Fatal编程技术网

Cuda编程直方图

cuda

Cuda编程直方图,cuda,gpu,Cuda,Gpu,我想运行cuda程序，但我是一个初学者。我必须为直方图编写一个程序。但是用水桶。根据maxValue（示例中为40），该数字将添加到相应的存储桶中。如果我们有4个桶：历史：1 | 10 | 30 | 39 | 32 | 2 | 4 | 5 | 1| 0-9（第一个铲斗） 10-19（第二个铲斗） 20-29（第三个铲斗） 30-39（第四个铲斗）我的GPU具有计算能力1.1。我试着做一些事情，比如为一个块创建一个共享的temp[]，每个线程在他的temp表上添加他的值： __global

我想运行一个 CUDA 程序，但我是初学者。我需要编写一个计算直方图的程序，不过要按桶（bucket，即取值区间）来统计：根据 maxValue（示例中为 40），每个数字被累加到它所属的桶中。如果我们有 4 个桶：

历史：1 | 10 | 30 | 39 | 32 | 2 | 4 | 5 | 1|

0-9（第一个桶）

10-19（第二个桶）

20-29（第三个桶）

30-39（第四个桶）

我的GPU具有计算能力1.1。

我试着做一些事情，比如为一个块创建一个共享的temp[]，每个线程在他的temp表上添加他的值：

// Per-block histogram: every block accumulates its counts in shared memory
// and then merges them into the global histogram `histo`.
//
// Launch contract:
//   * blockDim.x must equal the number of buckets, because thread t clears
//     and merges bucket t.
//   * dynamic shared memory must be blockDim.x * sizeof(unsigned int) bytes.
//   * the kernel only adds to `histo`; it never clears it.
//
// Requires sm_12 or newer: atomicAdd on shared memory is rejected by the
// compiler for older targets.
// Bwidth (the width of one bucket) is defined outside this kernel.
__global__ void histo_kernel_optimized5( unsigned char *buffer, long size,
                               unsigned int *histo )
{
     extern __shared__ unsigned int temp[];
     temp[threadIdx.x] = 0;
     __syncthreads();

     // 64-bit index and stride so that large `size` values cannot overflow.
     long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
     const long long offset = (long long)blockDim.x * gridDim.x;
     while (i < size)
     {
              // Read from the `buffer` parameter (the original referenced an
              // undeclared `array`).
              const int bucketID = buffer[i] / Bwidth;
              // Values that fall outside the bucket range are skipped instead
              // of writing past the end of the shared array.
              if (bucketID >= 0 && bucketID < (int)blockDim.x)
                  atomicAdd( &temp[bucketID], 1u );
              i += offset;
     }
     // All shared-memory counts must be final before they are merged.
     __syncthreads();

     atomicAdd( &(histo[threadIdx.x]), temp[threadIdx.x] );
}

// One thread per bucket, one unsigned int of dynamic shared memory per bucket.
// The kernel loops with a grid-wide stride, so any grid of at least one block
// covers the whole input; ceil-division keeps the grid non-empty when
// array_size < buckets (array_size == 0 must still be handled by the caller).
histo_kernel_optimized5<<<(array_size + buckets - 1) / buckets, buckets, buckets * sizeof(unsigned int)>>>(buffer, SIZE, histogram);

\uuuuu全局\uuuuuu无效历史内核\u优化5（无符号字符*缓冲区，长大小，
无符号整数*histo）
{
外部共享无符号整数温度[]；
温度[threadIdx.x]=0；
__同步线程（）；
int i=threadIdx.x+blockIdx.x*blockDim.x；
int offset=blockDim.x*gridDim.x；
内巴基蒂；
而（i


但是编译器报错：
指令“{atom,red}.shared”需要 .target sm_12 或更高版本
我还尝试为创建的每个线程创建一个临时表：
// Histogram with a private count table per thread. No shared-memory atomics
// are used; only the final merge into `histo` uses atomicAdd on global memory.
//
// `buckets` must be a compile-time constant (it sizes a local array) and
// Bwidth (the width of one bucket) is defined outside this kernel.
// The kernel only adds to `histo`; it never clears it.
__global__ void histo_kernel_optimized5( unsigned char *buffer, long size,
                               unsigned int *histo )
{
    unsigned int temp[buckets];
    for (int j = 0; j < buckets; j++) {
        temp[j] = 0;
    }

    // 64-bit index and stride so that large `size` values cannot overflow.
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long offset = (long long)blockDim.x * gridDim.x;
    while (i < size)
    {
        // Read from the `buffer` parameter (the original referenced an
        // undeclared `array`).
        const int bucketID = buffer[i] / Bwidth;
        // Skip values outside the bucket range instead of indexing past temp.
        if (bucketID >= 0 && bucketID < buckets)
            temp[bucketID]++;
        i += offset;
    }

    // `histo` is written by every thread of the grid. A plain `+=` is an
    // unsynchronized read-modify-write that loses counts, so merge atomically.
    for (int j = 0; j < buckets; j++) {
        if (temp[j] != 0)
            atomicAdd( &histo[j], temp[j] );
    }
 }

\uuuuu全局\uuuuuu无效历史内核\u优化5（无符号字符*缓冲区，长大小，
无符号整数*histo）
{
无符号整数温度[桶]；
int j；
对于（j=0；j使用原子操作很容易实现直方图。我不知道您为什么要编写如此复杂的内核。并行化操作的动机是利用算法的并行性。不需要在内核中迭代整个直方图。下面是一个示例CUDA内核和包装函数，用于使用指定数量的存储单元计算数组的直方图。
我认为在计算能力 1.1 的设备上它已经无法进一步优化；但在计算能力 1.2 及以上的设备上，可以利用共享内存。
// Counts the bytes of `array` into `histo`, using bin = value % buckets.
//
// The loop advances by the total number of threads in the grid, so the result
// is correct for any 1D launch configuration with at least one block.
// The kernel only adds to `histo`; the caller must zero it first.
// Uses atomicAdd on global memory.
// NOTE(review): `%` assigns bins by remainder, not by value range
// (value / bucketWidth) — confirm this is the intended binning.
__global__ void kernel_getHist(unsigned char* array, long size, unsigned int* histo, int buckets)
{
    // 64-bit index and stride so that large `size` values cannot overflow.
    const long long stride = (long long)blockDim.x * gridDim.x;

    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < size; idx += stride)
    {
        const int bin = array[idx] % buckets;
        atomicAdd(&histo[bin], 1u);
    }
}

// Host wrapper: copies `size` bytes from host `array` to the device, runs
// kernel_getHist and copies the `buckets` counters back into host `histo`.
//
//   array   host pointer to `size` input bytes
//   size    number of input bytes; size <= 0 yields an all-zero histogram
//   histo   host pointer to `buckets` counters (output)
//   buckets number of bins; nothing is done when buckets <= 0
//
// The function returns void, so a CUDA failure cannot be reported: on the
// first failing call the remaining steps are skipped, device memory is still
// released and `histo` is left unmodified.
void getHist(unsigned char* array, long size, unsigned int* histo,int buckets)
{
    if (histo == 0 || buckets <= 0)
        return;

    // Empty input: the answer is known without touching the device, and a
    // launch with zero blocks would be an invalid configuration anyway.
    if (size <= 0)
    {
        for (int b = 0; b < buckets; ++b)
            histo[b] = 0;
        return;
    }

    const size_t histBytes = (size_t)buckets * sizeof(unsigned int);

    unsigned char* dArray = 0;
    unsigned int* dHist = 0;

    cudaError_t status = cudaMalloc(&dArray, (size_t)size);
    if (status == cudaSuccess)
        status = cudaMemcpy(dArray, array, (size_t)size, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMalloc(&dHist, histBytes);
    if (status == cudaSuccess)
        status = cudaMemset(dHist, 0, histBytes);

    if (status == cudaSuccess)
    {
        // Cap the grid at 65535 blocks (the x-dimension limit of compute 1.x
        // devices); the kernel's strided loop covers whatever is left over.
        const int threads = 256;
        const long long maxBlocks = 65535;
        long long blocks = ((long long)size + threads - 1) / threads;
        if (blocks > maxBlocks)
            blocks = maxBlocks;

        dim3 block(threads);
        dim3 grid((unsigned int)blocks);

        kernel_getHist<<<grid,block>>>(dArray,size,dHist,buckets);

        // A kernel launch returns nothing; configuration errors surface here.
        status = cudaGetLastError();
    }

    // The blocking copy also waits for the kernel to finish.
    if (status == cudaSuccess)
        status = cudaMemcpy(histo, dHist, histBytes, cudaMemcpyDeviceToHost);

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(dArray);
    cudaFree(dHist);
}

\uuuuu全局\uuuuu无效内核\u getHist（无符号字符*数组、长大小、无符号整数*历史、整数存储桶）
{
int tid=blockIdx.x*blockDim.x+threadIdx.x；
如果（tid>=大小）返回；
无符号字符值=数组[tid]；
int bin=值%bucket；
原子添加（和历史[bin]，1）；
}
void getHist（无符号字符*数组、长大小、无符号整数*历史、整数存储桶）
{
无符号字符*dArray；
Cudamaloc（和dArray，尺寸）；
cudaMemcpy（dArray、数组、大小、cudamemcpyhostodevice）；
无符号int*dHist；
cudaMalloc（&dHist，bucket*sizeof（int））；
cudaMemset（dHist，0，bucket*sizeof（int））；
dim3块（32）；
dim3网格（（尺寸+block.x-1）/block.x）；
kernel_getHist（dArray、size、dHist、bucket）；
cudaMemcpy（历史、dHist、桶*sizeof（int）、cudaMemcpyDeviceToHost）；
cudaFree（dArray）；
库达弗里（dHist）；
}
使用原子操作时，启动较少的块可以减少争用（从而提高性能），因为需要相互协调的块变少了。应启动较少的块，并让每个块循环处理更多的输入元素：
// Each thread starts at its global index and advances by the total number of
// threads in the grid, so a small grid still covers every input element.
// The index is 64-bit: a 32-bit unsigned index would wrap around before
// reaching a large `size`. The stride is computed once, outside the loop test.
for (long long tid = (long long)blockIdx.x * blockDim.x + threadIdx.x,
               stride = (long long)gridDim.x * blockDim.x;
              tid < size; tid += stride) {
    unsigned char value = array[tid]; // borrowing notation from another answer here
    int bin = value % buckets;
    atomicAdd(&histo[bin],1);
}

for（unsigned tid=blockIdx.x*blockDim.x+threadIdx.x；
tid
有一种针对不支持原子操作的设备的解决方案，它展示了一种通过按线程束（warp）细分片上内存来最小化片上内存冲突的方法，由 Podlozhnyuk 在《Histogram calculation in CUDA》中提出。
代码位于 CUDA Samples 的 3_Imaging\histogram 目录中。
为什么要把块大小加到 size 上：grid((size + block.x - 1)/block.x)？——这样线程总数至少等于 size。
该公式把线程总数向上取整为不小于 size 的、块大小的整数倍。
任选一个 size 的值，自己计算一下线程总数就能看出来。
是的，你是对的！但我没有得到相同的结果，并行直方图[] 和串行直方图[] 是不同的！我不知道为什么，我使用了你的代码！欢迎提供指向外部资源的链接，但请在链接周围补充上下文，这样其他用户就会知道它是什么以及为什么放在这里。请务必引用重要链接中最相关的部分，以防目标站点无法访问或永久下线。上下文：有一个针对不支持原子操作的设备的解决方案，它展示了一种通过按线程束（warp）细分来最小化片上内存冲突的方法。投反对票的不是我。事实上，我的评论得到了 2 个赞成票，所以我想它是有用的。