C++ 谁能从mark的文档中给我这个代码的主程序?
我这里有mark harris记录的代码(并行缩减)。我是cuda编程新手,我不知道如何为这段代码编写主程序。请帮帮我,谢谢 代码如下:C++ 谁能从mark的文档中给我这个代码的主程序?,c++,c,cuda,parallel-processing,gpu,C++,C,Cuda,Parallel Processing,Gpu,我这里有mark harris记录的代码(并行缩减)。我是cuda编程新手,我不知道如何为这段代码编写主程序。请帮帮我,谢谢 代码如下: template <unsigned int blockSize> __global__ voidreduce6(int *g_idata, int *g_odata, unsigned int n) { extern __shared__ int sdata[]; unsigned int tid = threadIdx.x; unsigned
/**
 * Sum-reduces the n integers in g_idata to one partial sum per thread block.
 *
 * Each block accumulates the elements it visits into shared memory, collapses
 * them with a tree reduction, and thread 0 writes the block's partial sum to
 * g_odata[blockIdx.x]. The caller has to combine the gridDim.x partial sums
 * (on the host or with another launch) to obtain the final total.
 *
 * Launch requirements:
 *   - 1D grid and 1D block, with blockDim.x == blockSize.
 *   - blockSize must be a power of two (the halving tree below drops
 *     elements otherwise).
 *   - Dynamic shared memory of at least blockSize * sizeof(int) bytes.
 *
 * @tparam blockSize  Number of threads per block, known at compile time.
 * @param  g_idata    Input array of n ints, readable from the device.
 * @param  g_odata    Output array with at least gridDim.x ints.
 * @param  n          Number of valid elements in g_idata; may be 0 and does
 *                    not have to be a multiple of the block or grid size.
 */
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];

    const unsigned int tid = threadIdx.x;

    // Index arithmetic is done in size_t so that neither the starting offset
    // nor "i += gridSize" can wrap around for large n.
    size_t i = (size_t)blockIdx.x * (blockSize * 2) + tid;
    const size_t gridSize = (size_t)blockSize * 2 * gridDim.x;

    // Accumulate in a register: every pass adds two elements that are
    // blockSize apart, then jumps ahead by the span covered by the grid.
    // Both loads are bounds-checked, so no element past n is ever read.
    int sum = 0;
    while (i < n) {
        sum += g_idata[i];
        if (i + blockSize < n) {
            sum += g_idata[i + blockSize];
        }
        i += gridSize;
    }
    sdata[tid] = sum;
    __syncthreads();

    // Tree reduction in shared memory. blockSize is a template parameter, so
    // the trip count is a compile-time constant. Every step ends in a
    // block-wide barrier that all threads reach (it sits outside the tid
    // test); the code does not rely on threads of a warp running in lockstep,
    // and it never touches shared memory beyond sdata[blockSize - 1].
    for (unsigned int stride = blockSize / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (tid == 0) {
        g_odata[blockIdx.x] = sdata[0];
    }
}
模板
__全局无效约简6(int*g\u idata,int*g\u odata,无符号int n)
{
外部共享数据数据[];
unsigned int tid=threadIdx.x;
无符号整数i=blockIdx.x*(blockSize*2)+tid;
unsigned int gridSize=blockSize*2*gridDim.x;
sdata[tid]=0;
do{sdata[tid]+=g_-idata[i]+g_-idata[i+blockSize];i+=gridSize;}而(i=512){if(tid=256){if(tid=128){if(tid<64){sdata[tid]+=sdata[tid+64];}
如果(tid<32){
如果(块大小>=64)sdata[tid]+=sdata[tid+32];
如果(块大小>=32)sdata[tid]+=sdata[tid+16];
如果(块大小>=16)sdata[tid]+=sdata[tid+8];
如果(块大小>=8)sdata[tid]+=sdata[tid+4];
如果(块大小>=4)sdata[tid]+=sdata[tid+2];
如果(块大小>=2)sdata[tid]+=sdata[tid+1];
}
如果(tid==0)g_odata[blockIdx.x]=sdata[0];
}
CUDA 示例(CUDA Samples)的并行归约示例中给出了使用此内核的完整示例代码。
请参阅文件 reduce_kernel.cu。内核启动的包装函数会根据所选的归约方法来选择内核,并且这些内核还以线程块(threadblock)大小作为模板参数:
case 6:
default:
if (isPow2(size))
{
switch (threads)
{
case 512:
reduce6<T, 512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce6<T, 256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce6<T, 128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce6<T, 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce6<T, 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce6<T, 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce6<T, 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce6<T, 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce6<T, 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce6<T, 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
else
{
switch (threads)
{
case 512:
reduce6<T, 512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce6<T, 256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce6<T, 128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce6<T, 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce6<T, 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce6<T, 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce6<T, 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce6<T, 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce6<T, 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce6<T, 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
break;
}
}
// Instantiate the reduction function for 3 types
// These are explicit instantiations of the `reduce` function template for
// int, float and double elements. Each instantiation takes the same
// arguments: size, threads, blocks, whichKernel, and the d_idata / d_odata
// pointers of the corresponding element type.
// NOTE(review): the `reduce` template itself is defined outside this
// excerpt — confirm its parameter list matches these instantiations.
template void
reduce<int>(int size, int threads, int blocks,
int whichKernel, int *d_idata, int *d_odata);
template void
reduce<float>(int size, int threads, int blocks,
int whichKernel, float *d_idata, float *d_odata);
template void
reduce<double>(int size, int threads, int blocks,
int whichKernel, double *d_idata, double *d_odata);
案例6:
违约:
如果(isPow2(尺寸))
{
开关(线程)
{
案例512:
缩小6>(d_idata,d_odata,大小);
打破
案例256:
缩小6>(d_idata,d_odata,大小);
打破
案例128:
缩小6>(d_idata,d_odata,大小);
打破
案例64:
缩小6>(d_idata,d_odata,大小);
打破
案例32:
缩小6>(d_idata,d_odata,大小);
打破
案例16:
缩小6>(d_idata,d_odata,大小);
打破
案例8:
缩小6>(d_idata,d_odata,大小);
打破
案例4:
缩小6>(d_idata,d_odata,大小);
打破
案例2:
缩小6>(d_idata,d_odata,大小);
打破
案例1:
缩小6>(d_idata,d_odata,大小);
打破
}
}
其他的
{
开关(线程)
{
案例512:
缩小6>(d_idata,d_odata,大小);
打破
案例256:
缩小6>(d_idata,d_odata,大小);
打破
案例128:
缩小6>(d_idata,d_odata,大小);
打破
案例64:
缩小6>(d_idata,d_odata,大小);
打破
案例32:
缩小6>(d_idata,d_odata,大小);
打破
案例16:
缩小6>(d_idata,d_odata,大小);
打破
案例8:
缩小6>(d_idata,d_odata,大小);
打破
案例4:
缩小6>(d_idata,d_odata,大小);
打破
案例2:
缩小6>(d_idata,d_odata,大小);
打破
案例1:
缩小6>(d_idata,d_odata,大小);
打破
}
}
打破
}
}
//实例化3种类型的缩减函数
模板空隙
减少(整数大小、整数线程、整数块、,
int whichKernel,int*d_-idata,int*d_-odata);
模板空隙
减少(整数大小、整数线程、整数块、,
int,其中内核、浮点*d_-idata、浮点*d_-odata);
模板空隙
减少(整数大小、整数线程、整数块、,
int,其中kernel,double*d_-idata,double*d_-odata);
您看过 CUDA 工具包中附带的那些很好的 CUDA 示例了吗?这是什么代码?这是 main 函数吗?@RobertCrovella 不,这不是 main 函数。它是调用“Mark 的代码”的那部分代码(即内核启动的包装函数)。CUDA 示例中的代码太多,我无法在这里全部贴出,但您可以自己轻松查看。我已经给了您它的链接。