CUDA:将数组转换为纹理表示
我试图在 CUDA 中把一个数组表示为 tex2D 纹理……经过几个小时的调试,我注意到 100 万个元素中有 19 个被错误地复制到了纹理中——也就是说,对于这个二值数组,在本应为 1 的位置我读到了 0。(标签:cuda, gpgpu)相关代码如下:
// Host driver from the question: uploads h_in into a pitched device buffer,
// exposes that buffer as a 2D texture object, launches evolve_kernel to fill
// d_out, copies d_out back into h_out, prints every index at which h_in and
// h_out differ, and finally prints the time elapsed between the start and
// stop events.
//
// h_in  - host input image; read below as a flat array of 1002*1002 bytes.
// h_out - host output image; compared below as a flat array of 1002*1002 bytes.
void evolve_gpu( byte* h_in, byte* h_out)
{
//int SIZE = N * N * N * N * sizeof( float );
cudaEvent_t start, stop;
size_t d_in_pitch;
size_t d_out_pitch;
// Image edge length in elements; the image is len x len bytes.
int len = 1002;
checkCudaErrors( cudaEventCreate(&start) );
checkCudaErrors( cudaEventCreate(&stop) );
// Allocate the device input image array
unsigned char *d_in = NULL;
unsigned char *d_out = NULL;
// cudaMallocPitch reports the row pitch it chose through d_in_pitch /
// d_out_pitch; those pitches may be larger than the len-byte row width.
checkCudaErrors(cudaMallocPitch(&d_in, &d_in_pitch, sizeof(unsigned char)*len, len));
checkCudaErrors(cudaMallocPitch(&d_out, &d_out_pitch, sizeof(unsigned char)*len, len));
// Copy the host input image to the device memory
// (destination uses the device pitch, source uses the packed host row width)
checkCudaErrors(cudaMemcpy2D(d_in, d_in_pitch, h_in, sizeof(unsigned char)*len
, sizeof(unsigned char)*len, len, cudaMemcpyHostToDevice));
/**************************** TEXTURE CONFIGURATION ******************************/
// Describe d_in as a pitched 2D resource of len x len unsigned char texels.
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_in;
resDesc.res.pitch2D.pitchInBytes = d_in_pitch;
resDesc.res.pitch2D.width = len;
resDesc.res.pitch2D.height = len;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
// Raw element reads, unnormalized coordinates, border addressing on both axes.
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords=false;
texDesc.addressMode[0]=cudaAddressModeBorder;
texDesc.addressMode[1]=cudaAddressModeBorder;
cudaTextureObject_t tex;
// NOTE(review): unlike the other runtime calls here, this return value is not
// wrapped in checkCudaErrors, so a failed texture creation would go unnoticed.
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
/*********************************************************************************/
checkCudaErrors( cudaEventRecord(start, NULL) );
// Launch the CUDA Kernel
dim3 block = dim3(THREADS_X, THREADS_Y);
// Ceiling division so the grid covers all len elements along each axis.
dim3 grid = dim3((len+block.x-1)/block.x,(len+block.y-1)/block.y);//25*50
// NOTE(review): d_out is a pitched allocation but d_out_pitch is not passed to
// the kernel. The kernel source is not shown here - confirm how it addresses rows.
evolve_kernel<<<grid, block>>>( tex, d_out );
//******** kernel<<< number of blocks, number of threads, dynamic memory per block, associated stream >>> *******//
// Copy the device result to the host
// NOTE(review): the second argument of cudaMemcpy2D is the pitch of the
// destination (h_out). h_out is indexed below as a packed 1002*1002 array, so
// passing d_out_pitch here looks wrong; len*sizeof(unsigned char) is presumably
// what was intended - confirm against how the caller allocates h_out.
checkCudaErrors(cudaMemcpy2D(h_out, d_out_pitch,
d_out, d_out_pitch,
sizeof(unsigned char)*len, len,
cudaMemcpyDeviceToHost));
// Report every flat index where the round-tripped image differs from the input.
for(int i=0;i<1002*1002;i++){
if(h_in[i] != h_out[i])
printf("i = %d\n",i);
}
checkCudaErrors( cudaGetLastError() );
// NOTE(review): stop is recorded only after the copy back and the host
// comparison loop above, so the interval between start and stop spans those
// steps as well as the kernel.
checkCudaErrors( cudaEventRecord(stop, NULL) );
checkCudaErrors( cudaEventSynchronize(stop) );
checkCudaErrors( cudaFree(d_in) );
checkCudaErrors( cudaFree(d_out) );
float msec = 0.f;
checkCudaErrors( cudaEventElapsedTime(&msec, start, stop) );
printf("Basic version took: %f ms\n", msec);
}
void evolve\u gpu(字节*h\u输入,字节*h\u输出)
{
//int SIZE=N*N*N*N*sizeof(浮点);
cudaEvent\u t启动、停止;
以音高表示的尺寸;
大小、大小、间距;
int len=1002;
检查CUDAERRORS(cudaEventCreate(&start));
检查CUDAERRORS(cudaEventCreate(&stop));
//分配设备输入图像数组
无符号字符*d_in=NULL;
无符号字符*d_out=NULL;
检查CUDAERRORS(cudaMallocPitch(&d_-in,&d_-in_-pitch,sizeof(未签名字符)*len,len));
检查CUDAERRORS(CUDAMALLOCITCH(&d_out,&d_out_pitch,sizeof(unsigned char)*len,len));
//将主机输入映像复制到设备内存
检查CUDAERRORS(cudaMemcpy2D(d_in,d_in_pitch,h_in,sizeof(未签名字符)*len
,sizeof(未签名字符)*len,len,cudaMemcpyHostToDevice);
/****************************纹理配置******************************/
cudaResourceDesc resDesc;
memset(&resDesc,0,sizeof(resDesc));
resDesc.resType=cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr=d_in;
resDesc.res.pitch2D.pitchInBytes=d_英寸间距;
resDesc.res.pitch2D.width=len;
resDesc.res.pitch2D.height=len;
resDesc.res.pitch2D.desc=cudaCreateChannelDesc();
CudatextureDisc texDesc;
memset(&texDesc,0,sizeof(texDesc));
texDesc.readMode=cudaReadModeElementType;
texDesc.normalizedCoords=false;
texDesc.addressMode[0]=cudaAddressModeBorder;
texDesc.addressMode[1]=cudaAddressModeBorder;
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex,&resDesc,&texDesc,NULL);
/*********************************************************************************/
检查CUDAERRORS(cudaEventRecord(start,NULL));
//启动CUDA内核
dim3块=dim3(螺纹X,螺纹Y);
dim3网格=dim3((len+block.x-1)/block.x,(len+block.y-1)/block.y);//25*50
进化内核(tex,d_out);
//********内核>*******//
//将设备结果复制到主机
检查CUDAERRORS(cudaMemcpy2D(h_out,d_out_pitch,
出去,出去投球,
sizeof(无符号字符)*len,len,
cudaMemcpyDeviceToHost);
for(int i=0;i<1002*1002;i++){ ……(此处原文被截断,完整代码见上文)
我在代码中看到的一个问题是设备到主机(device->host)的拷贝:
checkCudaErrors(cudaMemcpy2D(h_out, d_out_pitch,
d_out, d_out_pitch,
sizeof(unsigned char)*len, len,
cudaMemcpyDeviceToHost));
根据文档,这次 cudaMemcpy2D 调用的第二个参数是目标分配的 pitch(行间距,以字节为单位),在本例中也就是 h_out 的 pitch。但 h_out 不太可能是一个 pitch 分配;即使它恰好是,它的 pitch 也不太可能正好等于 d_out_pitch。
虽然您没有给出完整的代码,但假设 h_out 和 h_in 是同类的分配,那么第二个参数应改为 h_out 数组的(未加 pitch 的)行宽:
checkCudaErrors(cudaMemcpy2D(h_out, len*sizeof(unsigned char),
d_out, d_out_pitch,
sizeof(unsigned char)*len, len,
cudaMemcpyDeviceToHost));
我还很好奇:d_out 是一个 pitch 分配,而您并没有把 d_out 的 pitch 传给内核,那么内核怎么可能在 d_out 上正确运行:
evolve_kernel<<<grid, block>>>( tex, d_out );
evolve_内核(tex,d_out);
我本来期望看到这样的调用:
evolve_kernel<<<grid, block>>>( tex, d_out, d_out_pitch);
evolve_内核(tex,d_out,d_out_pitch);
但是您还没有显示您的内核代码
下面是我围绕您展示的代码创建的一个完整的示例,修复了上述问题,并进行了一些其他更改以构建一个示例:
$ cat t648.cu
#include <stdio.h>
#include <helper_cuda.h>
#define THREADS_X 16
#define THREADS_Y 16
const int len = 1002;
typedef unsigned char byte;
// Copies the texture into a pitched byte buffer, one texel per thread.
//
// Launch layout: a 2D grid of 2D blocks. Each thread derives one (col, row)
// coordinate from its block and thread indices; threads whose coordinate lies
// outside the len x len image do nothing.
//
// tex   - texture object sampled with integer (col, row) coordinates.
// d_out - destination buffer; row r starts at d_out + r * pitch.
// pitch - distance between consecutive rows of d_out. d_out is a byte
//         pointer, so this is a count of bytes.
__global__ void evolve_kernel(cudaTextureObject_t tex, unsigned char *d_out, size_t pitch ){
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so trailing threads fall outside.
    if ((col >= len) || (row >= len))
        return;

    unsigned char *out_row = d_out + row * pitch;
    out_row[col] = tex2D<unsigned char>(tex, col, row);
}
// Round-trips a len x len byte image through a 2D texture object.
//
// Steps: upload h_in into a pitched device buffer, bind that buffer as a
// texture object, launch evolve_kernel to copy the texture into a second
// pitched buffer, download that buffer into h_out, print every flat index at
// which h_in and h_out differ, then print the kernel time in milliseconds.
//
// h_in  - host input, packed (row width == len bytes), len*len bytes.
// h_out - host output, packed, len*len bytes; overwritten.
//
// Timing: the start/stop events bracket only the kernel launch, so the
// reported figure excludes the copy back and the host-side comparison.
// Every CUDA runtime call is routed through checkCudaErrors.
void evolve_gpu( byte* h_in, byte* h_out)
{
    // Width of one packed host row in bytes; also the requested device row width.
    const size_t row_bytes = sizeof(unsigned char) * len;

    cudaEvent_t start, stop;
    checkCudaErrors( cudaEventCreate(&start) );
    checkCudaErrors( cudaEventCreate(&stop) );

    // Pitched device buffers: the runtime may pad each row, and reports the
    // actual row stride through the pitch out-parameters.
    unsigned char *d_in = NULL;
    unsigned char *d_out = NULL;
    size_t d_in_pitch = 0;
    size_t d_out_pitch = 0;
    checkCudaErrors( cudaMallocPitch(&d_in, &d_in_pitch, row_bytes, len) );
    checkCudaErrors( cudaMallocPitch(&d_out, &d_out_pitch, row_bytes, len) );

    // Host -> device: destination is pitched, source is packed.
    checkCudaErrors( cudaMemcpy2D(d_in, d_in_pitch,
                                  h_in, row_bytes,
                                  row_bytes, len,
                                  cudaMemcpyHostToDevice) );

    /**************************** TEXTURE CONFIGURATION ******************************/
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = d_in;
    resDesc.res.pitch2D.pitchInBytes = d_in_pitch;
    resDesc.res.pitch2D.width = len;
    resDesc.res.pitch2D.height = len;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = false;
    texDesc.addressMode[0] = cudaAddressModeBorder;
    texDesc.addressMode[1] = cudaAddressModeBorder;

    cudaTextureObject_t tex = 0;
    checkCudaErrors( cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL) );
    /*********************************************************************************/

    // Ceiling division so the grid covers the whole image; the kernel guards
    // the threads that fall past the edge.
    dim3 block = dim3(THREADS_X, THREADS_Y);
    dim3 grid = dim3((len + block.x - 1) / block.x, (len + block.y - 1) / block.y);

    checkCudaErrors( cudaEventRecord(start, NULL) );
    evolve_kernel<<<grid, block>>>( tex, d_out, d_out_pitch );
    // Launch-configuration errors are only visible through cudaGetLastError,
    // so query it right after the launch rather than after later API calls.
    checkCudaErrors( cudaGetLastError() );
    checkCudaErrors( cudaEventRecord(stop, NULL) );

    // Device -> host: destination is packed, source is pitched.
    checkCudaErrors( cudaMemcpy2D(h_out, row_bytes,
                                  d_out, d_out_pitch,
                                  row_bytes, len,
                                  cudaMemcpyDeviceToHost) );

    checkCudaErrors( cudaEventSynchronize(stop) );
    float msec = 0.f;
    checkCudaErrors( cudaEventElapsedTime(&msec, start, stop) );

    // Verification: the output is expected to be an exact copy of the input.
    for (int i = 0; i < len * len; i++) {
        if (h_in[i] != h_out[i])
            printf("i = %d\n", i);
    }

    printf("Basic version took: %f ms\n", msec);

    // Release everything this function created.
    checkCudaErrors( cudaDestroyTextureObject(tex) );
    checkCudaErrors( cudaFree(d_in) );
    checkCudaErrors( cudaFree(d_out) );
    checkCudaErrors( cudaEventDestroy(start) );
    checkCudaErrors( cudaEventDestroy(stop) );
}
// Test driver: fills a len x len input image with the value 3, zeroes the
// output image, and runs evolve_gpu on the pair.
// Returns 0 on success, 1 if the host buffers cannot be allocated.
int main(){
    const size_t count = (size_t)len * len;

    byte *h_data_in  = (byte *)malloc(count * sizeof(byte));
    byte *h_data_out = (byte *)malloc(count * sizeof(byte));
    if (h_data_in == NULL || h_data_out == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes of host memory\n", count * sizeof(byte));
        free(h_data_in);    // free(NULL) is a no-op
        free(h_data_out);
        return 1;
    }

    // Input and output start out different everywhere, so any element that is
    // not copied shows up as a mismatch in evolve_gpu's comparison.
    for (size_t i = 0; i < count; i++) {
        h_data_in[i] = 3;
        h_data_out[i] = 0;
    }

    evolve_gpu(h_data_in, h_data_out);

    free(h_data_in);
    free(h_data_out);
    return 0;
}
$ nvcc -arch=sm_35 -I/usr/local/cuda/samples/common/inc t648.cu -o t648
$ ./t648
Basic version took: 3.868576 ms
$
$cat t648.cu
#包括
#包括
#定义线程×16
#定义线程_y16
常数int len=1002;
typedef无符号字符字节;
__全局无效演化内核(cudaTextureObject tex、无符号字符*d\u out、大小\u t间距){
int idx=threadIdx.x+blockDim.x*blockIdx.x;
int-idy=threadIdx.y+blockDim.y*blockIdx.y;
如果((idx*******//
//将设备结果复制到主机
检查CUDAERRORS(cudaMemcpy2D(h_out,len*sizeof(unsigned char)),
出去,出去投球,
sizeof(无符号字符)*len,len,
cudaMemcpyDeviceToHost);
对于(int i=0;iSO,对于ques)