Vector 流向量加法
我已经看到了这个问题,但这不是我的代码的问题。虽然我得到了相同的错误,但根本原因是不同的。当我编译时,我得到以下错误 解决方案不正确。解决方案与第0行的预期结果不匹配。预期为(1+0.5=1.5),但得到0。Vector 流向量加法,vector,cuda,parallel-processing,Vector,Cuda,Parallel Processing,我已经看到了这个问题,但这不是我的代码的问题。虽然我得到了相同的错误,但根本原因是不同的。当我编译时,我得到以下错误 解决方案不正确。解决方案与第0行的预期结果不匹配。预期为(1+0.5=1.5),但得到0。 我试图打印内核中的值,发现计算是正确的。但当我从设备复制到主机时,我看到所有的零都被打印出来 #include<wb.h> #define wbCheck(stmt) do {
我试图打印内核中的值,发现计算是正确的。但当我从设备复制到主机时,我看到所有的零都被打印出来
#include<wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while(0)
#define NUM_STREAMS 2
__global__ void vecAdd(float * in1, float * in2, float * out, int len) {
//@@ Insert code to implement vector addition here
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i< len)
{
out[i]= in1[i]+in2[i];
printf("Thread %d %f %f out %f\n",i,in1[i],in2[i],out[i]);
}
}
int main(int argc, char ** argv) {
wbArg_t args;
int inputLength;
float * hostInput1;
float * hostInput2;
float * hostOutput;
float * deviceInput1;
float * deviceInput2;
float * deviceOutput;
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
hostOutput = (float *) malloc(inputLength * sizeof(float));
wbTime_stop(Generic, "Importing data and creating memory on host");
float *h_A, *h_B, *h_C;
float *d_A0, *d_B0, *d_C0; //Device memory for stream0
float *d_A1, *d_B1, *d_C1; //Device memory for stream1
cudaHostAlloc((void**)&h_A, inputLength*sizeof(float), cudaHostAllocDefault);
cudaHostAlloc((void**)&h_B, inputLength*sizeof(float), cudaHostAllocDefault);
cudaHostAlloc((void**)&h_C, inputLength*sizeof(float), cudaHostAllocDefault);
memcpy(h_A, hostInput1,inputLength*sizeof(float));
memcpy(h_B, hostInput2,inputLength*sizeof(float));
printf("%f %f\n", h_A[0],hostInput1[0]);
printf("%f %f \n",h_A[1],hostInput1[1]);
printf("Input length is %d\n", inputLength);
int nstreams = NUM_STREAMS;
cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
for(int i = 0; i < nstreams; i++)
cudaStreamCreate(&(streams[i]));
long segSize = 1024;
wbCheck(cudaMalloc((void **)&d_A0, segSize*sizeof(float)));
wbCheck(cudaMalloc((void **)&d_A1, segSize*sizeof(float)));
wbCheck(cudaMalloc((void **)&d_B0, segSize*sizeof(float)));
wbCheck(cudaMalloc((void **)&d_B1, segSize*sizeof(float)));
wbCheck(cudaMalloc((void **)&d_C0, segSize*sizeof(float)));
wbCheck(cudaMalloc((void **)&d_C1, segSize*sizeof(float)));
for(int i=0; i< inputLength; i+=segSize*2)
{
if(i+segSize <= inputLength)
{
cudaMemcpyAsync(d_A0,h_A+i,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[0]);
cudaMemcpyAsync(d_B0,h_B+i,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[0]);
if(i+2*segSize <= inputLength )
{
cudaMemcpyAsync(d_A1,h_A+i+segSize,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[1]);
cudaMemcpyAsync(d_B1,h_B+i+segSize,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[1]);
}
else
{
cudaMemcpyAsync(d_A1,h_A+i+segSize,(inputLength-i-segSize)*sizeof(float),cudaMemcpyHostToDevice,streams[1]);
cudaMemcpyAsync(d_B1,h_B+i+segSize,(inputLength-i-segSize)*sizeof(float),cudaMemcpyHostToDevice,streams[1]);
}
}
else
{
cudaMemcpyAsync(d_A0,h_A+i,(inputLength-i)*sizeof(float),cudaMemcpyHostToDevice,streams[0]);
cudaMemcpyAsync(d_B0,h_B+i,(inputLength-i)*sizeof(float),cudaMemcpyHostToDevice,streams[0]);
}
if(i+segSize <= inputLength)
{
vecAdd<<<segSize/256, 256, 1, streams[0]>>>(d_A0,d_B0,d_C0, segSize);
if(i+2*segSize <= inputLength )
{
vecAdd<<<segSize/256, 256, 1, streams[1]>>>(d_A1,d_B1,d_C1, segSize);
}
else
{
vecAdd<<<segSize/256, 256, 1, streams[1]>>>(d_A1,d_B1,d_C1, inputLength-i-segSize);
}
}
else
{
vecAdd<<<segSize/256, 256, 1, streams[0]>>>(d_A0,d_B0,d_C0, inputLength-i);
}
if(i+segSize <= inputLength)
{
cudaMemcpyAsync(h_C+i,d_C0,segSize*sizeof(float),cudaMemcpyDeviceToHost,streams[0]);
if(i+2*segSize <= inputLength )
{
cudaMemcpyAsync(h_C+i+segSize,d_C1,segSize*sizeof(float),cudaMemcpyDeviceToHost,streams[1]);
printf("hello %f\n", h_C[0]);
}
else
{
cudaMemcpyAsync(h_C+i+segSize,d_C1,(inputLength-i-segSize)*sizeof(float),cudaMemcpyDeviceToHost,streams[1]);
}
}
else
{
cudaMemcpyAsync(h_C+i,d_C0,(inputLength-i)*sizeof(float),cudaMemcpyDeviceToHost,streams[0]);
}
}
memcpy(hostOutput, h_C, inputLength*sizeof(float));
wbSolution(args, hostOutput, inputLength); //hostOutput and h_C contains all zeroes
free(hostInput1);
free(hostInput2);
free(hostOutput);
cudaFree(d_A0);
cudaFree(d_A1);
cudaFree(d_B0);
cudaFree(d_B1);
cudaFree(d_C0);
cudaFree(d_C1);
return 0;
}
#包括
#定义wbCheck(stmt)do{\
cudaError\u t err=stmt\
如果(err!=cudaSuccess){\
wbLog(错误,“无法运行stmt”,#stmt)\
wbLog(错误,“获取CUDA错误…”,cudaGetErrorString(错误))\
返回-1\
} \
}而(0)
#定义NUM_流2
__全局无效向量添加(浮点*in1,浮点*in2,浮点*out,整数len){
//@@在此处插入实现向量加法的代码
int i=blockIdx.x*blockDim.x+threadIdx.x;
如果(i 如果(i+segSize正如@hubs在下面的评论中所建议的那样,我应该在memcpy之前使用cudaDeviceSynchronize();那么这个建议是有效的。在调用memcpy(hostOutput,h_C,inputLength*sizeof(float))之前,尝试添加一个cudaDeviceSynchronize();
。您不能保证所有数据都从设备复制到主机上,因为cudaMemcpyAsync
调用不会阻止主机。“但是当我从设备复制到主机时,我看到所有的零都被打印出来。”。您的代码总是打印出h_C
的同一元素。为什么要打印出不同的内容?我不知道使用了CudDeviceSynchronize();正如建议的那样,它可以工作。