CUDA GPU比CPU慢_Cuda - Fatal编程技术网

CUDA GPU比CPU慢

cuda

CUDA GPU比CPU慢,cuda,Cuda,我很难弄明白为什么我的cuda代码比我的cpu代码运行得慢我的桌面配置是i7 2600S，geforce 560ti 我的代码如下： int** kernel_shiftSeam(int **MCEnergyMat, int **newE, int *seam, int width, int height, int direction) { //time measurement float elapsed_time_ms = 0; cudaEvent_t start, stop; //

我很难弄明白为什么我的cuda代码比我的cpu代码运行得慢

我的桌面配置是i7 2600S，geforce 560ti

我的代码如下：

/**
 * Host-side wrapper: uploads the seam and both energy matrices to the GPU,
 * launches gpu_shiftSeam, and copies the result rows back into newE.
 *
 * Relies on names declared outside this function: the device pointer tables
 * global_MC / new_MC, the host-side arrays of per-row device pointers
 * global_MC2 / new_MC2, the host aliases global_host_MC / new_host_MC, the
 * accumulated timer execTime, and the gpu_shiftSeam kernel.
 *
 * @param MCEnergyMat host matrix, read as `width` rows of `height` ints
 * @param newE        host matrix with the same layout; uploaded before the
 *                    launch and overwritten with the kernel result afterwards
 * @param seam        host array; `height` entries are used when
 *                    direction == 1, otherwise `width` entries
 * @param width       number of rows copied per matrix
 * @param height      number of ints copied per row
 * @param direction   selects the seam length (see `seam`)
 * @return newE
 *
 * NOTE(review): CUDA return codes are still not checked here, as in the
 * original; a failed allocation or copy goes unnoticed.
 */
int** kernel_shiftSeam(int **MCEnergyMat, int **newE, int *seam, int width, int height,     int direction)
{
    // Kernel time for this call, in milliseconds.
    float elapsed_time_ms = 0;
    cudaEvent_t start, stop;

    // threads per block
    dim3 threads(16, 16);
    // blocks: ceil-divide so the grid covers every (width, height) cell
    dim3 blocks((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);

    // The seam holds one entry per row or per column depending on direction.
    const int seamCount = (direction == 1) ? height : width;
    const size_t seamSize = (size_t)seamCount * sizeof(int);

    // `seam` is already host memory, so it is uploaded directly. The original
    // staged it through a malloc'd copy that was never freed.
    int *device_Seam = NULL;
    cudaMalloc((void**)&device_Seam, seamSize);
    cudaMemcpy(device_Seam, seam, seamSize, cudaMemcpyHostToDevice);

    global_host_MC = MCEnergyMat;
    new_host_MC = newE;

    // Upload the table of device row pointers, then each row's contents.
    cudaMemcpy(global_MC, global_MC2, sizeof(int*) * width, cudaMemcpyHostToDevice);
    for (int i = 0; i < width; i++)
        cudaMemcpy(global_MC2[i], global_host_MC[i], sizeof(int) * height, cudaMemcpyHostToDevice);

    cudaMemcpy(new_MC, new_MC2, sizeof(int*) * width, cudaMemcpyHostToDevice);
    for (int i = 0; i < width; i++)
        cudaMemcpy(new_MC2[i], new_host_MC[i], sizeof(int) * height, cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // do some operations on the 2d matrix
    gpu_shiftSeam<<< blocks, threads >>>(global_MC, new_MC, device_Seam, width, height);

    // Only the kernel is timed; the transfers above and below are excluded.
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time_ms, start, stop);

    execTime += elapsed_time_ms;

    // Events were created for this call only; release them so repeated calls
    // do not accumulate event objects.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // copy out the data back to host (RESULT)
    for (int i = 0; i < width; i++)
    {
        cudaMemcpy(newE[i], new_MC2[i], sizeof(int) * height, cudaMemcpyDeviceToHost);
    }

    // The kernel has completed (stop event was synchronized), so the per-call
    // seam buffer can be released instead of leaking device memory.
    cudaFree(device_Seam);

    return newE;
}

int** kernel_shiftSeam(int **MCEnergyMat, int **newE, int *seam, int width, int height, int direction)
{
//time measurement
float elapsed_time_ms = 0;
cudaEvent_t start, stop; //threads per block
dim3 threads(16,16);
//blocks
dim3 blocks((width+threads.x-1)/threads.x, (height+threads.y-1)/threads.y);
int *device_Seam;
int *host_Seam;
int seamSize;
if(direction == 1)
{
seamSize = height*sizeof(int);
host_Seam = (int*)malloc(seamSize);
for(int i=0;i<…（此处代码被截断）

根据我的经验，内存访问是速度慢的首要原因。
先对数组拷贝部分做性能分析，看看它花了多少时间。如果占比相当大，可以尝试优化这部分代码。与其在for循环中逐行复制，不如一次性直接复制sizeof(int*)*height*width。减少调用memcpy的次数应该会有所帮助：
// Suggested replacement for the per-row copy loop: upload the pointer table,
// then move the whole matrix with a single cudaMemcpy call.
// NOTE(review): the question's code uses global_MC2[i] as the per-row device
// destination and global_host_MC[i] as the per-row host source, so passing
// the arrays themselves here presumably only works if all rows live in one
// contiguous allocation - confirm before using.
cudaMemcpy(global_MC, global_MC2, sizeof(int*)*width, cudaMemcpyHostToDevice);
cudaMemcpy(global_MC2, global_host_MC, sizeof(int)*height*width,cudaMemcpyHostToDevice);

我有过类似的经历，发现cudaMalloc才是瓶颈，而cudaMemcpy不是。在我的设备上，我记得分配16 MB需要160毫秒。不过CUDA内存分配可以在实际计算之前完成，例如放在另一个函数调用中。因此，内存分配时间可以从整体性能度量（例如加速比）中剔除，但我会把cudaMemcpy操作计入加速比的计算。
只要移动计时器，或者创建更多计时器，你就可以看到时间花在了哪里。时间很可能耗在cudaMemcpy()调用上。

如果像你说的那样，时间确实耗在cudaMemcpy()调用上怎么办？这是否意味着使用该函数就不可避免地要花这么多时间？因为我认为没有其他方法可以替代cudaMemcpy()。

在优化之前，我会对kernel_shiftSeam的各个子部分分别计时，或者使用分析器（Parallel Nsight、CUDA profiler、NVIDIA Visual Profiler）。导致开销的项目包括但不限于：（a）根据方向所做的CPU内存复制，（b）malloc，（c）cudaMalloc，（d）由于将非锁页（unpinned）内存复制到GPU而在cudaMemcpy内部产生的CPU内存复制，（e）阻塞式的cudaMemcpy调用、不必要的cudaEventSynchronize（应移到末尾），以及将cudaMemcpy拆分为多次调用。您可以研究使用cudaMallocPitch/cudaMemcpy2D来处理二维拷贝。您还泄漏了CPU和GPU内存，在WDDM上这会拖慢后续的每一次GPU操作。

感谢您的回复！直接复制sizeof(int*)*height*width确实缩短了时间，但它只是偶尔生成正确的输出。是否有其他方法可以在不使用for循环的情况下执行memcpy？

如果只是偶尔生成正确的输出，则可能存在同步问题。