cudaMemcpyAsync和memcpy无法在内核内部复制,而直接复制工作正常

cudaMemcpyAsync和memcpy无法在内核内部复制,而直接复制工作正常,cuda,memcpy,Cuda,Memcpy,我正在尝试在cuda内核中把源浮点数组(包含1.0f)复制到目标浮点数组(包含2.0f)。我尝试使用以下三种不同的方法: cudaMemcpyAsync memcpy 直接复制(dst[i]=src[i]) 当我在内核执行完毕后读取结果时,我发现cudaMemcpyAsync和memcpy都无法复制,而直接复制方法可以正常工作。 为什么cudaMemcpyAsync和memcpy方法失败了? 我正在使用GTX TitanX(SM_52) 编译时使用:nvcc -arch=compute_52 main.cu

我正在尝试在cuda内核中把源浮点数组(包含1.0f)复制到目标浮点数组(包含2.0f)。我尝试使用以下三种不同的方法:

  • cudaMemcpyAsync
  • memcpy
  • 直接复制(dst[i]=src[i])
当我在内核执行完毕后读取结果时,我发现cudaMemcpyAsync和memcpy都无法复制,而直接复制方法可以正常工作。

为什么cudaMemcpyAsync和memcpy方法失败了?

我正在使用GTX TitanX(SM_52)

编译时使用:nvcc -arch=compute_52 main.cu

main.cu:

#include <stdio.h>
#include <iostream>


// Kernel from the question: every thread whose flat index is below `size` is
// meant to copy one float from `src` to `dst`. All three copy variants are
// commented out below, so as written the kernel performs no copy at all;
// un-comment exactly one of them to reproduce the behaviour being asked about.
//
//   src  - source float array
//   dst  - destination float array
//   size - number of elements; threads with idx >= size do nothing
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  // Flat thread index computed from the x dimension of the launch only.
  int idx = blockIdx.x*blockDim.x + threadIdx.x;

    if(idx < size){
// Why the next two variants fail: `src` and `dst` are float*, so adding
// idx*sizeof(float) moves the pointer by idx*sizeof(float) ELEMENTS, not bytes
// (the wrong element, and past the end of the array for larger idx), and the
// byte count of 1 copies a single byte instead of sizeof(float) bytes.
//        memcpy(dst +idx*sizeof(float), src + idx*sizeof(float), 1); // FAILS TO COPY
//        cudaMemcpyAsync(dst +idx*sizeof(float), src + idx*sizeof(float), 1, cudaMemcpyDeviceToDevice); // FAILS TO COPY
// Plain element assignment indexes in units of float and copies a whole float.
//          dst[idx] = src[idx]; // COPIES SUCCESSFULLY
    }

}

// Counter backing UniqueNumber(); holds the most recently returned value.
int current = 0;

// Advances the global counter by one and returns the new value.
int UniqueNumber()
{
  current += 1;
  return current;
}

// Evaluates a CUDA runtime call and terminates the program with a file/line
// diagnostic if it did not return cudaSuccess. Without this, a failed
// allocation, copy or launch would go unnoticed and the program would simply
// print stale data.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t cuda_check_status_ = (call);                               \
    if (cuda_check_status_ != cudaSuccess) {                               \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
              cudaGetErrorString(cuda_check_status_));                     \
      exit(EXIT_FAILURE);                                                  \
    }                                                                      \
  } while (0)

// Test driver for the question's kernel: fills a source array with 1.0f and a
// destination array with 2.0f, runs cudamemcpy_inside_kernel on the device
// copies, and prints the destination so the reader can see whether the copy
// happened (1.0f) or not (2.0f).
//
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error and returns 1
// if host allocation fails.
int main(void)
{
  const int N = 1000;
  const size_t bytes = N * sizeof(float);

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
    free(x);
    free(y);
    return 1;
  }

  float *d_x = NULL, *d_y = NULL;
  CUDA_CHECK(cudaMalloc(&d_x, bytes));
  CUDA_CHECK(cudaMalloc(&d_y, bytes));

  // Source holds 1.0f, destination holds the sentinel 2.0f.
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  CUDA_CHECK(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

  // Ceil-divide so the grid covers N for any N, instead of a hard-coded <<<2, 512>>>.
  const int threads = 256;
  const int blocks = (N + threads - 1) / threads;
  cudamemcpy_inside_kernel<<<blocks, threads>>>(d_x, d_y, N);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

  // Blocking copy; no further synchronisation is needed before reading y.
  CUDA_CHECK(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

  for (int i = 0; i < N; i++)
     printf(" %f\n", y[i]); // y[i] should have all 1.0f

  CUDA_CHECK(cudaFree(d_x));
  CUDA_CHECK(cudaFree(d_y));
  free(x);
  free(y);
  return 0;
}
#include <stdio.h>
#include <iostream>
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx
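memcpy和cudaMemcpyAsync这两个调用中的源参数、目标参数和大小参数都是错误的:src和dst是float*,指针加法以元素为单位,因此偏移量应为idx而不是idx*sizeof(float);大小参数以字节为单位,应为sizeof(float)而不是1。正确的写法大致如下: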

#include <stdio.h>
#include <iostream>

// Copies one float per thread from src to dst, using the copy mechanism chosen
// at compile time by the `action` template argument:
//   action == 1 : device-side memcpy of sizeof(float) bytes
//   action == 2 : cudaMemcpyAsync called from device code (device-to-device);
//                 its return status is not inspected
//   otherwise   : plain element assignment
// Threads whose flat x-dimension index is >= size do nothing.
// NOTE(review): the device-side cudaMemcpyAsync presumably needs relocatable
// device code (the accompanying build commands use -dc) - confirm for your toolkit.
template<int action>
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  const int element = blockIdx.x*blockDim.x + threadIdx.x;
  if (element >= size)
    return;

  // Pointer arithmetic on float* is in elements, so +element selects this thread's float.
  float *from = src + element;
  float *to = dst + element;

  if (action == 1) {
    memcpy(to, from, sizeof(float));
  } else if (action == 2) {
    cudaMemcpyAsync(to, from, sizeof(float), cudaMemcpyDeviceToDevice);
  } else {
    *to = *from;
  }
}

// Terminates the program with a diagnostic naming the failed step when a CUDA
// runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Runs one copy variant of cudamemcpy_inside_kernel<action> and prints the
// resulting destination array.
//
//   banner - text printed before the results (printed verbatim)
//   d_src  - device source array of n floats
//   d_dst  - device destination array of n floats
//   h_dst  - host scratch buffer of n floats; overwritten with the results
//   n      - element count (must be > 0 so that the grid is non-empty)
//
// The destination is reset to the sentinel 2.0f before every run. Re-uploading
// the previous run's results instead would pre-fill the destination with 1.0f
// and make a kernel that copies nothing look successful.
template<int action>
static void runCopyTest(const char *banner, float *d_src, float *d_dst,
                        float *h_dst, int n)
{
  const size_t bytes = n * sizeof(float);

  for (int i = 0; i < n; i++)
    h_dst[i] = 2.0f;

  printf("%s", banner);
  checkCuda(cudaMemcpy(d_dst, h_dst, bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy (reset destination)");

  cudamemcpy_inside_kernel<action><<<(n+255)/256, 256>>>(d_src, d_dst, n);
  checkCuda(cudaGetLastError(), "kernel launch");
  checkCuda(cudaDeviceSynchronize(), "kernel execution");

  checkCuda(cudaMemcpy(h_dst, d_dst, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy (read back destination)");
  for (int i = 0; i < n; i++)
     printf(" %f\n", h_dst[i]);
}

// Exercises the three in-kernel copy variants (assignment, memcpy,
// cudaMemcpyAsync) on a 10-element array whose source holds 1.0f. Each variant
// should print ten lines of 1.000000; a line of 2.000000 means that element
// was not copied.
int main(void)
{
  const int N = 10;
  const size_t bytes = N * sizeof(float);

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation failed\n");
    free(x);
    free(y);
    return 1;
  }

  float *d_x = NULL, *d_y = NULL;
  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc (source)");
  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc (destination)");

  for (int i = 0; i < N; i++)
    x[i] = 1.0f;
  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy (upload source)");

  runCopyTest<0>("Assignment \n", d_x, d_y, y, N);
  runCopyTest<1>("\n Memcpy \n", d_x, d_y, y, N);
  runCopyTest<2>("\n cudaMemcpyAsync \n", d_x, d_y, y, N);

  checkCuda(cudaFree(d_x), "cudaFree (source)");
  checkCuda(cudaFree(d_y), "cudaFree (destination)");
  free(x);
  free(y);
  return 0;
}

memcpy和cudaMemcpyAsync都有不正确的参数,这就是它们不能正常工作的原因
$ nvcc -arch=sm_52 -dc -o memcpy_kernel.o memcpy_kernel.cu
$ nvcc -arch=sm_52 -o memcpy_kernel memcpy_kernel.o
$ ./memcpy_kernel 
Assignment 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 Memcpy 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 cudaMemcpyAsync 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000