cudaMemcpyAsync的奇怪行为：1。cudaMemcpyKind没有区别。2.复制失败，但没有任何提示_Cuda_Nvlink_Cuda Uva

cudaMemcpyAsync的奇怪行为：1。cudaMemcpyKind没有区别。2.复制失败，但没有任何提示

cuda

cudaMemcpyAsync的奇怪行为：1。cudaMemcpyKind没有区别。2.复制失败，但没有任何提示,cuda,nvlink,cuda-uva,Cuda,Nvlink,Cuda Uva,我正在熟悉一个配备Pascal P100 GPU+Nvlink的新集群。我编写了一个乒乓程序来测试gpugpu和gpucpu的带宽以及点对点访问。（我知道cuda示例包含这样一个程序，但为了更好地理解，我想自己做。）Nvlink带宽似乎合理（约35 GB/s双向，理论最大值为40）。然而，在调试乒乓球时，我发现了一些奇怪的行为首先，无论我指定什么样的cudaMemcpyKind，cudaMemcpyAsync都会成功，例如，如果cudaMemcpyAsync将内存从主机复制到设备，那么即使我将

我正在熟悉一个配备Pascal P100 GPU+Nvlink的新集群。我编写了一个乒乓程序来测试gpugpu和gpucpu的带宽以及点对点访问。（我知道cuda示例包含这样一个程序，但为了更好地理解，我想自己做。）Nvlink带宽似乎合理（约35 GB/s双向，理论最大值为40）。然而，在调试乒乓球时，我发现了一些奇怪的行为

首先，无论我指定什么样的cudaMemcpyKind，cudaMemcpyAsync都会成功，例如，如果cudaMemcpyAsync将内存从主机复制到设备，那么即使我将cudamemcpydevicetoost作为一种类型传递，它也会成功

其次，当主机内存未被页面锁定时，CUDAMEMCPIASYNC执行以下操作：

将内存从主机复制到设备似乎成功（没有SEGFULTS或cuda运行时错误，并且数据传输正常）
将内存从设备复制到主机时会自动失败：不会发生segfault，memcpy返回cudaSuccess后cudaDeviceSynchronize，但检查数据会发现gpu上的数据未正确传输到主机

这种行为是意料之中的吗？我已经包括了一个在我的系统上演示它的最简单的工作示例代码（该示例不是乒乓球应用程序，它所做的只是使用各种参数测试cudaMemcpyAsync）

P100启用了UVA，因此我认为cudaMemcpyAsync只是推断src和dst指针的位置，而忽略了cudaMemcpyKind参数。但是，我不确定为什么cudaMemcpyAsync无法为非页面锁定的主机内存抛出错误。我的印象是这是一个严格的禁止

#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void checkDataDevice( int* current, int* next, int expected_current_val, int n )
{
  int tid = threadIdx.x + blockIdx.x*blockDim.x;
  for( int i = tid; i < n; i += blockDim.x*gridDim.x )
  {
    if( current[i] != expected_current_val )
      printf( "Error on device:  expected = %d, current[%d] = %d\n"
          , expected_current_val
          , i
          , current[i] );
    // Increment the data so the next copy is properly tested
    next[i] = current[i] + 1;
  }
}

void checkDataHost( int* current, int* next, int expected_current_val, int n )
{
  for( int i = 0; i < n; i++ )
  {
    if( current[i] != expected_current_val )
      printf( "Error on host:  expected = %d, current[%d] = %d\n"
          , expected_current_val
          , i
          , current[i] );
    // Increment the data so the next copy is properly tested
    next[i] = current[i] + 1;
  }
}

int main( int argc, char** argv )
{
  bool pagelocked = true;
  // invoking the executable with any additional argument(s) will turn off page locked memory, i.e.,
  // Run with pagelocked memory:  ./a.out
  // Run with ordinary malloc'd memory: ./a.out jkfdlsja
  if( argc > 1 )
    pagelocked = false;

  int copybytes = 1e8; // Ok to use int instead of size_t for 1e8.

  cudaStream_t* stream = (cudaStream_t*)malloc( sizeof(cudaStream_t) );
  cudaStreamCreate( stream );

  int* srcHost;
  int* dstHost;
  int* srcDevice;
  int* dstDevice;

  cudaMalloc( (void**)&srcDevice, copybytes );
  cudaMalloc( (void**)&dstDevice, copybytes );
  if( pagelocked )
  {
    printf( "Using page locked memory\n" );
    cudaMallocHost( (void**)&srcHost, copybytes );
    cudaMallocHost( (void**)&dstHost, copybytes );
  }
  else
  {
    printf( "Using non page locked memory\n" );
    srcHost = (int*)malloc( copybytes );
    dstHost = (int*)malloc( copybytes );
  }

  for( int i = 0; i < copybytes/sizeof(int); i++ )
    srcHost[i] = 1;

  cudaMemcpyKind kinds[4];
  kinds[0] = cudaMemcpyHostToDevice;
  kinds[1] = cudaMemcpyDeviceToHost;
  kinds[2] = cudaMemcpyHostToHost;
  kinds[3] = cudaMemcpyDeviceToDevice;

  // Test cudaMemcpyAsync in both directions,
  // iterating through all "cudaMemcpyKinds" to verify
  // that they don't matter.
  int expected_current_val = 1;
  for( int kind = 0; kind<4; kind++ )
  {
    // Host to device copy 
    cudaMemcpyAsync( dstDevice
        , srcHost
        , copybytes
        , kinds[kind]
        , *stream );
    gpuErrchk( cudaDeviceSynchronize() );

    checkDataDevice<<<56*8,256>>>( dstDevice
        , srcDevice
        , expected_current_val
        , copybytes/sizeof(int) );
    expected_current_val++;

    // Device to host copy
    cudaMemcpyAsync( dstHost
        , srcDevice
        , copybytes
        , kinds[kind]
        , *stream );
    gpuErrchk( cudaDeviceSynchronize() );

    checkDataHost( dstHost
        , srcHost
        , expected_current_val
        , copybytes/sizeof(int) );
    expected_current_val++;
  }

  free( stream );

  cudaFree( srcDevice );
  cudaFree( dstDevice );
  if( pagelocked )
  {
    cudaFreeHost( srcHost );
    cudaFreeHost( dstHost );
  }
  else
  {
    free( srcHost );
    free( dstHost );
  }

  return 0;
}

#包括
#包括
#包括
#定义gpuerchk（ans）{gpuAssert（（ans），_文件_，_行__）}
内联void gpuAssert（cudaError\u t代码，const char*文件，int行，bool abort=true）
{
如果（代码！=cudaSuccess）
{
fprintf（标准，“GPUassert:%s%s%d\n”，cudaGetErrorString（代码）、文件、行）；
如果（中止）退出（代码）；
}
}
__全局无效checkDataDevice（int*当前，int*下一步，int预期值，int n）
{
int tid=threadIdx.x+blockIdx.x*blockDim.x；
对于（inti=tid；i1）
pagelocked=false；
int copybytes=1e8；//确定对1e8使用int而不是size\u t。
cudaStream_t*stream=（cudaStream_t*）malloc（sizeof（cudaStream_t））；
cudaStreamCreate（流）；
int*srcHost；
int*dstHost；
int*src设备；
int*dst设备；
cudamaloc（（void**）和srcDevice，copybytes）；
cudamaloc（（void**）和dstDevice，copybytes）；
如果（页面锁定）
{
printf（“使用页面锁定内存\n”）；
cudaMallocHost（（void**）和srcHost，copybytes）；
cudaMallocHost（（void**）和dstHost，copybytes）；
}
其他的
{
printf（“使用非页面锁定内存\n”）；
srcHost=（int*）malloc（copybytes）；
DSTOST=（int*）malloc（copybytes）；
}
对于（int i=0；i对于（int kind=0；kind当CUDA代码出现问题时，我强烈建议使用严格（==检查每个呼叫返回代码）
你的错误检查是有缺陷的，这些缺陷导致了你的一些困惑
首先，在页面锁定的情况下，给定的（映射的）指针在主机和设备上都是可访问/有效的。因此，每个可能的方向枚举（H2D、D2H、D2D、H2H）都是合法和有效的。因此，不会返回错误，复制操作成功
在非页面锁定的情况下，上述情况不正确，因此一般来说，指示的传输方向最好与从指针检查的隐含传输方向匹配。如果不匹配，则cudaMemcpyAsync
将返回错误代码（cudaErrorInvalidValue
==11）。在您的情况下，您忽略了此错误结果。如果您有足够的耐心，您可以通过使用cuda memcheck
（当您在使用CUDA代码时，另一件好事是）或者只需进行适当、严格的错误检查
当cudaMemcpyAsync操作指示失败时，操作未成功完成，因此数据未被复制，并且您的数据检查指示不匹配。希望这不会令人惊讶，因为预期的复制操作会起作用
$ cat t153.cu
#include <stdio.h>
#include <stdlib.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void checkDataDevice( int* current, int* next, int expected_current_val, int n )
{
  int tid = threadIdx.x + blockIdx.x*blockDim.x;
  for( int i = tid; i < n; i += blockDim.x*gridDim.x )
  {
    if( current[i] != expected_current_val )
      printf( "Error on device:  expected = %d, current[%d] = %d\n"
          , expected_current_val
          , i
          , current[i] );
    // Increment the data so the next copy is properly tested
    next[i] = current[i] + 1;
  }
}

void checkDataHost( int* current, int* next, int expected_current_val, int n )
{
  for( int i = 0; i < n; i++ )
  {
    if( current[i] != expected_current_val ){
      printf( "Error on host:  expected = %d, current[%d] = %d\n"
          , expected_current_val
          , i
          , current[i] );
      exit(0);}
    // Increment the data so the next copy is properly tested
    next[i] = current[i] + 1;
  }
}

int main( int argc, char** argv )
{
  bool pagelocked = true;
  // invoking the executable with any additional argument(s) will turn off page locked memory, i.e.,
  // Run with pagelocked memory:  ./a.out
  // Run with ordinary malloc'd memory: ./a.out jkfdlsja
  if( argc > 1 )
    pagelocked = false;

  int copybytes = 1e8; // Ok to use int instead of size_t for 1e8.

  cudaStream_t* stream = (cudaStream_t*)malloc( sizeof(cudaStream_t) );
  cudaStreamCreate( stream );

  int* srcHost;
  int* dstHost;
  int* srcDevice;
  int* dstDevice;

  cudaMalloc( (void**)&srcDevice, copybytes );
  cudaMalloc( (void**)&dstDevice, copybytes );
  if( pagelocked )
  {
    printf( "Using page locked memory\n" );
    cudaMallocHost( (void**)&srcHost, copybytes );
    cudaMallocHost( (void**)&dstHost, copybytes );
  }
  else
  {
    printf( "Using non page locked memory\n" );
    srcHost = (int*)malloc( copybytes );
    dstHost = (int*)malloc( copybytes );
  }

  for( int i = 0; i < copybytes/sizeof(int); i++ )
    srcHost[i] = 1;

  cudaMemcpyKind kinds[4];
  kinds[0] = cudaMemcpyHostToDevice;
  kinds[1] = cudaMemcpyDeviceToHost;
  kinds[2] = cudaMemcpyHostToHost;
  kinds[3] = cudaMemcpyDeviceToDevice;

  // Test cudaMemcpyAsync in both directions,
  // iterating through all "cudaMemcpyKinds" to verify
  // that they don't matter.
  int expected_current_val = 1;
  for( int kind = 0; kind<4; kind++ )
  {
    // Host to device copy
    cudaMemcpyAsync( dstDevice
        , srcHost
        , copybytes
        , kinds[kind]
        , *stream );
    gpuErrchk( cudaDeviceSynchronize() );

    checkDataDevice<<<56*8,256>>>( dstDevice
        , srcDevice
        , expected_current_val
        , copybytes/sizeof(int) );
    expected_current_val++;

    // Device to host copy
    cudaMemcpyAsync( dstHost
        , srcDevice
        , copybytes
        , kinds[kind]
        , *stream );
    gpuErrchk( cudaDeviceSynchronize() );

    checkDataHost( dstHost
        , srcHost
        , expected_current_val
        , copybytes/sizeof(int) );
    expected_current_val++;
  }

  free( stream );

  cudaFree( srcDevice );
  cudaFree( dstDevice );
  if( pagelocked )
  {
    cudaFreeHost( srcHost );
    cudaFreeHost( dstHost );
  }
  else
  {
    free( srcHost );
    free( dstHost );
  }

  return 0;
}
$ nvcc -arch=sm_61 -o t153 t153.cu
$ cuda-memcheck ./t153 a
========= CUDA-MEMCHECK
Using non page locked memory
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpyAsync.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2ef423]
=========     Host Frame:./t153 [0x489a3]
=========     Host Frame:./t153 [0x2e11]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
=========     Host Frame:./t153 [0x2a49]
=========
Error on host:  expected = 2, current[0] = 0
========= ERROR SUMMARY: 1 error
$