Vector Cuda的128位矢量加法，性能问题_Vector_Cuda

Vector Cuda的128位矢量加法，性能问题

vector cuda

Vector Cuda的128位矢量加法，性能问题,vector,cuda,Vector,Cuda,我想把128位向量和进位相加。我的128位版本（addKernel128在下面的代码中）比基本32位版本（addKernel32在下面）慢两倍。我有记忆整合问题吗？如何才能获得更好的性能 #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <iostream> #define UADDO(c, a, b) asm volatile("add.cc.u32 %0, %1, %2;" : "

我想对128位向量做带进位的加法。我的128位版本（

addKernel128

在下面的代码中）比基本32位版本（

addKernel32

在下面）慢两倍。我是否遇到了内存合并（memory coalescing）问题？如何才能获得更好的性能？

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

// Inline-PTX helpers for multi-word unsigned 32-bit addition.
// Each expansion already ends with ';', so an invocation needs no trailing semicolon.
// NOTE(review): the carry is passed between separate asm statements through the PTX
// condition-code register; this presumably relies on the compiler emitting nothing that
// touches the carry in between -- confirm, or fuse the chain into a single asm block.

// UADDO(c, a, b): c = a + b using PTX add.cc.u32 (the add also writes the carry-out).
#define UADDO(c, a, b) asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// UADDC(c, a, b): c = a + b + carry-in using PTX addc.cc.u32 (consumes and re-writes the carry).
#define UADDC(c, a, b) asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));

/**
 * Element-wise 32-bit addition: c[i] = a[i] + b[i] for every i in [0, size).
 *
 * Works with any 1D launch configuration: each thread starts at its global
 * index and then advances by the total number of launched threads, so the
 * whole range is covered even when the grid is smaller than `size`.
 *
 * @param c    output array, at least `size` elements
 * @param a    first input array, at least `size` elements
 * @param b    second input array, at least `size` elements
 * @param size number of 32-bit elements to process
 */
__global__ void addKernel32(unsigned int *c, const unsigned int *a, const unsigned int *b, const int size)
{
  // Total number of threads in the grid; this is the per-iteration step.
  const unsigned int step = blockDim.x * gridDim.x;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += step)
  {
    c[idx] = a[idx] + b[idx];
  }
}

/**
 * 128-bit unsigned addition over arrays of 128-bit integers.
 *
 * Each 128-bit integer is stored as four consecutive 32-bit words, least
 * significant word first (the carry propagates x -> y -> z -> w). For every
 * group g in [0, size / 4): c[g] = a[g] + b[g] modulo 2^128; the carry out of
 * the most significant word is discarded.
 *
 * Preconditions:
 *  - `a`, `b` and `c` are 16-byte aligned, because they are accessed as uint4
 *    (pointers returned by cudaMalloc satisfy this).
 *  - `size` counts 32-bit words; any trailing words beyond the last full
 *    group of four are left untouched.
 *
 * Works with any 1D launch configuration (each thread strides over the
 * groups by the total number of launched threads).
 *
 * @param c    output array, at least `size` 32-bit words
 * @param a    first input array, at least `size` 32-bit words
 * @param b    second input array, at least `size` 32-bit words
 * @param size number of 32-bit words in each array
 */
__global__ void addKernel128(unsigned *c, const unsigned *a, const unsigned *b, const int size)
{
  const uint4 *a128 = (const uint4 *)a;
  const uint4 *b128 = (const uint4 *)b;
  uint4 *c128 = (uint4 *)c;

  // Index in size_t so that neither the start index nor `i += stride` can wrap.
  const size_t count = size > 0 ? (size_t)size / 4 : 0;
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
  {
    const uint4 a4 = a128[i];
    const uint4 b4 = b128[i];
    uint4 c4;

    // The whole carry chain lives in ONE asm statement: the PTX condition-code
    // register is meant for straight-line sequences, so nothing may be scheduled
    // between the adds. The last limb uses addc (no .cc) because its carry-out
    // is not consumed.
    asm volatile("{\n\t"
                 "add.cc.u32  %0, %4, %8;\n\t"
                 "addc.cc.u32 %1, %5, %9;\n\t"
                 "addc.cc.u32 %2, %6, %10;\n\t"
                 "addc.u32    %3, %7, %11;\n\t"
                 "}"
                 : "=r"(c4.x), "=r"(c4.y), "=r"(c4.z), "=r"(c4.w)
                 : "r"(a4.x), "r"(a4.y), "r"(a4.z), "r"(a4.w),
                   "r"(b4.x), "r"(b4.y), "r"(b4.z), "r"(b4.w));

    c128[i] = c4;
  }
}

// Prints a diagnostic naming the failed step and terminates the process if
// `status` is not cudaSuccess.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    std::fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    std::exit(EXIT_FAILURE);
  }
}

/**
 * Benchmark driver: allocates three device arrays of `size` 32-bit words,
 * fills them with dummy byte patterns, times one addKernel128 launch with
 * CUDA events and prints the elapsed time in milliseconds.
 *
 * An untimed warm-up launch runs first so that one-time start-up costs are
 * not attributed to the measured launch.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any CUDA error.
 */
int main()
{
  const int size = 10000000; // 10 million 32-bit words (a multiple of 4)
  const size_t bytes = static_cast<size_t>(size) * sizeof(unsigned int);

  unsigned int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

  checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc d_a");
  checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc d_b");
  checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc d_c");

  // cudaMemset fills byte-wise: every word becomes 0x01010101 / 0x02020202.
  checkCuda(cudaMemset(d_a, 1, bytes), "cudaMemset d_a"); // dummy init just for the example
  checkCuda(cudaMemset(d_b, 2, bytes), "cudaMemset d_b"); // dummy init just for the example
  checkCuda(cudaMemset(d_c, 0, bytes), "cudaMemset d_c");

  int nbThreads = 512;
  int nbBlocks = 1024; // for example

  // Warm-up run (not timed).
  addKernel128<<<nbBlocks, nbThreads>>>(d_c, d_a, d_b, size);
  checkCuda(cudaGetLastError(), "warm-up launch");
  checkCuda(cudaDeviceSynchronize(), "warm-up execution");

  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");
  checkCuda(cudaEventRecord(start), "cudaEventRecord start");

  addKernel128<<<nbBlocks, nbThreads>>>(d_c, d_a, d_b, size);
  // A launch does not return an error code; bad configurations surface here.
  checkCuda(cudaGetLastError(), "timed launch");

  checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
  // Blocks until the kernel has finished; execution errors surface here.
  checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");
  float m = 0;
  checkCuda(cudaEventElapsedTime(&m, start, stop), "cudaEventElapsedTime");

  checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");
  checkCuda(cudaFree(d_c), "cudaFree d_c");
  checkCuda(cudaFree(d_b), "cudaFree d_b");
  checkCuda(cudaFree(d_a), "cudaFree d_a");
  checkCuda(cudaDeviceReset(), "cudaDeviceReset");
  std::printf("Elapsed = %g\n", m);
  return 0;
}

#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#定义UADDO（c，a，b）asm volatile（“add.cc.u32%0，%1，%2；”：“=r”（c）：“r”（a），“r”（b））；
#定义UADDC（c，a，b）asm volatile（“addc.cc.u32%0，%1，%2；”：“=r”（c）：“r”（a），“r”（b））；
__全局\uuuvoid addKernel32（无符号整数*c、常量无符号整数*a、常量无符号整数*b、常量整数大小）
{
int tid=blockIdx.x*blockDim.x+threadIdx.x；
而（tid<尺寸）
{
c[tid]=a[tid]+b[tid]；
tid+=blockDim.x*gridDim.x；
}
}
__全局\uuuvoid addKernel128（无符号*c、常数无符号*a、常数无符号*b、常数整数大小）
{
int tid=blockIdx.x*blockDim.x+threadIdx.x；
而（tid

由于各种原因，在WDDM GPU上对CUDA代码进行计时可能非常困难。其中大部分原因都源于这样一个事实：GPU是由Windows作为显示设备进行管理的，这可能会给计时结果带来各种干扰。例如，Windows驱动程序和WDDM会对GPU的工作进行批处理，并且可能在CUDA GPU工作的中间穿插显示任务。

如果可能，请在Linux上，或在处于TCC（Tesla Compute Cluster）模式的Windows GPU上对CUDA代码进行计时
为了提高性能，始终在不使用
```
-G
```
开关的情况下构建。在VisualStudio中，这通常对应于构建项目的发布（Release）版本，而不是调试（Debug）版本
为了获得良好的性能比较，通常建议在实际测量计时结果之前进行一些“预热运行”。这些将消除“启动”和其他一次性测量问题，使您更有可能获得合理的结果。您可能还希望多次运行代码并平均结果
通常建议使用与GPU对应的arch标志进行编译，例如，对于cc2.0 GPU，使用
```
-arch=sm_20
```

如果可能，请在Linux上，或在处于TCC（Tesla Compute Cluster）模式的Windows GPU上对CUDA代码进行计时
为了提高性能，始终在不使用
```
-G
```
开关的情况下构建。在VisualStudio中，这通常对应于构建项目的发布（Release）版本，而不是调试（Debug）版本
为了获得良好的性能比较，通常建议在实际测量计时结果之前进行一些“预热运行”。这些将消除“启动”和其他一次性测量问题，使您更有可能获得合理的结果。您可能还希望多次运行代码并平均结果
通常建议使用与GPU对应的arch标志进行编译，例如，对于cc2.0 GPU，使用
```
-arch=sm_20
```