C++ Cuda内存传输开销_C++_Optimization_Cuda_Gpu

C++ Cuda内存传输开销

c++ optimization cuda

C++ Cuda内存传输开销,c++,optimization,cuda,gpu,C++,Optimization,Cuda,Gpu,众所周知，将数据复制到GPU的速度很慢，我想知道将数据传递到GPU的具体“计数”是什么 __global__ void add_kernel(float* a, float* b, float* c, int size) { for (int i = 0; i < size; ++i) { a[i] = b[i] + c[i]; } int main() { int size = 100000; //Or any arbitrarily large number

众所周知，将数据复制到GPU的速度很慢，我想知道将数据传递到GPU的具体“计数”是什么

__global__
void add_kernel(float* a, float* b, float* c, int size) {
   for (int i = 0; i < size; ++i) {
       a[i] = b[i] + c[i];
   }

int main() {
int size = 100000; //Or any arbitrarily large number
int reps = 1000;   //Or any arbitrarily large number


extern float* a; //float* of [size] allocated on the GPU 
extern float* b; //float* of [size] allocated on the GPU 
extern float* c; //float* of [size] allocated on the GPU 

 for (int i = 0; i < reps; ++i)
add_kernel<<<blocks, threads>>>(a, b, c, size); 

}

IE（同样）在“理想”CUDA程序中，程序员应该尝试用纯CUDA内核编写大部分计算程序，或者编写CPU调用的CUDA内核（在从堆栈传递值不会产生显著开销的情况下）

为清晰起见进行了编辑。

一切都很重要。为了运行内核，CPU需要以某种方式传递调用哪个内核以及使用哪个参数。在“微观层面”，如果您的内核只执行几个操作，那么这些都是相当可观的开销。在现实生活中，如果你的内核做了很多工作，它们是可以忽略不计的

如果这样的小规模运营没有管道化的话，相对较大的服务费用可能会增加。你可以在NVidia的视觉分析器中看到这一点。我不知道/记得确切的数字，但顺序如下。CPU和GPU之间的带宽大约为1 GB/s，因此为1字节/纳秒。但实际上，发送4字节的数据包并获得确认大约需要1微秒。所以要发送10000字节，比如11微秒

此外，操作的执行针对GPU上的大规模执行进行了优化，因此使用一个32线程扭曲执行10个连续操作可能需要200个GPU时钟周期（如0.2微秒）。假设在内核启动之前发送内核执行命令的时间为0.5微秒

在现实生活中，问题通常在于，由于带宽限制，要计算1亿个数字的总和，需要花费0.4秒，而计算本身需要花费0.1微秒。因为顶级GPU可以在接近1纳秒长的周期内执行大约1000次操作。

在现实生活中，问题通常在于，由于带宽限制，要计算1亿个数字的总和，需要花费0.4秒，而计算本身需要花费0.1微秒。因为top GPU可以在每一个周期中执行大约1000个操作，大约1纳秒长。

嗨，我已经对这两个版本进行了基准测试。简单地调用CUDA函数确实会有明显的开销

这是输出--

这是我的基准--

/*
*test_area.cu
*
*创建日期：2018年1月11日
*作者：约瑟夫
*/
#ifndef测试区域_
#定义测试区域_
#包括
#包括
int线程（）{
返回256；
}
整数块（整数大小）{
返回（大小+线程（）-1）/线程（）；
}
__全球的__
void add_内核（float*a、float*b、float*c、int size）{
对于（int i=0；i

您好，我已经对这两个版本进行了基准测试。简单地调用CUDA函数确实会有明显的开销

这是输出--

这是我的基准--

/*
*test_area.cu
*
*创建日期：2018年1月11日
*作者：约瑟夫
*/
#ifndef测试区域_
#定义测试区域_
#包括
#包括
int线程（）{
返回256；
}
整数块（整数大小）{
返回（大小+线程（）-1）/线程（）；
}
__全球的__
void add_内核（float*a、float*b、float*c、int size）{
对于（int i=0；i__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
for (int j = 0; i < reps; ++j)
   for (int i = 0; i < size; ++i) {
       a[i] = b[i] + c[i];
   }

int main() {
int size = 100000; //Or any arbitrarily large number
int reps = 1000; //Or any arbitrarily large number

extern float* a; //float* of [size] allocated on the GPU 
extern float* b; //float* of [size] allocated on the GPU 
extern float* c; //float* of [size] allocated on the GPU 

add_kernel<<<blocks, threads>>>(a, b, c, size, reps); 
}


 Calculating... (BlackCat_Tensors) reps outside
It took me 27.359249 clicks (27.359249 seconds).

 Calculating... (BlackCat_Tensors) reps inside 
It took me 10.855168 clicks (10.855168 seconds).

/*
 * test_area.cu
 *
 *  Created on: Jan 11, 2018
 *      Author: joseph
 */

#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_

#include <omp.h>
#include <stdio.h>


int threads() {
    return 256;
}
int blocks(int size) {
    return (size + threads() - 1) / threads();
}

__global__
void add_kernel(float* a, float* b, float* c, int size) {
   for (int i = 0; i < size; ++i) {
       a[i] = b[i] + c[i];
   }
}


__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j)
   for (int i = 0; i < size; ++i) {
       a[i] = b[i] + c[i];
   }
}

int main() {
int sz = 10000; //Or any arbitrarily large number
int reps = 10000;   //Or any arbitrarily large number

float* a; //float* of [size] allocated on the GPU
 float* b; //float* of [size] allocated on the GPU
 float* c; //flo

cudaMalloc((void**)&a, sizeof(float) * sz);
cudaMalloc((void**)&b, sizeof(float) * sz);
cudaMalloc((void**)&c, sizeof(float) * sz);


float t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps outside\n");

for (int i = 0; i < reps; ++i) {
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
cudaDeviceSynchronize();
}
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));


 t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps inside \n");

add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
cudaDeviceSynchronize();


t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));





 cudaFree(a);
 cudaFree(b);
 cudaFree(c);
}


#endif /* TEST_AREA_CU_ */

/*
 * test_area.cu
 *
 *  Created on: Jan 11, 2018
 *      Author: joseph
 */

#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_

#include <omp.h>
#include <stdio.h>


int threads() {
    return 256;
}
int blocks(int size) {
    return (size + threads() - 1) / threads();
}

__global__
void add_kernel(float* a, float* b, float* c, int size) {
   for (int i = 0; i < size; ++i) {
       a[i] = b[i] + c[i];
   }
}


__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j)
   for (int i = 0; i < size; ++i) {
       a[i] = b[i] + c[i];
   }
}

int main() {
int sz = 10000; //Or any arbitrarily large number
int reps = 1000;   //Or any arbitrarily large number

float* a; //float* of [size] allocated on the GPU
 float* b; //float* of [size] allocated on the GPU
 float* c; //flo

cudaMalloc((void**)&a, sizeof(float) * sz);
cudaMalloc((void**)&b, sizeof(float) * sz);
cudaMalloc((void**)&c, sizeof(float) * sz);


float t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps outside\n");

for (int i = 0; i < reps; ++i) {
add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
cudaDeviceSynchronize();
}
t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));


 t = omp_get_wtime();
printf("\n Calculating... (BlackCat_Tensors) reps inside \n");

add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
cudaDeviceSynchronize();


t = omp_get_wtime() - t;
printf("It took me %f clicks (%f seconds).\n", t, ((float) t));





 cudaFree(a);
 cudaFree(b);
 cudaFree(c);
}


#endif /* TEST_AREA_CU_ */



 Calculating... (BlackCat_Tensors) reps outside
It took me 14.969501 clicks (14.969501 seconds).

 Calculating... (BlackCat_Tensors) reps inside 
It took me 13.060688 clicks (13.060688 seconds).