Cuda共享内存拷贝比较:哪种方法更好
在共享内存中将一个数组拷贝到另一个数组时,我尝试了六种不同的方法(请参阅程序中的注释)。经过讨论和测试,我的结论是:(1) memcpy 并不比逐元素拷贝数组更快。(2) 对于较小的数组,方法 3 是最好的;对于较大的数组,方法 6 是最好的。
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
// Number of blocks in the grid used for every kernel launch in run_test.
const int NUM_OF_BLOCKS = 1;
// Threads per block for every launch; also the length of the per-thread
// index tables (start_index / end_index) declared below.
const int NUM_OF_THREADS_PER_BLOCK = 8;
// Number of int elements in each buffer (host arrays, device allocation,
// and the shared src / dest arrays).
const int NUM_OF_DATA = 6000;
// Number of increment-then-copy iterations the kernel performs.
const int NUM_OF_COPIES= 1000;
// Alternative, much larger iteration count (disabled).
//const int NUM_OF_COPIES= 1000000;
// Not referenced anywhere in the visible code.
cudaError_t cuda_status;
// First element index assigned to each thread; written by init().
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
// Last (inclusive) element index assigned to each thread; written by init().
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
// Shared-memory source buffer: incremented and copied from inside the kernel loop.
__shared__ int src[NUM_OF_DATA];
// Shared-memory destination buffer: target of every copy inside the kernel loop.
__shared__ int dest[NUM_OF_DATA];
using namespace std;
// Computes the calling thread's slice of the NUM_OF_DATA elements and stores
// its bounds in start_index[threadIdx.x] / end_index[threadIdx.x] (inclusive).
// The first (NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK) threads each receive one
// extra element so that the slices cover the whole array without gaps.
__device__ void init(){
    const unsigned int tid = threadIdx.x;
    const unsigned int base_chunk = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
    const unsigned int remainder = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;

    // Threads below `remainder` are shifted by their own index (one extra
    // element per preceding thread); all later threads are shifted by the
    // full remainder.
    const bool gets_extra = tid < remainder;
    const unsigned int shift = gets_extra ? tid : remainder;
    const int chunk = gets_extra ? base_chunk + 1 : base_chunk;

    start_index[tid] = tid * base_chunk + shift;
    end_index[tid] = start_index[tid] + chunk - 1;
    //printf("start_index[%d] = %d, end_index[%d] = %d\n", threadIdx.x, start_index[threadIdx.x], threadIdx.x, end_index[threadIdx.x]);
}
// Adds 1 to each of the NUM_OF_DATA ints starting at `src`.
// The loop is not partitioned by thread index: every thread that calls this
// function walks the entire array itself.
__device__ void inc_src_data(int* src){
    for (int k = 0; k < NUM_OF_DATA; ++k){
        src[k] += 1;
    }
    //__threadfence_block();
}
// Copies NUM_OF_DATA ints from `src` to `dest` using the strategy selected at
// compile time by `sel`:
//   1 - every thread memcpy's the whole array
//   2 - thread 0 memcpy's the whole array, then issues a block-level fence
//   3 - every thread copies the whole array element by element
//   4 - thread 0 copies element by element; every thread then fences
//   5 - every thread memcpy's its own [start_index, end_index] slice, then fences
//   6 - every thread copies its own slice element by element, then fences
// Any other value of `sel` trips assert(0).
// Approaches 5 and 6 read the slice bounds that init() stored for this thread.
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
    const unsigned int tid = threadIdx.x;
    const size_t whole_array_bytes = NUM_OF_DATA * sizeof(int);

    if (sel == 1){
        // Approach 1: every thread executes memcpy
        memcpy(dest, src, whole_array_bytes);
    }
    else if (sel == 2){
        // Approach 2: one thread executes memcpy and then threadfence
        if (tid == 0){
            memcpy(dest, src, whole_array_bytes);
            __threadfence_block();
        }
    }
    else if (sel == 3){
        // Approach 3: every thread copies each element individually
        for (int k = 0; k < NUM_OF_DATA; ++k)
            dest[k] = src[k];
        //__threadfence_block(); // added this line to demonstrate timing difference
    }
    else if (sel == 4){
        // Approach 4: one thread copies each element individually and then threadfence
        if (tid == 0){
            for (int k = 0; k < NUM_OF_DATA; ++k)
                dest[k] = src[k];
        }
        // The fence is outside the branch, so all threads execute it.
        __threadfence_block();
    }
    else if (sel == 5){
        // Approach 5: every thread executes memcpy on its own slice and then threadfence
        const int first = start_index[tid];
        const int count = end_index[tid] - first + 1;
        memcpy(dest + first, src + first, count * sizeof(int));
        __threadfence_block();
    }
    else if (sel == 6){
        // Approach 6: every thread copies its own slice element by element and then threadfence
        const int first = start_index[tid];
        const int last = end_index[tid];
        for (int k = first; k <= last; ++k){
            dest[k] = src[k];
        }
        __threadfence_block();
    }
    else {
        assert(0);
    }
}
// Benchmark kernel for copy approach `sel` (see copy_to_dest_array).
//
// Steps performed by the block:
//   1. init() stores each thread's slice bounds in start_index / end_index.
//   2. `data` is copied into the shared `src` buffer.
//   3. NUM_OF_COPIES times: every element of `src` is incremented, then `src`
//      is copied into the shared `dest` buffer.
//   4. `dest` is copied back into `data`.
//
// data: buffer of NUM_OF_DATA ints, read in step 2 and overwritten in step 4.
//       run_test passes the cudaMalloc'd device buffer here.
//
// The kernel indexes only by threadIdx.x and uses block-shared buffers, so it
// is launched by run_test with NUM_OF_BLOCKS x NUM_OF_THREADS_PER_BLOCK threads.
//
// NOTE(review): no __syncthreads() separates the stages. The approaches that
// split or delegate the copy among threads therefore have no block barrier
// before other threads read the copied data - confirm this is acceptable
// before reusing the kernel with more than one warp.
//
// This template was previously defined twice, back to back, with identical
// bodies; a single definition is kept so the translation unit compiles.
template <int sel>
__global__ void copy_data_test(int* data){
    init();
    // Stage the input into shared memory.
    copy_to_dest_array<sel>(data, src);
    // Timed workload: modify the source, then copy it shared -> shared.
    for (int i = 0; i < NUM_OF_COPIES; i++){
        inc_src_data(src);
        copy_to_dest_array<sel>(&src[0], &dest[0]);
    }
    // Publish the last copy back to the caller's buffer.
    copy_to_dest_array<sel>(dest, data);
}
// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Execution continues afterwards so the remaining trials still run.
static void report_cuda_failure(cudaError_t status, const char* what){
    if (status != cudaSuccess)
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
}

// Runs and times one trial of copy approach `sel`.
//
// rdata: host buffer (NUM_OF_DATA ints) that receives the kernel's result.
// hdata: host buffer (NUM_OF_DATA ints) uploaded as the kernel's input.
// ddata: device buffer (NUM_OF_DATA ints) the kernel reads and overwrites.
//
// The kernel is bracketed by two CUDA events; the elapsed time between them
// is printed in milliseconds. Return codes of the runtime calls are reported
// instead of being silently dropped.
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
    cudaEvent_t start, stop;
    report_cuda_failure(cudaEventCreate(&start), "cudaEventCreate(start)");
    report_cuda_failure(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    report_cuda_failure(cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice),
                        "host-to-device copy");

    report_cuda_failure(cudaEventRecord(start), "cudaEventRecord(start)");
    copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
    report_cuda_failure(cudaEventRecord(stop), "cudaEventRecord(stop)");

    // Query the launch status first and the execution status second, as two
    // separate statements: inside a single stream expression the order in
    // which the two calls run is unspecified before C++17.
    const cudaError_t launch_status = cudaPeekAtLastError();
    const cudaError_t sync_status = cudaDeviceSynchronize();
    cout << "kernel error: " << cudaGetErrorString(launch_status) << "---" << cudaGetErrorString(sync_status) << endl;

    report_cuda_failure(cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost),
                        "device-to-host copy");
    report_cuda_failure(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    // Initialised so a failed timing query prints 0 rather than garbage.
    float et = 0.0f;
    report_cuda_failure(cudaEventElapsedTime(&et, start, stop), "cudaEventElapsedTime");
    cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
    /*
    cout << "after kernel processing" << endl;
    for (int i = 0; i < NUM_OF_DATA; i++)
        cout << rdata[i] << " ";
    cout << endl;
    */
    report_cuda_failure(cudaEventDestroy(start), "cudaEventDestroy(start)");
    report_cuda_failure(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
}
// Entry point: fills the host input with random values, allocates the device
// buffer, runs the six copy-approach trials in order, and releases the buffer.
// Returns 0 on success, 1 if the device could not be selected or the device
// buffer could not be allocated.
int main(int argc, char **argv){
    int h_data[NUM_OF_DATA];
    int r_data[NUM_OF_DATA];
    int* d_data = NULL;
    int i;

    cudaError_t status = cudaSetDevice(0);
    if (status != cudaSuccess){
        cout << "cudaSetDevice failed: " << cudaGetErrorString(status) << endl;
        return 1;
    }

    srand(time(NULL));
    // Fill the input before it is uploaded. Previously this loop was commented
    // out together with its debug printing, so run_test copied uninitialised
    // stack memory to the device.
    for (i = 0; i < NUM_OF_DATA; i++){
        h_data[i] = rand()%100;
    }
    /*
    cout << "before kernel processing" << endl;
    for (i = 0; i < NUM_OF_DATA; i++)
        cout << h_data[i] << " ";
    cout << endl;
    */

    status = cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
    if (status != cudaSuccess){
        cout << "cudaMalloc failed: " << cudaGetErrorString(status) << endl;
        return 1;
    }

    run_test<1>(r_data, h_data, d_data);
    run_test<2>(r_data, h_data, d_data);
    run_test<3>(r_data, h_data, d_data);
    run_test<4>(r_data, h_data, d_data);
    run_test<5>(r_data, h_data, d_data);
    run_test<6>(r_data, h_data, d_data);

    // Release the device buffer, which was previously leaked.
    cudaFree(d_data);
    return 0;
}
#包括
#包括
#包括
#包括
#包括
const int NUM_OF_块=1;
const int NUM_每个块的线程数=8;
const int NUM_OF_DATA=6000;
const int NUM_OF_COPIES=1000;
//const int NUM_OF_COPIES=1000000;
错误状态;
__共享的开始索引[每个块的线程数];
__共享的线程结束索引[每个块的线程数];
__shared_uuuint src[NUM_OF_DATA];
__共享的目的地[数据的数量];
使用名称空间std;
__设备\无效初始化(){
每个线程的未签名整数num\u=每个块的线程数/num\u;
unsigned int extra_data=NUM_OF_data%NUM_OF_THREADS_PER_BLOCK;
int size[每个块的线程数];
start_index[threadIdx.x]=threadIdx.x*num_每个_线程的_数据_;
if(threadIdx.x<额外数据){
开始索引[threadIdx.x]=开始索引[threadIdx.x]+threadIdx.x;
size[threadIdx.x]=每个线程的线程数据数量+1;
}否则{
开始索引[threadIdx.x]=开始索引[threadIdx.x]+额外数据;
size[threadIdx.x]=每个线程的线程数;
}
end_index[threadIdx.x]=start_index[threadIdx.x]+size[threadIdx.x]-1;
//printf(“开始索引[%d]=%d,结束索引[%d]=%d\n”,threadIdx.x,开始索引[threadIdx.x],threadIdx.x,结束索引[threadIdx.x]);
}
__设备无效inc\u src\u数据(int*src){
int i;
对于(i=0;i 对于(i=start_index[threadIdx.x];i
`__threadfence_block()` 似乎是一个开销很大的操作。耗时最长的 4 个测试用例都使用了 `__threadfence_block()`,而耗时最短的两个测试用例没有使用。
如果我把 `__threadfence_block()` 添加到第三个(即耗时最短的)测试用例中,则计时(在我的机器上)会从约 2 秒变为约 17 秒。
请注意,您的测试用例并非都在做完全相同的事情,输出结果的差异就证明了这一点。我对您的代码进行了修改,以更清楚地说明这一点:
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
// Number of blocks in the grid used for every kernel launch in run_test.
const int NUM_OF_BLOCKS = 1;
// Threads per block for every launch; also the length of the per-thread
// index tables (start_index / end_index) declared below.
const int NUM_OF_THREADS_PER_BLOCK = 8;
// Number of int elements in each buffer (host arrays, device allocation,
// and the shared src / dest arrays).
const int NUM_OF_DATA = 50;
// Number of increment-then-copy iterations the kernel performs.
const int NUM_OF_COPIES= 10000000;
// Not referenced anywhere in the visible code.
cudaError_t cuda_status;
// First element index assigned to each thread; written by init().
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
// Last (inclusive) element index assigned to each thread; written by init().
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
// Shared-memory source buffer: incremented and copied from inside the kernel loop.
__shared__ int src[NUM_OF_DATA];
// Shared-memory destination buffer: target of every copy inside the kernel loop.
__shared__ int dest[NUM_OF_DATA];
using namespace std;
// Stores the calling thread's inclusive element range in
// start_index[threadIdx.x] and end_index[threadIdx.x].
// NUM_OF_DATA is divided evenly among NUM_OF_THREADS_PER_BLOCK threads; the
// leftover elements are handed out one each to the lowest-numbered threads.
__device__ void init(){
    const unsigned int me = threadIdx.x;
    const unsigned int per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
    const unsigned int leftover = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;

    int first = me * per_thread;
    int count = per_thread;
    if (me < leftover){
        // Each lower-numbered thread took one extra element, and so does this one.
        first += me;
        count += 1;
    }else{
        // All leftover elements were consumed by lower-numbered threads.
        first += leftover;
    }

    start_index[me] = first;
    end_index[me] = first + count - 1;
}
// Increments each of the NUM_OF_DATA ints beginning at `src` by one.
// The traversal does not depend on the thread index, so each calling thread
// visits every element.
__device__ void inc_src_data(int* src){
    int* const stop = src + NUM_OF_DATA;
    while (src != stop){
        *src += 1;
        ++src;
    }
    //__threadfence_block();
}
// Copies NUM_OF_DATA ints from `src` to `dest` using the strategy selected at
// compile time by `sel`:
//   1 - every thread memcpy's the whole array
//   2 - thread 0 memcpy's the whole array, then issues a block-level fence
//   3 - every thread copies the whole array element by element, then fences
//   4 - thread 0 copies element by element; every thread then fences
//   5 - every thread memcpy's its own [start_index, end_index] slice, then fences
//   6 - every thread copies its own slice element by element, then fences
// Any other value of `sel` trips assert(0).
// Approaches 5 and 6 read the slice bounds that init() stored for this thread.
//
// memcpy takes a size in BYTES. The memcpy-based approaches previously passed
// the element count, so they copied only a quarter of the ints and produced
// different results from the element-wise approaches. Every count is now
// multiplied by sizeof(int).
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
    int i;
    switch (sel){
    case 1:
        // Approach 1: every thread executes memcpy
        memcpy(dest, src, NUM_OF_DATA * sizeof(int));
        break;
    case 2:
        // Approach 2: one thread executes memcpy and then threadfence
        if (threadIdx.x == 0){
            memcpy(dest, src, NUM_OF_DATA * sizeof(int));
            __threadfence_block();
        }
        break;
    case 3:
        // Approach 3: every thread copies each element individually
        for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
            *dest = *src;
        __threadfence_block(); // added this line to demonstrate timing difference
        break;
    case 4:
        // Approach 4: one thread copies each element individually and then threadfence
        if (threadIdx.x == 0)
            for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
                *dest = *src;
        // The fence is outside the branch, so all threads execute it.
        __threadfence_block();
        break;
    case 5:
        // Approach 5: every thread executes memcpy on its own slice and then threadfence
        memcpy(dest + start_index[threadIdx.x],
               src + start_index[threadIdx.x],
               (end_index[threadIdx.x] - start_index[threadIdx.x] + 1) * sizeof(int));
        __threadfence_block();
        break;
    case 6:
        // Approach 6: every thread copies its own slice element by element and then threadfence
        for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
            *(dest + i) = *(src + i);
        }
        __threadfence_block();
        break;
    default:
        assert(0);
        break;
    }
}
// Benchmark kernel for copy approach `sel` (see copy_to_dest_array).
// It sets up the per-thread slice bounds, loads `data` into the shared `src`
// buffer, repeats "increment src, copy src -> dest" NUM_OF_COPIES times, and
// finally writes `dest` back into `data`.
//
// data: buffer of NUM_OF_DATA ints, read at the start and overwritten at the end.
template <int sel>
__global__ void copy_data_test(int* data){
    init();

    // Load the input into the shared source buffer.
    copy_to_dest_array<sel>(data, src);

    // Workload being timed.
    for (int remaining = NUM_OF_COPIES; remaining > 0; --remaining){
        inc_src_data(src);
        copy_to_dest_array<sel>(src, dest);
    }

    // Write the last copy back out.
    copy_to_dest_array<sel>(dest, data);
}
// Runs one timed trial of copy approach `sel` and prints the outcome.
//
// rdata: host buffer (NUM_OF_DATA ints) that receives the kernel's result.
// hdata: host buffer (NUM_OF_DATA ints) uploaded as the kernel's input.
// ddata: device buffer (NUM_OF_DATA ints) the kernel reads and overwrites.
//
// Prints the kernel error status, the elapsed time between the two events in
// milliseconds, and every element of the result.
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
    const size_t buffer_bytes = NUM_OF_DATA * sizeof(int);

    cudaEvent_t t_begin, t_end;
    cudaEventCreate(&t_begin);
    cudaEventCreate(&t_end);

    // Upload the input, then bracket the kernel with the two events.
    cudaMemcpy(ddata, hdata, buffer_bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(t_begin);
    copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
    cudaEventRecord(t_end);

    cout << "kernel error: "
         << cudaGetErrorString(cudaPeekAtLastError())
         << "---"
         << cudaGetErrorString(cudaDeviceSynchronize())
         << endl;

    // Fetch the result and the timing.
    cudaMemcpy(rdata, ddata, buffer_bytes, cudaMemcpyDeviceToHost);
    cudaEventSynchronize(t_end);
    float elapsed_ms;
    cudaEventElapsedTime(&elapsed_ms, t_begin, t_end);
    cout << "Trial " << sel << " elapsed time: " << elapsed_ms << "ms" << endl;

    cout << "after kernel processing" << endl;
    for (int k = 0; k < NUM_OF_DATA; ++k){
        cout << rdata[k] << " ";
    }
    cout << endl;

    cudaEventDestroy(t_begin);
    cudaEventDestroy(t_end);
}
// Entry point: fills the host input with random values in [0, 99] and prints
// them, allocates the device buffer, runs the six copy-approach trials in
// order, and releases the buffer.
// Returns 0 on success, 1 if the device could not be selected or the device
// buffer could not be allocated.
int main(int argc, char **argv){
    int h_data[NUM_OF_DATA];
    int r_data[NUM_OF_DATA];
    int* d_data = NULL;
    int i;

    cudaError_t status = cudaSetDevice(0);
    if (status != cudaSuccess){
        cout << "cudaSetDevice failed: " << cudaGetErrorString(status) << endl;
        return 1;
    }

    srand(time(NULL));
    cout << "before kernel processing" << endl;
    for (i = 0; i < NUM_OF_DATA; i++){
        h_data[i] = rand()%100;
        cout << h_data[i] << " ";
    }
    cout << endl;

    // Stop here if the allocation fails instead of launching kernels on an
    // invalid pointer.
    status = cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
    if (status != cudaSuccess){
        cout << "cudaMalloc failed: " << cudaGetErrorString(status) << endl;
        return 1;
    }

    run_test<1>(r_data, h_data, d_data);
    run_test<2>(r_data, h_data, d_data);
    run_test<3>(r_data, h_data, d_data);
    run_test<4>(r_data, h_data, d_data);
    run_test<5>(r_data, h_data, d_data);
    run_test<6>(r_data, h_data, d_data);

    // Release the device buffer, which was previously leaked.
    cudaFree(d_data);
    return 0;
}
#包括
#包括
#包括
#包括
#包括
const int NUM_OF_块=1;
const int NUM_每个块的线程数=8;
const int NUM_OF_DATA=50;
const int NUM_OF_COPIES=10000000;
错误状态;
__共享的开始索引[每个块的线程数];
__共享的线程结束索引[每个块的线程数];
__shared_uuuint src[NUM_OF_DATA];
__共享的目的地[数据的数量];
使用名称空间std;
__设备\无效初始化(){
每个线程的未签名整数num\u=每个块的线程数/num\u;
unsigned int extra_data=NUM_OF_data%NUM_OF_THREADS_PER_BLOCK;
int size[每个块的线程数];
start_index[threadIdx.x]=threadIdx.x*num_每个_线程的_数据_;
if(threadIdx.x<额外数据){
开始索引[threadIdx.x]=开始索引[threadIdx.x]+threadIdx.x;
size[threadIdx.x]=每个线程的线程数据数量+1;
}否则{
开始索引[threadIdx.x]=开始索引[threadIdx.x]+额外数据;
size[threadIdx.x]=每个线程的线程数;
}
end_index[threadIdx.x]=start_index[threadIdx.x]+size[threadIdx.x]-1;
}
__设备无效inc\u src\u数据(int*src){
int i;
对于(i=0;i