CUDA shared memory copy comparison: which approach is better?


When copying one array to another in shared memory, I tried six different approaches (see the comments in the program). After some discussion and testing, my conclusions are: (1) memcpy is no faster than copying the array element by element; (2) for small arrays, approach 3 is best, while for larger arrays, approach 6 is best.
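
For reference, the core of "approach 6" (each thread copies its own slice of the shared array, followed by a block-level fence) can be distilled into a small device helper like the one below. This is my own minimal sketch, not part of the test program that follows; it uses a strided loop instead of the precomputed start/end indices, and copy_shared_slice is just an illustrative name.

__device__ void copy_shared_slice(const int *src, int *dest, int n){
    // strided partition: thread t handles elements t, t + blockDim.x, t + 2*blockDim.x, ...
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        dest[i] = src[i];
    __threadfence_block();   // make this thread's writes visible to the rest of the block
}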

#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>

const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 6000;
const int NUM_OF_COPIES= 1000;
//const int NUM_OF_COPIES= 1000000;

cudaError_t cuda_status;

__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];

using namespace std;

__device__ void init(){
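   // Partition NUM_OF_DATA elements across the block's threads: each thread computes
   // its own [start_index, end_index] range, with the first extra_data threads taking one extra element.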
   unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
   unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
   int size[NUM_OF_THREADS_PER_BLOCK];

   start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;

   if (threadIdx.x < extra_data){
       start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
       size[threadIdx.x] = num_of_data_per_thread + 1;
   }else{
       start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
       size[threadIdx.x] = num_of_data_per_thread ;
   }

   end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] -1;
   //printf("start_index[%d] = %d, end_index[%d] = %d\n", threadIdx.x, start_index[threadIdx.x], threadIdx.x, end_index[threadIdx.x]);
}

__device__ void inc_src_data(int* src){
  int i;

  for (i = 0; i < NUM_OF_DATA; i++, src++){
      *src += 1;
  }
  //__threadfence_block();
}

template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
   int i;

   switch (sel){
   case 1:

   // Approach 1: every thread executes memcpy
   memcpy(dest, src, NUM_OF_DATA * sizeof(int));
   break;

   case 2:
   // Approach 2: one thread executes memcpy and then threadfence
   if (threadIdx.x == 0){
      memcpy(dest, src, NUM_OF_DATA * sizeof(int));
      __threadfence_block();
   }
   break;
   case 3:
   // Approach 3: every thread copies each element individually
   for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
       *dest = *src;
      //__threadfence_block();  // added this line to demonstrate timing difference
   break;
   case 4:
   // Approach 4: one thread copy each element individually and then threadfence
   if (threadIdx.x == 0)
       for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
            *dest = *src;
   __threadfence_block();
   break;
   case 5:
   // Approach 5: every thread execute memcpy and then threadfence
   memcpy(dest+start_index[threadIdx.x], src + start_index[threadIdx.x], (end_index[threadIdx.x] - start_index[threadIdx.x] + 1) * sizeof(int));
   __threadfence_block();
   break;
   case 6:
   // Approach 6: every thread copies each element individually and then threadfence
   for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
       dest[i] = src[i];
   }
   __threadfence_block();
   break;
   default:
   assert(0);
   break;
   }
}

template <int sel>
__global__ void copy_data_test(int* data){
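   // Stage the input in shared memory, repeatedly increment src and copy it into dest,
   // then write dest back to global memory.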
   init();

   copy_to_dest_array<sel>(data, src);

   for (int i = 0; i < NUM_OF_COPIES; i++){
       inc_src_data(src);
       copy_to_dest_array<sel>(&src[0], &dest[0]);
   }
   copy_to_dest_array<sel>(dest, data);

}

template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
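  // Copy the input to the device, time the kernel with CUDA events, then copy the result back.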
  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);
  cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
  cudaEventRecord(start);
  copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
  cudaEventRecord(stop);
  cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;

  cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
  cudaEventSynchronize(stop);
  float et;
  cudaEventElapsedTime(&et, start, stop);
  cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
/*
  cout << "after kernel processing" << endl;
  for (int i = 0; i < NUM_OF_DATA; i++)
      cout << rdata[i] << " ";
  cout << endl;
*/
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

}

int main(int argc, char **argv){
  int h_data[NUM_OF_DATA];
  int r_data[NUM_OF_DATA];
  int* d_data;
  int i;

  cudaSetDevice(0);

  srand(time(NULL));
/*
  cout << "before kernel processing" << endl;
  for (i = 0; i < NUM_OF_DATA; i++){
      h_data[i] = rand()%100;
      cout << h_data[i] << " ";
  }
  cout << endl;
*/
  cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);

  run_test<1>(r_data, h_data, d_data);
  run_test<2>(r_data, h_data, d_data);
  run_test<3>(r_data, h_data, d_data);
  run_test<4>(r_data, h_data, d_data);
  run_test<5>(r_data, h_data, d_data);
  run_test<6>(r_data, h_data, d_data);

  return 0;
}

__threadfence_block() appears to be an expensive operation, at least for me. The four longest-running test cases all use __threadfence_block(); the two shortest do not. If I add __threadfence_block() to the third (i.e. shortest) test case, the timing (for me) changes from about 2 seconds to about 17 seconds.

Note that your test cases are not all doing exactly the same thing, as the differences in their output demonstrate. I modified your code to illustrate this more clearly:

#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>

const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 50;
const int NUM_OF_COPIES= 10000000;

cudaError_t cuda_status;

__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];

using namespace std;

__device__ void init(){
   unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
   unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
   int size[NUM_OF_THREADS_PER_BLOCK];

   start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;

   if (threadIdx.x < extra_data){
       start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
       size[threadIdx.x] = num_of_data_per_thread + 1;
   }else{
       start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
       size[threadIdx.x] = num_of_data_per_thread ;
   }

   end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] -1;
}

__device__ void inc_src_data(int* src){
  int i;

  for (i = 0; i < NUM_OF_DATA; i++, src++){
      *src += 1;
  }
  //__threadfence_block();
}

template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
   int i;

   switch (sel){
   case 1:

   // Approach 1: every thread executes memcpy
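   // (note: memcpy takes a byte count; approaches 1, 2 and 5 pass element counts here,
   // so they copy fewer bytes than the element-wise loops, which is one reason the outputs differ)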
   memcpy(dest, src, NUM_OF_DATA);
   break;

   case 2:
   // Approach 2: one thread executes memcpy and then threadfence
   if (threadIdx.x == 0){
      memcpy(dest, src, NUM_OF_DATA);
      __threadfence_block();
   }
   break;
   case 3:
   // Approach 3: every thread copies each element individually
   for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
       *dest = *src;
      __threadfence_block();  // added this line to demonstrate timing difference
   break;
   case 4:
   // Approach 4: one thread copy each element individually and then threadfence
   if (threadIdx.x == 0)
       for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
            *dest = *src;
   __threadfence_block();
   break;
   case 5:
   // Approach 5: every thread execute memcpy and then threadfence
   memcpy(dest+start_index[threadIdx.x], src + start_index[threadIdx.x], end_index[threadIdx.x] - start_index[threadIdx.x] + 1);
   __threadfence_block();
   break;
   case 6:
   // Approach 6: every thread copies each element individually and then threadfence
   for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
       *(dest + i) = *(src + i);
   }
   __threadfence_block();
   break;
   default:
   assert(0);
   break;
   }
}

template <int sel>
__global__ void copy_data_test(int* data){
   init();

   copy_to_dest_array<sel>(data, src);

   for (int i = 0; i < NUM_OF_COPIES; i++){
       inc_src_data(src);
       copy_to_dest_array<sel>(&src[0], &dest[0]);
   }
   copy_to_dest_array<sel>(dest, data);

}

template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);
  cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
  cudaEventRecord(start);
  copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
  cudaEventRecord(stop);
  cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;

  cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
  cudaEventSynchronize(stop);
  float et;
  cudaEventElapsedTime(&et, start, stop);
  cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
  cout << "after kernel processing" << endl;
  for (int i = 0; i < NUM_OF_DATA; i++)
      cout << rdata[i] << " ";
  cout << endl;
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

}



int main(int argc, char **argv){
  int h_data[NUM_OF_DATA];
  int r_data[NUM_OF_DATA];
  int* d_data;
  int i;

  cudaSetDevice(0);

  srand(time(NULL));
  cout << "before kernel processing" << endl;
  for (i = 0; i < NUM_OF_DATA; i++){
      h_data[i] = rand()%100;
      cout << h_data[i] << " ";
  }
  cout << endl;
  cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);

  run_test<1>(r_data, h_data, d_data);
  run_test<2>(r_data, h_data, d_data);
  run_test<3>(r_data, h_data, d_data);
  run_test<4>(r_data, h_data, d_data);
  run_test<5>(r_data, h_data, d_data);
  run_test<6>(r_data, h_data, d_data);

  return 0;
}
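
Separately from the modified program above, the cost of the fence by itself can be estimated with a small stand-alone kernel, timed once without and once with the __threadfence_block() call. The sketch below is my own illustration (the kernel name and the constants are made up for this example), not part of the original code:

#include <cstdio>

const int N_THREADS = 256;
const int N_ITERS = 1000000;

__global__ void fence_cost_test(int *out, bool use_fence){
  __shared__ int buf[N_THREADS];
  int sum = 0;
  for (int it = 0; it < N_ITERS; it++){
      buf[threadIdx.x] = threadIdx.x + it;   // each thread writes its own shared slot
      if (use_fence)
          __threadfence_block();             // order the write before the read below
      sum += buf[threadIdx.x];
  }
  out[threadIdx.x] = sum;                    // keep the result observable so the loop is not optimized away
}

int main(){
  int *d_out;
  cudaMalloc(&d_out, N_THREADS * sizeof(int));

  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);

  for (int use_fence = 0; use_fence <= 1; use_fence++){
      cudaEventRecord(start);
      fence_cost_test<<<1, N_THREADS>>>(d_out, use_fence != 0);
      cudaEventRecord(stop);
      cudaEventSynchronize(stop);
      float ms;
      cudaEventElapsedTime(&ms, start, stop);
      printf("use_fence=%d: %f ms\n", use_fence, ms);
  }

  cudaEventDestroy(start); cudaEventDestroy(stop);
  cudaFree(d_out);
  return 0;
}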