多gpu CUDA推力 我有一个CUDA C++代码,它使用当前在单个GPU上正确工作的推力。我现在想修改它的多gpu。我有一个主机函数,它包括许多在设备阵列上排序、复制、计算差异等的推力调用。我想使用每个GPU同时在它自己的(独立的)阵列集上运行这一系列推力调用。我已经读到返回值的推力函数是同步的,但是我可以使用OpenMP让每个主机线程调用一个在单独的GPU上运行的函数(带有推力调用)吗
例如(在浏览器中编码):多gpu CUDA推力 我有一个CUDA C++代码,它使用当前在单个GPU上正确工作的推力。我现在想修改它的多gpu。我有一个主机函数,它包括许多在设备阵列上排序、复制、计算差异等的推力调用。我想使用每个GPU同时在它自己的(独立的)阵列集上运行这一系列推力调用。我已经读到返回值的推力函数是同步的,但是我可以使用OpenMP让每个主机线程调用一个在单独的GPU上运行的函数(带有推力调用)吗,cuda,openmp,thrust,Cuda,Openmp,Thrust,例如(在浏览器中编码): #pragma omp parallel for 对于(int-dev=0;devYes),您可以将推力和OpenMP结合使用 下面是一个完整的工作示例和结果: $ cat t340.cu #include <omp.h> #include <stdio.h> #include <stdlib.h> #include <thrust/host_vector.h> #include <thrust/device_ve
#pragma omp parallel for
对于(int-dev=0;devYes),您可以将推力和OpenMP结合使用
下面是一个完整的工作示例和结果:
$ cat t340.cu
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <time.h>
#include <sys/time.h>
#define DSIZE 200000000
using namespace std;
int main(int argc, char *argv[])
{
timeval t1, t2;
int num_gpus = 0; // number of CUDA GPUs
printf("%s Starting...\n\n", argv[0]);
// determine the number of CUDA capable GPUs
cudaGetDeviceCount(&num_gpus);
if (num_gpus < 1)
{
printf("no CUDA capable devices were detected\n");
return 1;
}
// display CPU and GPU configuration
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
for (int i = 0; i < num_gpus; i++)
{
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
printf("initialize data\n");
// initialize data
typedef thrust::device_vector<int> dvec;
typedef dvec *p_dvec;
std::vector<p_dvec> dvecs;
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp = new dvec(DSIZE);
dvecs.push_back(temp);
}
thrust::host_vector<int> data(DSIZE);
thrust::generate(data.begin(), data.end(), rand);
// copy data
for (unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
thrust::copy(data.begin(), data.end(), (*(dvecs[i])).begin());
}
printf("start sort\n");
gettimeofday(&t1,NULL);
// run as many CPU threads as there are CUDA devices
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
thrust::sort((*(dvecs[cpu_thread_id])).begin(), (*(dvecs[cpu_thread_id])).end());
cudaDeviceSynchronize();
}
gettimeofday(&t2,NULL);
printf("finished\n");
unsigned long et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
if (cudaSuccess != cudaGetLastError())
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
printf("sort time = %fs\n", (float)et/(float)(1000000));
// check results
thrust::host_vector<int> result(DSIZE);
thrust::sort(data.begin(), data.end());
for (int i = 0; i < num_gpus; i++)
{
cudaSetDevice(i);
thrust::copy((*(dvecs[i])).begin(), (*(dvecs[i])).end(), result.begin());
for (int j = 0; j < DSIZE; j++)
if (data[j] != result[j]) { printf("mismatch on device %d at index %d, host: %d, device: %d\n", i, j, data[j], result[j]); return 1;}
}
printf("Success\n");
return 0;
}
$ nvcc -Xcompiler -fopenmp -O3 -arch=sm_20 -o t340 t340.cu -lgomp
$ CUDA_VISIBLE_DEVICES="0" ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 1
0: Tesla M2050
initialize data
start sort
finished
sort time = 0.398922s
Success
$ ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 4
0: Tesla M2050
1: Tesla M2070
2: Tesla M2050
3: Tesla M2070
initialize data
start sort
finished
sort time = 0.460058s
Success
$
$cat t340.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义DSIZE 200000000
使用名称空间std;
int main(int argc,char*argv[])
{
时间值t1,t2;
int num_gpu=0;//CUDA GPU的数量
printf(“%s开始…\n\n”,argv[0]);
//确定支持CUDA的GPU的数量
cudaGetDeviceCount(&numgpu);
如果(numgpus<1)
{
printf(“未检测到支持CUDA的设备\n”);
返回1;
}
//显示CPU和GPU配置
printf(“主机CPU数量:\t%d\n”,omp_get_num_procs());
printf(“CUDA设备的数量:\t%d\n”,num\u gpu);
对于(int i=0;i
我们可以看到,当我将程序限制为使用单个设备时,排序操作大约需要0.4秒。然后,当我允许它使用所有4个设备(在所有4个设备上重复相同的排序)时,整个操作只需要0.46秒,即使我们做了4倍的工作
对于这个特殊情况,我碰巧使用了CUDA 5.0和推力v1.7,以及gcc 4.4.6(RHEL 6.2)为什么您需要从不同的主机线程设置设备?您能用一个像中那样简单的例子吗?大概是一种解决某些推力操作阻塞主机线程这一事实的方法。这样的加速比不是理想的4x,而是0.4*4/0.46=3.48x。干得好!
$ cat t340.cu
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <time.h>
#include <sys/time.h>
#define DSIZE 200000000
using namespace std;
int main(int argc, char *argv[])
{
timeval t1, t2;
int num_gpus = 0; // number of CUDA GPUs
printf("%s Starting...\n\n", argv[0]);
// determine the number of CUDA capable GPUs
cudaGetDeviceCount(&num_gpus);
if (num_gpus < 1)
{
printf("no CUDA capable devices were detected\n");
return 1;
}
// display CPU and GPU configuration
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
for (int i = 0; i < num_gpus; i++)
{
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
printf("initialize data\n");
// initialize data
typedef thrust::device_vector<int> dvec;
typedef dvec *p_dvec;
std::vector<p_dvec> dvecs;
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp = new dvec(DSIZE);
dvecs.push_back(temp);
}
thrust::host_vector<int> data(DSIZE);
thrust::generate(data.begin(), data.end(), rand);
// copy data
for (unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
thrust::copy(data.begin(), data.end(), (*(dvecs[i])).begin());
}
printf("start sort\n");
gettimeofday(&t1,NULL);
// run as many CPU threads as there are CUDA devices
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
thrust::sort((*(dvecs[cpu_thread_id])).begin(), (*(dvecs[cpu_thread_id])).end());
cudaDeviceSynchronize();
}
gettimeofday(&t2,NULL);
printf("finished\n");
unsigned long et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
if (cudaSuccess != cudaGetLastError())
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
printf("sort time = %fs\n", (float)et/(float)(1000000));
// check results
thrust::host_vector<int> result(DSIZE);
thrust::sort(data.begin(), data.end());
for (int i = 0; i < num_gpus; i++)
{
cudaSetDevice(i);
thrust::copy((*(dvecs[i])).begin(), (*(dvecs[i])).end(), result.begin());
for (int j = 0; j < DSIZE; j++)
if (data[j] != result[j]) { printf("mismatch on device %d at index %d, host: %d, device: %d\n", i, j, data[j], result[j]); return 1;}
}
printf("Success\n");
return 0;
}
$ nvcc -Xcompiler -fopenmp -O3 -arch=sm_20 -o t340 t340.cu -lgomp
$ CUDA_VISIBLE_DEVICES="0" ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 1
0: Tesla M2050
initialize data
start sort
finished
sort time = 0.398922s
Success
$ ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 4
0: Tesla M2050
1: Tesla M2070
2: Tesla M2050
3: Tesla M2070
initialize data
start sort
finished
sort time = 0.460058s
Success
$