Matrix CUDA:将数据从GPU直接发送到GPU
我有两块GPU卡(Tesla C2070,支持P2P和UVA),我想使用CUDA在两块卡之间直接发送和接收矩阵数据。矩阵内容如下,而我目前只能按后面的代码发送连续的元素:
- 在GPU A中,我有一个矩阵:
a11 a12 a13 a14
a21 a22 a23 a24
- 在GPU B中,我有另一个矩阵:
b11 b12 b13 b14
b21 b22 b23 b24
// Minimal two-GPU example from the question: allocate a pitched N-wide,
// M-row matrix on each device, enable peer access in both directions,
// then copy elements between the two devices with a plain cudaMemcpy.
//
// NOTE(review): the single `pitch` variable is overwritten by the second
// cudaMallocPitch call, and the cudaMemcpy below ignores pitch entirely --
// it can only move a contiguous run of elements. That limitation is
// exactly what the question asks how to overcome (see cudaMemcpy2D below).
int main(void)
{
float *d_a, *d_b;
int N = 4;
int M = 2;
size_t pitch;
// Pitched allocation on device 0; allow device 1 to access it peer-to-peer.
cudaSetDevice(0);
cudaMallocPitch(&d_a, &pitch, sizeof(float)*N, M);
cudaDeviceEnablePeerAccess(1, 0);
// Matching allocation on device 1; allow access back to device 0.
cudaSetDevice(1);
cudaMallocPitch(&d_b, &pitch, sizeof(float)*N, M);
cudaDeviceEnablePeerAccess(0, 0);
//Initialization for d_a
//Initialization for d_b
//Copy M*N/2 element from d_a to d_b, starting from d_a[1]
// With UVA, cudaMemcpyDefault lets the runtime infer the transfer
// direction (here device-to-device) from the two pointers.
cudaMemcpy(&d_b[1], &d_a[1], M*N/2*sizeof(float), cudaMemcpyDefault);
//Print result d_b
}
如何将矩阵的最后两列从GPU A直接发送到GPU B,使得在GPU B上得到:
b11 b12 a13 a14
b21 b22 a23 a24
类似地,如何将矩阵的第一行从GPU A发送到GPU B,使得在GPU B上得到:
a11 a12 a13 a14
b21 b22 b23 b24
另外,如果我有如下的一维数组:a1 a2 a3 a4 a5 a6 a7 a8
如何从GPU A发送元素1、4、7、…(每隔3个元素)去替换GPU B上对应位置的元素?

(回答)您需要了解的API调用是
cudaMemcpy2D
。它可以相当直接地复制全部或部分pitched(带行跨距)数据,是 cudaMallocPitch
的自然配套接口。
如果暂时撇开问题中的多GPU方面不谈,只关注pitched数据的复制(在UVA平台上,GPU到GPU的传输如何完成基本上是一个您无需关心的实现细节),需要做的只有:用 cudaMallocPitch 分配存储,并在复制时使用每次分配所
返回的pitch值。注意,您应该为每个分配得到的指针分别保存对应的pitch值:API不保证对两个相同大小的不同分配返回相同的pitch,当分配位于不同设备上时尤其如此。下面的示例代码用设备端 printf
来显示复制操作的结果:
#include <cstdio>
// Non-owning view of a pitched m x n matrix stored column-major:
// column j begins `pitch` bytes after column j-1. Usable from both
// host and device code.
struct mat
{
int m, n;       // rows, columns
size_t pitch;   // byte stride between the starts of consecutive columns
char *ptr;      // base address of the storage, in bytes
__device__ __host__
mat(int _m, int _n, size_t _pitch, char *_ptr) : m(_m), n(_n), pitch(_pitch), ptr(_ptr) {};
// Address of element (i,j): step j columns by pitch bytes, then i floats.
__device__ __host__ float * getptr(int i=0, int j=0) {
char *colbase = ptr + j*pitch;
return ((float *)colbase) + i;
};
// Element access by (row, column).
__device__ __host__ float& operator() (int i, int j) {
float *elem = getptr(i, j);
return *elem;
};
// Dump the whole matrix row by row (device-side printf when called
// from a kernel).
__device__ __host__
void print() {
for(int row=0; row<m; row++) {
for(int col=0; col<n; col++) {
printf("%4.f ", (*this)(row, col));
}
printf("\n");
}
};
};
__global__ void printmat(struct mat x) { x.print(); }
// Abort with file/line context on any CUDA runtime failure.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Demonstrates cudaMemcpy2D with pitched allocations:
//   1. fill two M x N column-major host matrices (A: 0..49, B: 100..149),
//   2. upload both to separately pitched device allocations,
//   3. copy a ci x cj sub-block of B into A entirely on the device,
//   4. print each stage from a kernel to verify.
int main(void)
{
    const int M = 5, N = 10;
    const size_t hostpitch = M * sizeof(float);  // host columns are densely packed

    float *a = new float[M*N], *b = new float[M*N];
    mat A(M, N, hostpitch, (char *)(a));
    mat B(M, N, hostpitch, (char *)(b));
    for(int v=0, j=0; j<N; j++) {
        for(int i=0; i<M; i++) {
            A(i,j) = (float)v; B(i,j) = (float)(100+v++);
        }
    }

    char *d_a, *d_b;
    size_t pitch_a, pitch_b;
    // Keep one pitch per allocation: the API does not guarantee the same
    // pitch for two allocations, even of identical size.
    CUDA_CHECK(cudaMallocPitch((void **)&d_a, &pitch_a, sizeof(float)*M, N));
    CUDA_CHECK(cudaMallocPitch((void **)&d_b, &pitch_b, sizeof(float)*M, N));
    mat Ad(M, N, pitch_a, d_a); mat Bd(M, N, pitch_b, d_b);

    // Host->device uploads; the copied width (in bytes) is the packed
    // host column size, one "row" of the 2D copy per matrix column.
    CUDA_CHECK(cudaMemcpy2D(Ad.getptr(), Ad.pitch, A.getptr(), A.pitch,
                            A.pitch, A.n, cudaMemcpyHostToDevice));
    printmat<<<1,1>>>(Ad);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors
    CUDA_CHECK(cudaMemcpy2D(Bd.getptr(), Bd.pitch, B.getptr(), B.pitch,
                            B.pitch, B.n, cudaMemcpyHostToDevice));
    printmat<<<1,1>>>(Bd);
    CUDA_CHECK(cudaGetLastError());

    // Device->device copy of a ci x cj block starting at element (1,1);
    // source and destination pitches may differ, cudaMemcpy2D handles it.
    int ci = 3, cj = 3;
    CUDA_CHECK(cudaMemcpy2D(Ad.getptr(1,1), Ad.pitch, Bd.getptr(1,1), Bd.pitch,
                            ci*sizeof(float), cj, cudaMemcpyDeviceToDevice));
    printmat<<<1,1>>>(Ad);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());  // flush device printf, surface async errors

    // Release resources (the original leaked both device and host buffers).
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    delete [] a;
    delete [] b;
    return 0;
}
(评论)欢迎来到Stack Overflow!不幸的是,除非你展示你目前为止的尝试,否则我们无法真正帮助你。你可以看看cuBLAS,并给我们展示一些代码 :)
(提问者)我已经添加了一些代码。
(提问者对回答)非常感谢你的回复,讲得非常清楚,这正是我想要的。
>nvcc -m32 -Xptxas="-v" -arch=sm_21 pitched.cu
pitched.cu
tmpxft_00001348_00000000-5_pitched.cudafe1.gpu
tmpxft_00001348_00000000-10_pitched.cudafe2.gpu
pitched.cu
ptxas : info : 0 bytes gmem, 8 bytes cmem[2]
ptxas : info : Compiling entry function '_Z8printmat3mat' for 'sm_21'
ptxas : info : Function properties for _Z8printmat3mat
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 23 registers, 48 bytes cmem[0]
tmpxft_00001348_00000000-5_pitched.cudafe1.cpp
tmpxft_00001348_00000000-15_pitched.ii
>cuda-memcheck a.exe
========= CUDA-MEMCHECK
0 5 10 15 20 25 30 35 40 45
1 6 11 16 21 26 31 36 41 46
2 7 12 17 22 27 32 37 42 47
3 8 13 18 23 28 33 38 43 48
4 9 14 19 24 29 34 39 44 49
100 105 110 115 120 125 130 135 140 145
101 106 111 116 121 126 131 136 141 146
102 107 112 117 122 127 132 137 142 147
103 108 113 118 123 128 133 138 143 148
104 109 114 119 124 129 134 139 144 149
0 5 10 15 20 25 30 35 40 45
1 106 111 116 21 26 31 36 41 46
2 107 112 117 22 27 32 37 42 47
3 108 113 118 23 28 33 38 43 48
4 9 14 19 24 29 34 39 44 49
========= ERROR SUMMARY: 0 errors