使用CUDA的矩阵乘法(线程的编号方式)
我试图抓住我的程序中的一个错误,该程序使用CUDA将平方矩阵相乘。只要无法从内核中打印f()(请告诉您是否知道如何执行),我就会向内核发送一个附加结构,以获取线程、块、行和列的数量。结构的所有元素都是-1(用于识别它们)。因此,矩阵乘积部分是不正确的,但是来自内核的数据完全把我弄糊涂了。 代码如下:使用CUDA的矩阵乘法(线程的编号方式),cuda,matrix-multiplication,Cuda,Matrix Multiplication,我试图抓住我的程序中的一个错误,该程序使用CUDA将平方矩阵相乘。只要无法从内核中打印f()(请告诉您是否知道如何执行),我就会向内核发送一个附加结构,以获取线程、块、行和列的数量。结构的所有元素都是-1(用于识别它们)。因此,矩阵乘积部分是不正确的,但是来自内核的数据完全把我弄糊涂了。 代码如下: typedef struct { int tx; int ty; int bx; int by; int rw; int cl; } THREADS; __global__ void mult1
typedef struct {
int tx;
int ty;
int bx;
int by;
int rw;
int cl;
} THREADS;
__global__ void mult1_kernel (float *a, float *b, float *c, THREADS *d, int n) {
int k;
float sum = 0.0f;
int rw = blockIdx.y * blockDim.y + threadIdx.y;
int cl = blockIdx.x * blockDim.x + threadIdx.x;
for (k = 0; k < n; k++)
sum += a[rw*n + k] * b[k*n + cl];
c[rw*n + cl] = sum;
d[rw*n + cl].tx = threadIdx.y;
d[rw*n + cl].ty = threadIdx.x;
d[rw*n + cl].bx = blockIdx.y;
d[rw*n + cl].by = blockIdx.x;
d[rw*n + cl].rw = rw;
d[rw*n + cl].cl = cl;
}
#include <stdio.h>
#include <stdlib.h>
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
__global__ void mult1_kernel (float *a, float *b, float *c, int n) {
int k;
float sum = 0.0f;
int rw = blockIdx.y * blockDim.y + threadIdx.y;
int cl = blockIdx.x * blockDim.x + threadIdx.x;
if ((rw < n) && (cl < n)) {
rw *= n;
for (k = 0; k < n; k++)
sum += a[rw + k] * b[k*n + cl];
cuPrintf ("Thread[%d][%d] from block[%d][%d]:\n rw = %d, cl = %d, sum = %f\n\n",
threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, rw/n, cl, sum);
}
c[rw + cl] = sum;
}
__host__ int mult1_host (float *a, float *b, float *c, int n) {
cudaEvent_t start, stop;
cudaError_t cuerr;
int i, j;
int size = n * n * sizeof (float);
float gpuTime = 0.0f;
float *aDev = NULL, *bDev = NULL, *cDev = NULL;
cuerr = cudaMalloc ((void**)&aDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMalloc ((void**)&bDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMalloc ((void**)&cDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for cDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
dim3 blockSize = dim3 (16, 16);
dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);
cuerr = cudaMemcpy (aDev, a, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from a to aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMemcpy (bDev, b, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from a to bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cudaPrintfInit ();
cudaEventCreate (&start);
cudaEventCreate (&stop);
cudaEventRecord (start, 0);
mult1_kernel <<< gridSize, blockSize >>> (aDev, bDev, cDev, n);
cudaEventRecord (stop, 0);
cudaEventSynchronize (stop);
cudaEventElapsedTime (&gpuTime, start, stop);
cuerr = cudaGetLastError ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot launch CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaDeviceSynchronize ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot synchronize CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cudaPrintfDisplay (stdout, true);
cudaPrintfEnd ();
cuerr = cudaMemcpy (c, cDev, size, cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from c to cDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
printf ("Time spent executing on the GPU: %.2f millseconds\n", gpuTime);
printf ("\n");
cudaEventDestroy (start);
cudaEventDestroy (stop);
cudaFree (aDev);
cudaFree (bDev);
cudaFree (cDev);
return (0);
}
int main () {
int i, j, k;
int n;
float *a, *b, *c;
printf ("Enter the matrix size: ");
scanf ("%d", &n);
printf ("Blocks = %d\n", n*n/512 + 1);
if (n < 1) {
printf ("Invalid matrix size\n");
return (-1);
}
if (!(a = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for a\n");
return (-1);
}
if (!(b = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for b\n");
return (-1);
}
if (!(c = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for c\n");
return (-1);
}
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
a[i*n + j] = (float)(i + j + 1);
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
b[i*n + j] = 1 / (float)(i + j + 1);
printf ("Input matrix A:\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", a[i*n + j]);
printf ("\n");
}
printf ("\n");
printf ("Input matrix B:\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", b[i*n + j]);
printf ("\n");
}
printf ("\n");
if (mult1_host (a, b, c, n) < 0) {
return (-1);
}
printf ("Matrix product of A and B (CUDA):\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", c[i*n + j]);
printf ("\n");
}
printf ("\n");
for (i = 0; i < n; i++)
for (j = 0; j < n; j++) {
c[i*n +j] = 0;
for (k = 0; k < n; k++)
c[i*n + j] += a[i*n + k] * b[k*n + j];
}
printf ("Matrix product of A and B (Check):\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", c[i*n + j]);
printf ("\n");
}
free (a);
free (b);
free (c);
return (0);
}
现在给出的是:
Input matrix A:
1 2 3 4
2 3 4 5
3 4 5 6
4 5 6 7
Input matrix B:
1 0.5 0.333 0.25
0.5 0.333 0.25 0.2
0.333 0.25 0.2 0.167
0.25 0.2 0.167 0.143
Time spent executing on the GPU: 0.05 millseconds
Matrix product of A and B (CUDA):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
4 3.05 2.48 2.1
3.22 2.6 2.19 1.89
[ThreadIdx.x][ThreadIdx.y]:
[0][0] [0][1] [0][2] [0][3]
[1][0] [1][1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[BlockIdx.x][BlockIdx.y]:
[0][0] [0][0] [0][0] [0][0]
[0][0] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[rw][cl]:
[0][0] [0][1] [0][2] [0][3]
[1][0] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
Matrix product of A and B (Correct result):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
8.17 5.28 4 3.24
10.2 6.57 4.95 4
因此,每个线程都会写入其threadIdx、blockIdx、行数和列数,但大多数值在初始化后都没有更改。请帮我弄清楚这一切是如何运作的
第二部分。
每个线程输出正确的和,但在将数据从设备内存复制到主机后,某些元素变为零。代码如下:
typedef struct {
int tx;
int ty;
int bx;
int by;
int rw;
int cl;
} THREADS;
__global__ void mult1_kernel (float *a, float *b, float *c, THREADS *d, int n) {
int k;
float sum = 0.0f;
int rw = blockIdx.y * blockDim.y + threadIdx.y;
int cl = blockIdx.x * blockDim.x + threadIdx.x;
for (k = 0; k < n; k++)
sum += a[rw*n + k] * b[k*n + cl];
c[rw*n + cl] = sum;
d[rw*n + cl].tx = threadIdx.y;
d[rw*n + cl].ty = threadIdx.x;
d[rw*n + cl].bx = blockIdx.y;
d[rw*n + cl].by = blockIdx.x;
d[rw*n + cl].rw = rw;
d[rw*n + cl].cl = cl;
}
#include <stdio.h>
#include <stdlib.h>
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
__global__ void mult1_kernel (float *a, float *b, float *c, int n) {
int k;
float sum = 0.0f;
int rw = blockIdx.y * blockDim.y + threadIdx.y;
int cl = blockIdx.x * blockDim.x + threadIdx.x;
if ((rw < n) && (cl < n)) {
rw *= n;
for (k = 0; k < n; k++)
sum += a[rw + k] * b[k*n + cl];
cuPrintf ("Thread[%d][%d] from block[%d][%d]:\n rw = %d, cl = %d, sum = %f\n\n",
threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, rw/n, cl, sum);
}
c[rw + cl] = sum;
}
__host__ int mult1_host (float *a, float *b, float *c, int n) {
cudaEvent_t start, stop;
cudaError_t cuerr;
int i, j;
int size = n * n * sizeof (float);
float gpuTime = 0.0f;
float *aDev = NULL, *bDev = NULL, *cDev = NULL;
cuerr = cudaMalloc ((void**)&aDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMalloc ((void**)&bDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMalloc ((void**)&cDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for cDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
dim3 blockSize = dim3 (16, 16);
dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);
cuerr = cudaMemcpy (aDev, a, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from a to aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMemcpy (bDev, b, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from a to bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cudaPrintfInit ();
cudaEventCreate (&start);
cudaEventCreate (&stop);
cudaEventRecord (start, 0);
mult1_kernel <<< gridSize, blockSize >>> (aDev, bDev, cDev, n);
cudaEventRecord (stop, 0);
cudaEventSynchronize (stop);
cudaEventElapsedTime (&gpuTime, start, stop);
cuerr = cudaGetLastError ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot launch CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaDeviceSynchronize ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot synchronize CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cudaPrintfDisplay (stdout, true);
cudaPrintfEnd ();
cuerr = cudaMemcpy (c, cDev, size, cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from c to cDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
printf ("Time spent executing on the GPU: %.2f millseconds\n", gpuTime);
printf ("\n");
cudaEventDestroy (start);
cudaEventDestroy (stop);
cudaFree (aDev);
cudaFree (bDev);
cudaFree (cDev);
return (0);
}
int main () {
int i, j, k;
int n;
float *a, *b, *c;
printf ("Enter the matrix size: ");
scanf ("%d", &n);
printf ("Blocks = %d\n", n*n/512 + 1);
if (n < 1) {
printf ("Invalid matrix size\n");
return (-1);
}
if (!(a = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for a\n");
return (-1);
}
if (!(b = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for b\n");
return (-1);
}
if (!(c = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for c\n");
return (-1);
}
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
a[i*n + j] = (float)(i + j + 1);
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
b[i*n + j] = 1 / (float)(i + j + 1);
printf ("Input matrix A:\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", a[i*n + j]);
printf ("\n");
}
printf ("\n");
printf ("Input matrix B:\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", b[i*n + j]);
printf ("\n");
}
printf ("\n");
if (mult1_host (a, b, c, n) < 0) {
return (-1);
}
printf ("Matrix product of A and B (CUDA):\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", c[i*n + j]);
printf ("\n");
}
printf ("\n");
for (i = 0; i < n; i++)
for (j = 0; j < n; j++) {
c[i*n +j] = 0;
for (k = 0; k < n; k++)
c[i*n + j] += a[i*n + k] * b[k*n + j];
}
printf ("Matrix product of A and B (Check):\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", c[i*n + j]);
printf ("\n");
}
free (a);
free (b);
free (c);
return (0);
}
谢谢 而不是在内核末尾执行此操作:
}
c[rw + cl] = sum;
}
这样做:
c[(rw*n) + cl] = sum;
}
}
请注意区别:
如果您有compute capability 2.0或更高版本的设备,则可以执行
printf()
。如果你有一个更早的设备,它也会起到类似的作用。如果你发布完整的代码可能会更好。例如,我想知道传递给n的是什么,以及内核调用的样子,以及各种数据集的malloc操作!事实上,我曾试图限制线程的范围,但现在发现错误不在内核中(在cuPrintf()的帮助下,它变得很明显)。现在我明确知道每个线程都正确计算其元素。但由于某种原因,我在cudaMemcpy之后收到的数据是错误的(我将更新我的帖子).非常感谢!真不敢相信我错过了这个支架。