Cuda CUBLAS库未给出正确的结果
我试图探索CUBLAS库,因此使用其API编写了矩阵乘法代码。但我得到了奇怪的输出。我正在粘贴下面的代码和输出。请帮帮我Cuda CUBLAS库未给出正确的结果,cuda,matrix-multiplication,cublas,Cuda,Matrix Multiplication,Cublas,我试图探索CUBLAS库,因此使用其API编写了矩阵乘法代码。但我得到了奇怪的输出。我正在粘贴下面的代码和输出。请帮帮我 #include<cublas.h> // Thread block size #define BLOCK_SIZE 3 #define WA 3 // Matrix A width #define HA 3 // Matrix A height #define WB 3 // Matrix B width #define HB WA // Ma
#include<cublas.h>
// Thread block size
#define BLOCK_SIZE 3
#define WA 3 // Matrix A width
#define HA 3 // Matrix A height
#define WB 3 // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
// Allocates a matrix with random float entries.
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = i;
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
// 1. allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
cublasStatus_t status;
// 2. initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// 3. print out A and B
printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++)
{
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n");
}
printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++)
{
printf("%f ", h_B[i]);
if(((i + 1) % WB) == 0)
printf("\n");
}
// 8. allocate device memory
float* d_A;
float* d_B;
cudaMalloc((void**) &d_A, mem_size_A);
cudaMalloc((void**) &d_B, mem_size_B);
// 9. copy host memory to device
status = cublasSetMatrix(BLOCK_SIZE,BLOCK_SIZE,sizeof(float), h_A, BLOCK_SIZE,d_A, BLOCK_SIZE);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
status = cublasSetMatrix(BLOCK_SIZE,BLOCK_SIZE,sizeof(float), h_B, BLOCK_SIZE,d_B, BLOCK_SIZE);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
// 4. allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
// 10. allocate device memory for the result
float* d_C;
cudaMalloc((void**) &d_C, mem_size_C);
// 5. perform the calculation
cublasSgemm('N','N',BLOCK_SIZE,BLOCK_SIZE,BLOCK_SIZE,1.0f,d_A,BLOCK_SIZE,d_B,BLOCK_SIZE,1.0f,d_C,BLOCK_SIZE);
status = cublasGetError();
if (status) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
// 11. copy result from device to host
status = cublasGetMatrix(BLOCK_SIZE,BLOCK_SIZE,sizeof(float),d_C, BLOCK_SIZE,h_C,BLOCK_SIZE);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
// 6. print out the results
printf("\n\nMatrix C (Results)\n");
for(int i = 0; i < size_C; i++)
{
printf("%f ", h_C[i]);
if(((i + 1) % WC) == 0)
printf("\n");
}
printf("\n");
// 7. clean up memory
free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
#包括
//螺纹块尺寸
#定义块大小3
#定义WA 3//矩阵的宽度
#定义HA 3//矩阵A高度
#定义WB 3//矩阵B宽度
#定义HB WA//矩阵B高度
#定义WC WB//矩阵C宽度
#定义HC HA//矩阵C高度
//分配带有随机浮点项的矩阵。
void randomInit(浮点*数据,整数大小)
{
对于(int i=0;i
---------输出-------------
矩阵A
0.0000001.0000002.000000
3.000000 4.000000 5.000000
6.000000 7.000000 8.000000
矩阵B
0.0000001.0000002.000000
3.000000 4.000000 5.000000
6.000000 7.000000 8.000000
矩阵C(结果)
-1998397155538108416.000000-1998397155538108416.000000-1998397155538108416.000000
-1998397155538108416.000000-1998397155538108416.000000-1998397155538108416.000000
-1998397155538108416.000000-1998397155538108416.000000-1998397155538108416.000000您的问题是在sgemm调用中使用未初始化的内存
cublas_sgemm()
,与所有BLAS gemm运算一样
C = alpha * op(A) * op(B) + beta * C
在您的代码中,您正在传递op(A)=A
,op(B)=B
,alpha=1.
,以及beta=1.
。但是,您从未将C
的值设置为任何值,GPU中的内存未初始化,可能包含随机值,从而给出您看到的损坏结果。将函数调用更改为:
cublasSgemm('N','N',BLOCK_SIZE,BLOCK_SIZE,BLOCK_SIZE,1.0f,d_A,
BLOCK_SIZE,d_B,BLOCK_SIZE,0.f,d_C,BLOCK_SIZE);
计算
C = 1.0 * A * B + 0. * C
你应该得到更合理的产量。一旦你得到它产生的输出,请保持在发现库布拉斯假设矩阵存储列的主要顺序,所以正确的打印输出的输入你打印出来应该是
15 18 21
42 54 66
69 90 111
@用户1439690:如果这个答案解决了你的问题,也许你会很乐意。