Matrix Cuda矩阵乘法——不适用于某些非方矩阵_Matrix_Cuda_Gpu_Multiplication

Matrix Cuda矩阵乘法——不适用于某些非方矩阵

matrix cuda

Matrix Cuda矩阵乘法——不适用于某些非方矩阵,matrix,cuda,gpu,multiplication,Matrix,Cuda,Gpu,Multiplication,我现在正在尝试Cuda编程。作为这项工作的一部分，我正在尝试开发一种在GPU上运行的矩阵乘法算法。该算法适用于平方矩阵，但不适用于非平方矩阵。这是我的内核 float* multiply_gpu(float* matrix1 , float* matrix2); __global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float

我现在正在尝试Cuda编程。作为这项工作的一部分，我正在尝试开发一种在GPU上运行的矩阵乘法算法。该算法适用于方阵，但不适用于某些非方阵。这是我的内核：

    // Forward declaration of the host-side wrapper; only the prototype appears in this snippet.
    float* multiply_gpu(float* matrix1 , float* matrix2);
    // Kernel: each thread with index < rowsA*columnsB writes one element of
    // `result`, computed as a sum of columnsA products a[..] * b[..].
    // All three matrices are addressed as flat arrays through a 1-D global
    // thread index. rowsB is accepted but never read in the body.
    // NOTE(review): this is the version under discussion; see the notes on
    // `value` and `index1` below.
    __global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a,
            float *b, float *result) {
        // Flat global thread index, used directly as the output element index.
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int result_size = rowsA*columnsB;
        // NOTE(review): the accumulator is int while a/b/result are float, so
        // every `+=` below converts the running sum back to int.
        int value = 0;//the final result
        //indices of values from input matrices
        if (index < result_size) { // threads past the end of the output do nothing
            // NOTE(review): this rounds index down to a multiple of rowsA. For a
            // row-major `a` with columnsA elements per row, the start of the row
            // would be (index/columnsB)*columnsA; the two expressions coincide
            // when rowsA == columnsA == columnsB (square inputs).
            int index1 = (index/rowsA)*rowsA; //get nearest row
            int index2 = index%columnsB; //get start column
            int k = 0;
            while (k<columnsA) { //columnsA == rowsB
               value += a[index1]*b[index2]; //v = sum a_ik * b_kj
               index1 ++;             // next element of `a`
               index2 += columnsB;    // jump columnsB elements ahead in `b`
               k++;
            }
            result[index] = value;
        }
    }

float*multiply\u gpu（float*matrix1，float*matrix2）；
__全局无效mult（int-rowsA，int-columnsA，int-rowsB，int-columnsB，float*a，
浮动*b，浮动*结果）{
int index=blockIdx.x*blockDim.x+threadIdx.x；
int result_size=rowsA*columnsB；
int value=0；//最终结果
//输入矩阵值的索引
如果（索引<结果大小）{
int index1=（索引/rowsA）*rowsA；//获取最近的行
int index2=索引%columnsB；//获取开始列
int k=0；
而（k）则无济于事
mult（高度1，宽度1，高度2，宽度2，d_矩阵1，d_矩阵2，d_结果）；
printf（“%d%d%d%d\n”，高1，宽1，高2，宽2）；
//创建主机块，直到mult完成运行
//printf（“完成乘法\n”）；
cudaDeviceSynchronize（）；
//将结果复制回
错误=cudaMemcpy（结果，d_结果，高度1*width2*sizeof（float），cudaMemcpyDeviceToHost）；
如果（错误！=cudaSuccess）{
fprintf（stderr，“无法复制内存（错误代码%s）！\n”，cudaGetErrorString（错误））；
退出（退出失败）；
}
//释放现在不需要的cuda内存
cudaFree（d_matrix1）；
cudaFree（d_matrix2）；
cudaFree（d_结果）；
printf（“获得结果\n”）；
对于（int i=0；i
这一行是错误的：
        int index1 = (index/rowsA)*rowsA; //get nearest row

应该是这样的：
        int index1 = (index/columnsB)*columnsA; //get nearest row

为什么这个公式是正确的？`index1` 用于遍历 `A` 中的某一行元素，这一行对应于我们正在计算的输出矩阵位置所在的行。输出矩阵位置就是线程索引。如果把线程索引（整数）除以输出矩阵 `C` 的列数（即 `columnsB`），就得到该位置所在的行号。然后，为了找到 `A` 中该行的第一个元素，再乘以 `A` 的列数（`columnsA`）。这样就能正确地索引到 `A` 中相关行的第一个元素。
这是一个完整的应用程序以及我的测试用例——我对您的代码所做的唯一更改就是上面所示的更改
$ cat t290.cu
#include <stdio.h>

/*
 * Matrix-multiply kernel: result = a * b, one output element per thread.
 *
 * The three matrices are flat arrays indexed as row * columns + column:
 *   a      : rowsA x columnsA
 *   b      : rowsB x columnsB   (rowsB is not read; it must equal columnsA)
 *   result : rowsA x columnsB
 *
 * Launch layout: 1-D grid of 1-D blocks supplying at least rowsA * columnsB
 * threads in total; surplus threads return immediately. No shared memory.
 */
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float *result) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int result_size = rowsA * columnsB;
    if (index >= result_size) {
        return;  // thread lies past the last output element
    }

    // Accumulate in float: an int accumulator would truncate every partial
    // sum and give wrong results for non-integer inputs.
    float value = 0.0f;

    // Output row = index / columnsB; that row of `a` starts columnsA elements
    // per row further in.
    int index1 = (index / columnsB) * columnsA;
    // Output column = index % columnsB; this is the first element of that
    // column in `b`.
    int index2 = index % columnsB;

    for (int k = 0; k < columnsA; ++k) {  // columnsA == rowsB
        value += a[index1] * b[index2];   // v = sum a_ik * b_kj
        ++index1;                         // next element in the row of a
        index2 += columnsB;               // next element in the column of b
    }
    result[index] = value;
}

// Terminates the process with a diagnostic if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Multiplies two host matrices on the GPU and prints the product.
 *
 * Each input is a flat float array whose first two entries hold the row and
 * column counts, followed by rows * columns row-major elements.
 *
 * Returns a malloc-ed array of height1 * width2 floats (the caller frees it),
 * or NULL after printing "fail!" when the inner dimensions do not match.
 * Any CUDA failure other than the initial memory query ends the process.
 */
float* multiply_gpu(float* matrix1 , float* matrix2) {
    // Memory query is informational only; a failure here is reported but not fatal.
    size_t available, total;
    cudaError_t err = cudaMemGetInfo(&available, &total);
    if (err != cudaSuccess) {
        printf("There was an error: %s\n", cudaGetErrorString(err));
    }

    // The dimensions are stored as floats in the two-element header.
    const int height1 = (int)matrix1[0];
    const int width1  = (int)matrix1[1];
    const int height2 = (int)matrix2[0];
    const int width2  = (int)matrix2[1];
    if (width1 != height2) {
        printf("fail!\n");
        return NULL;
    }

    // Compute byte counts in size_t so large matrices do not overflow int.
    const size_t bytes1       = (size_t)height1 * width1 * sizeof(float);
    const size_t bytes2       = (size_t)height2 * width2 * sizeof(float);
    const size_t result_bytes = (size_t)height1 * width2 * sizeof(float);

    // Host buffer that receives the product.
    float *result = (float *) malloc(result_bytes);
    if (result == NULL && result_bytes != 0) {
        fprintf(stderr, "Failed to allocate host memory!\n");
        exit(EXIT_FAILURE);
    }

    // Device buffers for the two operands and the product.
    float *d_matrix1 = NULL;
    float *d_matrix2 = NULL;
    float *d_result  = NULL;
    check_cuda(cudaMalloc((void **)&d_matrix1, bytes1), "Failed to allocate memory");
    check_cuda(cudaMalloc((void **)&d_matrix2, bytes2), "Failed to allocate memory");
    check_cuda(cudaMalloc((void **)&d_result, result_bytes), "Failed to allocate memory");

    // Copy the elements to the device, skipping the two-float header.
    check_cuda(cudaMemcpy(d_matrix1, matrix1 + 2, bytes1, cudaMemcpyHostToDevice),
               "Failed to copy memory");
    check_cuda(cudaMemcpy(d_matrix2, matrix2 + 2, bytes2, cudaMemcpyHostToDevice),
               "Failed to copy memory");

    // One thread per output element. A fixed block size with a rounded-up grid
    // keeps the launch valid however wide the result is; the kernel's own
    // bounds check discards the surplus threads in the last block.
    const int result_count = height1 * width2;
    const int threads = 256;
    const int blocks = (result_count + threads - 1) / threads;
    mult<<<blocks, threads>>>(height1, width1, height2, width2, d_matrix1, d_matrix2, d_result);
    printf("%d %d %d %d\n", height1, width1, height2, width2);
    // Launch-configuration errors surface here ...
    check_cuda(cudaGetLastError(), "Failed to launch kernel");
    // ... and errors raised while the kernel runs surface at the sync.
    check_cuda(cudaDeviceSynchronize(), "kernel fail");

    // Copy the product back to the host.
    check_cuda(cudaMemcpy(result, d_result, result_bytes, cudaMemcpyDeviceToHost),
               "Failed to copy memory");

    // The device buffers are no longer needed.
    cudaFree(d_matrix1);
    cudaFree(d_matrix2);
    cudaFree(d_result);

    printf("GOT RESULT\n");
    for (int i = 0; i < result_count; i++) {
        printf("%f ", result[i]);
    }
    printf("\n");

    // Ownership of the buffer passes to the caller.
    return result;
}

// Test driver: runs multiply_gpu on four shape combinations (one square,
// three rectangular). Every array is {rows, columns, row-major elements...}.
int main(){

  // 2x2 times 2x3
  float m1[8] = {2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
  float m2[6] = {2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f};
  float *my_result1 = multiply_gpu(m2, m1);

  // 2x3 times 3x2, then 3x2 times 2x3
  float m3[8] = {2,3,1,2,3,4,5,6};
  float m4[8] = {3,2,1,2,3,4,5,6};
  float *my_result2 = multiply_gpu(m3, m4);
  float *my_result3 = multiply_gpu(m4, m3);

  // 2x5 times 5x4, all ones
  float m5[12] = {2,5,1,1,1,1,1,1,1,1,1,1};
  float m6[22] = {5,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
  float *my_result4 = multiply_gpu(m5, m6);

  // multiply_gpu returns malloc-ed buffers (or NULL); free(NULL) is a no-op.
  free(my_result1);
  free(my_result2);
  free(my_result3);
  free(my_result4);
  return 0;
}

$ nvcc -arch=sm_20 -o t290 t290.cu
t290.cu: In function âfloat* multiply_gpu(float*, float*)â:
t290.cu:30: warning: converting to âintâ from âfloatâ
t290.cu:31: warning: converting to âintâ from âfloatâ
t290.cu:32: warning: converting to âintâ from âfloatâ
t290.cu:33: warning: converting to âintâ from âfloatâ
$ cuda-memcheck ./t290
========= CUDA-MEMCHECK
2 2 2 3
GOT RESULT
5.000000 7.000000 9.000000 10.000000 14.000000 18.000000
2 3 3 2
GOT RESULT
22.000000 28.000000 49.000000 64.000000
3 2 2 3
GOT RESULT
9.000000 12.000000 15.000000 19.000000 26.000000 33.000000 29.000000 40.000000 51.000000
2 5 5 4
GOT RESULT
5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
========= ERROR SUMMARY: 0 errors
$

$cat t290.cu
#包括
__全局无效mult（int-rowsA、int-columnsA、int-rowsB、int-columnsB、float*a、float*b、float*result）{
int index=blockIdx.x*blockDim.x+threadIdx.x；
int result_size=rowsA*columnsB；
int value=0；//最终结果
//输入矩阵值的索引
如果（索引<结果大小）{
int index1=（index/columnsB）*columnsA；//获取最近的行
int index2=索引%columnsB；//获取开始列
int k=0；
而（k）则无济于事
mult（高度1，宽度1，高度2，宽度2，d_矩阵1，d_矩阵2，d_结果）；
printf（“%d%d%d%d\n”，高1，宽1，高2，宽2）；
错误=cudaGetLastError（）；
如果（错误！=cudaSuccess）{
fprintf（stderr，“无法复制内存（错误代码%s）！\n”，cudaGetErrorString（错误））；
退出（退出失败）；
}
//创建主机块，直到mult完成运行
//printf（“完成乘法\n”）；
错误=cudaDeviceSynchronize（）；
如果（错误！=cudaSuccess）{
fprintf（stderr，“内核失败（错误代码%s）！\n”，cudaGetErrorString（错误））；
退出（退出失败）；
}
//将结果复制回
错误=cudaMemcpy（结果，d_结果，高度1*width2*sizeof（float），cudaMemcpyDeviceToHost）；
如果（错误！=cudaSuccess）{
fprintf（stderr，“无法复制内存（错误代码%s）！\n”，cudaGetErrorString（错误））；
退出（退出失败）；
}
//释放现在不需要的cuda内存
cudaFree（d_matrix1）；
cudaFree（d_matrix2）；
cudaFree（d_结果）；
printf（“获得结果\n”）；
对于（inti=0；i
在仔细检查我的矩阵代码之后，我发现我的运算在数学上有一个简单的问题。
这一行确实是错的：
 int index1 = (index/rowsA)*rowsA; //get nearest row

我注意到，因为我的矩阵是按行存储的，所以由元素 (i, j) 得到正确索引的公式是 `index = i*rowLength + j`。
因此，index1的赋值应为
int index1 = (index/rowsA)*columnsA

为什么？很明显，要导航到n行的索引，我们必须移动n行长度（这是矩阵中的列数）.我的代码适用于方形矩阵，但不适用于其他矩形矩阵，因为列数与此类矩阵中的行数不匹配。
CUDA标记上有很多关于矩阵乘法的问题。您是否查看了任何问题？如果使用CUDA memcheck运行代码会发生什么情况？因此：“与您编写的代码问题有关的问题必须在问题本身中描述具体问题，并包括重现问题的有效代码。有关指导信息，请参阅SSCCE.org。"投票结束。你还没有提供SSCCE.org代码。是的，矩阵乘法在GPU上很常见，而且有很多关于它的问题。我已经通读了它们，但可能还不够彻底。我只是不知所措，来到这里是为了检查是否正常。谢谢你链接到SSCCE.org——我现在正在复习。我也在学习g cuda memcheck。总的来说，我所面临的这个错误正在困扰着我。我认为我需要更多地关注我自己的代码和对其他矩阵乘法器的审查。我更新了我的答案，因为我仍然没有完全正确的答案。我认为现在它是正确的-它适用于你提到的案例以及我尝试过的其他三个案例。嗯，不完全正确。但你是对的跟踪。我自己发现了正确的答案，正在发布。这是错误的。我提供了一个完整的代码。将您的公式插入我的完整代码，它会给出错误的结果。
index = i*rowLength + j

int index1 = (index/rowsA)*columnsA