Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/tensorflow/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
n×1向量中m×n矩阵的Cuda乘法_Cuda_Matrix Multiplication - Fatal编程技术网

CUDA 中 m×n 矩阵与 n×1 向量的乘法

n×1向量中m×n矩阵的Cuda乘法,cuda,matrix-multiplication,Cuda,Matrix Multiplication,以下内核将两个n乘n矩阵相乘: __global__ void matrixMultiplication(const double *A, const double *B, double *C, int N) { int i = blockDim.y * blockIdx.y + threadIdx.y; int j = blockDim.x * blockIdx.x + threadIdx.x; double value = 0; for(int k =

以下内核将两个n乘n矩阵相乘:

    /*
     * Multiplies two N-by-N matrices using one thread per output element.
     *
     * Each thread computes
     *     C[i * N + j] = sum over k of A[k * N + j] * B[i * N + k]
     * where j comes from the x launch dimension and i from the y dimension.
     * If the buffers are column-major (leading dimension N) this is C = A * B
     * with j as the row index and i as the column index.
     *
     * A, B and C must each hold at least N * N doubles in device-accessible
     * memory. The launch may cover more than N x N threads; the surplus
     * threads exit without touching memory.
     */
    __global__ void matrixMultiplication(const double *A, const double *B, double *C, int N)
    {
        const int i = blockDim.y * blockIdx.y + threadIdx.y;
        const int j = blockDim.x * blockIdx.x + threadIdx.x;

        // Threads outside the N x N output would read and write out of range.
        if (i >= N || j >= N) {
            return;
        }

        double value = 0;
        for (int k = 0; k < N; k++) {
            value += A[k * N + j] * B[i * N + k];
        }
        C[i * N + j] = value;
    }

但 GPU 结果与普通乘法结果之差(D-C)不是零向量!如何修改此内核,使其能够计算 m×n 矩阵与 n×1 向量的乘积?

由于看起来您使用的是 MATLAB(数组按列主序存储),您只需对内核代码做非常小的修改:

#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <iostream>

/*
 * Computes C = A * B for column-major matrices, one thread per element of C.
 *
 *   A is M x N, element (r, c) stored at A[c * M + r]
 *   B is N x K, element (r, c) stored at B[c * N + r]
 *   C is M x K, element (r, c) stored at C[c * M + r]
 *
 * Launch layout: the x dimension selects the row of C and the y dimension
 * selects the column of C. K is not a parameter, so the total y extent of
 * the launch must equal K exactly. The x extent may exceed M; surplus
 * threads exit without touching memory.
 */
__global__ void matrixMultiplication(const double *A, const double *B, double *C, int M, int N)
{
    const int col = blockDim.y * blockIdx.y + threadIdx.y;
    const int row = blockDim.x * blockIdx.x + threadIdx.x;

    // Rows past M have no output element and would index A and C out of range.
    if (row >= M) {
        return;
    }

    double value = 0;
    for (int k = 0; k < N; k++) {
        // Column stride is M for A (M rows) and N for B (N rows).
        value += A[k * M + row] * B[col * N + k];
    }
    // C has M rows, so its column stride is M.
    C[col * M + row] = value;
}

/*
 * Demo driver: multiplies a 3 x 4 matrix by a 4 x 1 vector on the GPU and
 * prints the 3 resulting values, one per line.
 *
 * Returns 0 on success, 1 if the kernel launch or its execution reported a
 * CUDA error.
 */
int main()
{
    const int M = 3, N = 4, K = 1;
    thrust::device_vector<double> A(M*N), B(N*K), C(M*K);

    // Fill A with 1, 2, ..., M*N and B with 1, 2, ..., N*K in storage order.
    thrust::counting_iterator<double> counter(1.0);
    thrust::copy(counter, counter + (M*N), A.begin());
    thrust::copy(counter, counter + (N*K), B.begin());

    // A single block with one thread per element of the M x K result.
    dim3 grid(1,1), block(M,K);

    matrixMultiplication<<<grid, block>>>( thrust::raw_pointer_cast(A.data()),
                                           thrust::raw_pointer_cast(B.data()),
                                           thrust::raw_pointer_cast(C.data()),
                                           M, N );

    // A launch does not return a status: configuration errors show up in
    // cudaGetLastError(), execution faults at the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        std::cerr << "matrixMultiplication failed: "
                  << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // Each C[i] read copies one element from device memory to the host.
    for(int i=0; i<M*K; i++)
        std::cout << C[i] << std::endl;

    return 0;
}

对于列主序存储,这是正确的。

您不需要更改任何东西。 @talonmies 但是我得到的结果与普通乘法不同! 该内核设计为对输出矩阵中的每个元素运行一个 CUDA 线程。唯一可能需要更改的是内核中的索引计算,这取决于矩阵的存储顺序(行主序还是列主序,您没有说明)。 @talonmies 非常感谢您的回答……但由于我是 CUDA 的新手,如果您能多解释一点或分享一个类似问题的链接,我将不胜感激。我还编辑了我的问题并添加了更多信息。 @talonmies,我将 k.ThreadBlockSize 从 [3,4,1] 更改为 [3,1,1],但 D-C 仍然不是零向量!据我所知,问题的规模很小,只需要一个网格,所以我没有更改网格大小。如果我错了,请纠正我! 非常感谢,我刚刚修改了内核并在 MATLAB 中进行了测试……它工作得很好……你是最棒的……顺便问一下,对于那些没有太多 C/C++/C# 经验的人,您建议从哪里学习 CUDA? 这里可以找到一些关于学习 CUDA 的好信息:
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <iostream>

/*
 * Computes C = A * B for column-major matrices, one thread per element of C.
 *
 *   A is M x N, element (r, c) stored at A[c * M + r]
 *   B is N x K, element (r, c) stored at B[c * N + r]
 *   C is M x K, element (r, c) stored at C[c * M + r]
 *
 * Launch layout: the x dimension selects the row of C and the y dimension
 * selects the column of C. K is not a parameter, so the total y extent of
 * the launch must equal K exactly. The x extent may exceed M; surplus
 * threads exit without touching memory.
 */
__global__ void matrixMultiplication(const double *A, const double *B, double *C, int M, int N)
{
    const int col = blockDim.y * blockIdx.y + threadIdx.y;
    const int row = blockDim.x * blockIdx.x + threadIdx.x;

    // Rows past M have no output element and would index A and C out of range.
    if (row >= M) {
        return;
    }

    double value = 0;
    for (int k = 0; k < N; k++) {
        // Column stride is M for A (M rows) and N for B (N rows).
        value += A[k * M + row] * B[col * N + k];
    }
    // C has M rows, so its column stride is M.
    C[col * M + row] = value;
}

/*
 * Demo driver: multiplies a 3 x 4 matrix by a 4 x 1 vector on the GPU and
 * prints the 3 resulting values, one per line.
 *
 * Returns 0 on success, 1 if the kernel launch or its execution reported a
 * CUDA error.
 */
int main()
{
    const int M = 3, N = 4, K = 1;
    thrust::device_vector<double> A(M*N), B(N*K), C(M*K);

    // Fill A with 1, 2, ..., M*N and B with 1, 2, ..., N*K in storage order.
    thrust::counting_iterator<double> counter(1.0);
    thrust::copy(counter, counter + (M*N), A.begin());
    thrust::copy(counter, counter + (N*K), B.begin());

    // A single block with one thread per element of the M x K result.
    dim3 grid(1,1), block(M,K);

    matrixMultiplication<<<grid, block>>>( thrust::raw_pointer_cast(A.data()),
                                           thrust::raw_pointer_cast(B.data()),
                                           thrust::raw_pointer_cast(C.data()),
                                           M, N );

    // A launch does not return a status: configuration errors show up in
    // cudaGetLastError(), execution faults at the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        std::cerr << "matrixMultiplication failed: "
                  << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // Each C[i] read copies one element from device memory to the host.
    for(int i=0; i<M*K; i++)
        std::cout << C[i] << std::endl;

    return 0;
}
$ nvcc -arch=sm_52 -std=c++11 -o spoonfull spoonfull.cu 
$ ./spoonfull 
70
80
90