Matlab 基于CUDA的GPU并行Kronecker张量积

Matlab 基于CUDA的GPU并行Kronecker张量积,matlab,parallel-processing,cuda,gpu,linear-algebra,Matlab,Parallel Processing,Cuda,Gpu,Linear Algebra,我正在GPU上使用[PTX文件with matlab parallel.GPU.CUDAkernel][2]并行处理[this file][1]。我对[kron张量积][3]的问题如下。我的代码应该用第一个向量的每个元素乘以另一个向量的所有元素来乘以两个向量kron(a,b),输出向量大小将是k=a.*b。我尝试用C++编写它,它工作,因为我只关心总结所有元素的二维数组。我想我可以把它简化为一维数组,因为m=sum(sum(kron(a,b))是我正在编写的代码 for(i=0;i<32;

我正在借助 [PTX 文件和 MATLAB 的 parallel.gpu.CUDAKernel][2] 在 GPU 上并行化[这个文件][1]。我在 [kron 张量积][3]上遇到的问题如下。我的代码需要计算两个向量的 Kronecker 积
kron(a,b)
，即用第一个向量的每个元素去乘第二个向量的所有元素，输出向量的长度为
k=numel(a)*numel(b)
。我先用 C++ 实现了它，结果是正确的。由于我只关心这个二维数组所有元素之和，我认为可以把结果简化为一维数组，因为我真正需要的只是
m=sum(sum(kron(a,b)))
。下面是我正在编写的代码

// Serial reference for the Kronecker product of two 32-element vectors:
// element (i, j) of the flattened result is k[i*32+j] = a[i]*b[j].
for(i=0;i<32;i++)
 for(j=0;j<32;j++)
   k[i*32+j]=a[i]*b[j];

我原以为只要把
blockIdx.x
当作外层循环就可以了，但结果并不对。有谁能告诉我问题出在哪里？另外，我还想请教一下并行求和的方法。

你的意思可能是这样的:

// Kronecker product of two 32-element integer vectors.
//
// Writes c[i*32 + j] = a[i] * b[j] for i, j in [0, 32), i.e. the flattened
// result of kron(a, b). c must hold 32*32 elements; a and b 32 elements each.
//
// Launch layout: any 1-D grid with at least 32*32 threads in total
// (e.g. <<<32, 32>>>). Surplus threads are masked out by the bounds check.
//
// The pointer parameters are deliberately left non-const: MATLAB's
// parallel.gpu.CUDAKernel derives inputs/outputs from the prototype, so
// changing constness would change how the kernel is called from MATLAB.
__global__ void myKrom(int* c,int* a, int*b)
{
  const int n = 32;                                   // length of a and of b
  int i = blockDim.x * blockIdx.x + threadIdx.x;      // flat output index

  if (i < n * n) {
    // Derive both operand indices from the flat index instead of from
    // blockIdx/threadIdx, so the result is correct for every block size,
    // and multiply (the original added the operands).
    c[i] = a[i / n] * b[i % n];
  }
}
__global__ void myKrom(int* c,int* a, int*b)
{
int i=blockDim.x*blockIdx.x+threadIdx.x;

if(i<32*32){ …（这是上面内核的重复片段，抓取时在此处被截断）

在第一个操作数是单位矩阵的情况下，Kronecker 乘积的结果可以简单地使用 cuSPARSE 的
bsr
格式表示

下面是一个实现以下Matlab指令的简单示例

 % Kronecker product with an identity as the FIRST operand: the result is
 % block diagonal, with one copy of T for every diagonal entry of I.
 m = 5;
 I = speye(m);                               % m-by-m sparse identity
 e = ones(m, 1);                             % column of ones used to fill the diagonals
 T = spdiags([e -4 * e e],[-1 0 1], m, m);   % tridiagonal: -4 on the main diagonal, 1 on the sub-/super-diagonal
 kron(I, T)                                  % no trailing semicolon, so the result is displayed
KRON(I,T)

KRON(S,I)

#包括
#包括
#包括
#定义blockMatrixSize 3/---稀疏块矩阵的每个块都是blockMatrixSize x blockMatrixSize
/*******************/
/*iDivUp函数*/
/*******************/
intidivup(inta,intb){返回((a%b)!=0)?(a/b+1):(a/b);}
/********************/
/*CUDA错误检查*/
/********************/
//---归功于http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError\u t代码,常量字符*文件,int行,bool abort=true)
{
如果(代码!=cudaSuccess)
{
fprintf(标准,“GPUassert:%s%s%d\n”,cudaGetErrorString(代码)、文件、行);
if(abort){exit(code);}
}
}
void gpuerchk(cudaError_t ans){gpuAssert((ans),uuuu文件,uuu行uuu)}
/***************************/
/*解析错误检查*/
/***************************/
静态常量字符*\u cusparseGetErrorEnum(cusparseStatus\u t错误)
{
开关(错误)
{
案例分析(状态)(成功):
返回“CUSPARSE\u STATUS\u SUCCESS”;
未初始化状态的情况:
返回“CUSPARSE\u STATUS\u NOT\u INITIALIZED”;
案例CUSPARSE\u STATUS\u ALLOC\u失败:
返回“CUSPARSE\u STATUS\u ALLOC\u FAILED”;
案例库解析\状态\无效\值:
返回“CUSPARSE\u STATUS\u INVALID\u VALUE”;
案例CUSPARSE\u状态\u拱\u不匹配:
返回“CUSPARSE\u STATUS\u ARCH\u MISMATCH”;
案例分析\状态\映射\错误:
返回“CUSPARSE\u STATUS\u MAPPING\u ERROR”;
案例分析\状态\执行\失败:
返回“CUSPARSE\u STATUS\u EXECUTION\u FAILED”;
案例分析\状态\内部\错误:
返回“CUSPARSE\u STATUS\u INTERNAL\u ERROR”;
不支持状态矩阵类型的情况:
返回“CUSPASE\u状态\u矩阵\u类型\u不受支持”;
案例1\u状态\u零\u轴:
返回“CUSPARSE\u STATUS\u ZERO\u PIVOT”;
}
返回“”;
}
内联void\uu cusparseSafeCall(cusparseStatus\u t err,const char*文件,const int行)
{
if(CUSPARSE\u STATUS\u SUCCESS!=错误){
fprintf(stderr,“文件“%s”中的CUSPASE错误,第%d行,错误%s\n终止!\n“,\uuuu文件\uuuuuuuu,\uuuuuuu行\
_cusparseGetErrorEnum(err))\
断言(0)\
}
}
外部“C”void cusparseSafeCall(cusparseStatus_t err){{uuuu cusparseSafeCall(err,uuuu文件,uuu行uu)}
/********/
/*主要*/
/********/
int main(){
//---初始化cuSPARSE
cusparseHandle_t handle;cusparseSafeCall(cusparseCreate(&handle));
//---初始化矩阵描述符
cusparseMatDescr\t descrA,descrC;
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseCreateMatDescr(&descrC));
const int Mb=5;//--沿行的块数
const int Nb=5;//--沿列的块数
const int M=Mb*块矩阵大小;//---行数
const int N=Nb*blockMatrixSize;//--列数
const int nnzb=2*(Mb-1);/——非零块数
浮点h_块[blockMatrixSize*blockMatrixSize]={1.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f};
//---定义块稀疏矩阵的主向量
float*h_bsrValA=(float*)malloc(blockMatrixSize*blockMatrixSize*nnzb*sizeof(float));
int*h_bsrRowPtrA=(int*)malloc((Mb+1)*sizeof(int));
int*h_bsrColIndA=(int*)malloc(nnzb*sizeof(int));
对于(int k=0;k …（上面的代码清单是机器翻译后的残缺副本，在此处被截断；完整程序见下文）
 m = 5;
 I = speye(m);
 e = ones(m, 1);
 T = spdiags([e -4 * e e],[-1 0 1], m, m);
 kron(I, T)
#include <stdio.h>
#include <assert.h>

#include <cusparse.h>

#define blockMatrixSize         3           // --- Each block of the sparse block matrix is blockMatrixSize x blockMatrixSize

/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer division helper: returns a / b, plus one whenever the division
// leaves a non-zero remainder (i.e. the quotient rounded up for positive operands).
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Reports a failed CUDA runtime call.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file name to print with the message
//   line  - source line number to print with the message
//   abort - when true (the default) the process exits with `code` after reporting
// Does nothing when code is cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    if (abort) exit(code);
}

// Checks the status of a CUDA runtime call and reports/aborts through gpuAssert.
// This is a macro rather than a function so that __FILE__ and __LINE__ expand
// at the call site; as a function they always named this wrapper, which made
// the reported location useless. Usage is unchanged: gpuErrchk(cudaMalloc(...));
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
// Maps a cuSPARSE status code to the spelling of its enumerator.
// Returns "<unknown>" for any value not listed below.
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
// Expands to one `case` label that returns the enumerator's own name as a string.
#define CUSPARSE_STATUS_NAME_CASE(status) case status: return #status

    switch (error)
    {
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_SUCCESS);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_NOT_INITIALIZED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_ALLOC_FAILED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_INVALID_VALUE);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_ARCH_MISMATCH);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_MAPPING_ERROR);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_EXECUTION_FAILED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_INTERNAL_ERROR);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_ZERO_PIVOT);
    }

#undef CUSPARSE_STATUS_NAME_CASE

    return "<unknown>";
}

// Reports a failed cuSPARSE call and terminates the program.
//   err  - status returned by the cuSPARSE call being checked
//   file - source file of the call site (printed in the message)
//   line - source line of the call site (printed in the message)
// Does nothing when err is CUSPARSE_STATUS_SUCCESS.
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        // Print the location passed in by the caller; the original ignored
        // both parameters and always printed this function's own file/line.
        fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", file, line,
            _cusparseGetErrorEnum(err));
        assert(0);
        // assert() compiles to nothing under NDEBUG; make sure a release
        // build really terminates, as the message promises.
        exit(EXIT_FAILURE);
    }
}

// Call-site wrapper: cusparseSafeCall(cusparseXxx(...));
// A macro (not a function) so that __FILE__ and __LINE__ expand where the
// failing call is written instead of always naming the wrapper itself.
#define cusparseSafeCall(err) __cusparseSafeCall((err), __FILE__, __LINE__)

/********/
/* MAIN */
/********/
// Builds a block-diagonal matrix (the structure of kron(I, T)) in cuSPARSE
// BSR format, converts it BSR -> CSR -> dense on the device, copies the dense
// matrix back and prints it row by row.
// Returns 0 on success, 1 if a host allocation fails; CUDA / cuSPARSE
// failures are reported by gpuErrchk / cusparseSafeCall.
int main() {

    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    // --- Initialize matrix descriptors
    cusparseMatDescr_t descrA, descrC;
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseCreateMatDescr(&descrC));

    const int Mb = 5;                                       // --- Number of blocks along rows
    const int Nb = 5;                                       // --- Number of blocks along columns

    const int M = Mb * blockMatrixSize;                     // --- Number of rows
    const int N = Nb * blockMatrixSize;                     // --- Number of columns

    const int nnzb = Mb;                                    // --- Number of non-zero blocks (one per block row)
    const int blockElems = blockMatrixSize * blockMatrixSize; // --- Number of elements in one block

    // --- The block replicated along the diagonal. It is symmetric, so its
    //     row-/column-major interpretation does not matter.
    // NOTE(review): the accompanying MATLAB snippet builds T with -4 on the
    // diagonal and +1 next to it; this block has the opposite sign - confirm
    // which one is intended.
    float h_block[blockMatrixSize * blockMatrixSize] = { 4.f, -1.f, 0.f, -1.f, 4.f, -1.f, 0.f, -1.f, 4.f };

    // --- Host vectors defining the block-sparse matrix, plus the dense result
    float *h_bsrValA = (float *)malloc(blockElems * nnzb * sizeof(float));
    int *h_bsrRowPtrA = (int *)malloc((Mb + 1) * sizeof(int));
    int *h_bsrColIndA = (int *)malloc(nnzb * sizeof(int));
    float *h_A = (float *)malloc(M * N * sizeof(float));
    if (h_bsrValA == NULL || h_bsrRowPtrA == NULL || h_bsrColIndA == NULL || h_A == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_bsrValA); free(h_bsrRowPtrA); free(h_bsrColIndA); free(h_A);   // free(NULL) is a no-op
        return 1;
    }

    // --- Block row k holds exactly one block, located in block column k
    for (int k = 0; k < nnzb; k++) {
        memcpy(h_bsrValA + k * blockElems, h_block, blockElems * sizeof(float));
        h_bsrRowPtrA[k] = k;
        h_bsrColIndA[k] = k;
    }
    h_bsrRowPtrA[Mb] = nnzb;                                // --- End marker of the last block row

    // --- Device vectors defining the block-sparse matrix
    float *d_bsrValA;       gpuErrchk(cudaMalloc(&d_bsrValA, blockElems * nnzb * sizeof(float)));
    int *d_bsrRowPtrA;      gpuErrchk(cudaMalloc(&d_bsrRowPtrA, (Mb + 1) * sizeof(int)));
    int *d_bsrColIndA;      gpuErrchk(cudaMalloc(&d_bsrColIndA, nnzb * sizeof(int)));

    gpuErrchk(cudaMemcpy(d_bsrValA, h_bsrValA, blockElems * nnzb * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_bsrRowPtrA, h_bsrRowPtrA, (Mb + 1) * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_bsrColIndA, h_bsrColIndA, nnzb * sizeof(int), cudaMemcpyHostToDevice));

    // --- Transforming bsr to csr format
    cusparseDirection_t dir = CUSPARSE_DIRECTION_COLUMN;
    const int nnz = nnzb * blockElems;                      // --- Number of stored elements (explicit zeros included)
    int *d_csrRowPtrC;      gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
    int *d_csrColIndC;      gpuErrchk(cudaMalloc(&d_csrColIndC, nnz     * sizeof(int)));
    float *d_csrValC;       gpuErrchk(cudaMalloc(&d_csrValC, nnz        * sizeof(float)));
    cusparseSafeCall(cusparseSbsr2csr(handle, dir, Mb, Nb, descrA, d_bsrValA, d_bsrRowPtrA, d_bsrColIndA, blockMatrixSize, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));

    // --- Transforming csr to dense format (leading dimension M)
    float *d_A;             gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_A, M));

    gpuErrchk(cudaMemcpy(h_A, d_A, M * N * sizeof(float), cudaMemcpyDeviceToHost));

    // --- m is row index, n column index; element (m, n) is read at h_A[m + n * M]
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            printf("%f ", h_A[m + n * M]);
        }
        printf("\n");
    }

    // --- Release device memory, host memory and cuSPARSE objects
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_csrValC));
    gpuErrchk(cudaFree(d_csrColIndC));
    gpuErrchk(cudaFree(d_csrRowPtrC));
    gpuErrchk(cudaFree(d_bsrColIndA));
    gpuErrchk(cudaFree(d_bsrRowPtrA));
    gpuErrchk(cudaFree(d_bsrValA));

    free(h_A);
    free(h_bsrColIndA);
    free(h_bsrRowPtrA);
    free(h_bsrValA);

    cusparseSafeCall(cusparseDestroyMatDescr(descrC));
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroy(handle));

    return 0;
}
% Kronecker product with an identity as the SECOND operand: every nonzero
% entry of S is replaced by a 3-by-3 identity block.
m = 5;
I = speye(3);                       % 3-by-3 sparse identity
e = ones(m, 1);                     % column of ones used to fill the diagonals
S = spdiags([e e], [-1 1], m, m);   % ones on the sub- and super-diagonal, empty main diagonal
kron(S, I)                          % no trailing semicolon, so the result is displayed
#include <stdio.h>
#include <assert.h>

#include <cusparse.h>

#define blockMatrixSize         3           // --- Each block of the sparse block matrix is blockMatrixSize x blockMatrixSize

/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer division helper: returns a / b, plus one whenever the division
// leaves a non-zero remainder (i.e. the quotient rounded up for positive operands).
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Reports a failed CUDA runtime call.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file name to print with the message
//   line  - source line number to print with the message
//   abort - when true (the default) the process exits with `code` after reporting
// Does nothing when code is cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    if (abort) exit(code);
}

// Checks the status of a CUDA runtime call and reports/aborts through gpuAssert.
// This is a macro rather than a function so that __FILE__ and __LINE__ expand
// at the call site; as a function they always named this wrapper, which made
// the reported location useless. Usage is unchanged: gpuErrchk(cudaMalloc(...));
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
// Maps a cuSPARSE status code to the spelling of its enumerator.
// Returns "<unknown>" for any value not listed below.
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
// Expands to one `case` label that returns the enumerator's own name as a string.
#define CUSPARSE_STATUS_NAME_CASE(status) case status: return #status

    switch (error)
    {
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_SUCCESS);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_NOT_INITIALIZED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_ALLOC_FAILED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_INVALID_VALUE);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_ARCH_MISMATCH);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_MAPPING_ERROR);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_EXECUTION_FAILED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_INTERNAL_ERROR);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
        CUSPARSE_STATUS_NAME_CASE(CUSPARSE_STATUS_ZERO_PIVOT);
    }

#undef CUSPARSE_STATUS_NAME_CASE

    return "<unknown>";
}

// Reports a failed cuSPARSE call and terminates the program.
//   err  - status returned by the cuSPARSE call being checked
//   file - source file of the call site (printed in the message)
//   line - source line of the call site (printed in the message)
// Does nothing when err is CUSPARSE_STATUS_SUCCESS.
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        // Print the location passed in by the caller; the original ignored
        // both parameters and always printed this function's own file/line.
        fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", file, line,
            _cusparseGetErrorEnum(err));
        assert(0);
        // assert() compiles to nothing under NDEBUG; make sure a release
        // build really terminates, as the message promises.
        exit(EXIT_FAILURE);
    }
}

// Call-site wrapper: cusparseSafeCall(cusparseXxx(...));
// A macro (not a function) so that __FILE__ and __LINE__ expand where the
// failing call is written instead of always naming the wrapper itself.
#define cusparseSafeCall(err) __cusparseSafeCall((err), __FILE__, __LINE__)

/********/
/* MAIN */
/********/
// Builds the block structure of kron(S, I) - identity blocks on the first
// block sub- and super-diagonal - in cuSPARSE BSR format, converts it
// BSR -> CSR -> dense on the device, copies the dense matrix back and prints
// it row by row.
// Returns 0 on success, 1 if a host allocation fails; CUDA / cuSPARSE
// failures are reported by gpuErrchk / cusparseSafeCall.
int main() {

    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    // --- Initialize matrix descriptors
    cusparseMatDescr_t descrA, descrC;
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseCreateMatDescr(&descrC));

    const int Mb = 5;                                       // --- Number of blocks along rows
    const int Nb = 5;                                       // --- Number of blocks along columns

    const int M = Mb * blockMatrixSize;                     // --- Number of rows
    const int N = Nb * blockMatrixSize;                     // --- Number of columns

    const int nnzb = 2 * (Mb - 1);                          // --- Number of non-zero blocks
    const int blockElems = blockMatrixSize * blockMatrixSize; // --- Number of elements in one block

    // --- Every stored block is the blockMatrixSize x blockMatrixSize identity
    float h_block[blockMatrixSize * blockMatrixSize] = { 1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f };

    // --- Host vectors defining the block-sparse matrix, plus the dense result
    float *h_bsrValA = (float *)malloc(blockElems * nnzb * sizeof(float));
    int *h_bsrRowPtrA = (int *)malloc((Mb + 1) * sizeof(int));
    int *h_bsrColIndA = (int *)malloc(nnzb * sizeof(int));
    float *h_A = (float *)malloc(M * N * sizeof(float));
    if (h_bsrValA == NULL || h_bsrRowPtrA == NULL || h_bsrColIndA == NULL || h_A == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_bsrValA); free(h_bsrRowPtrA); free(h_bsrColIndA); free(h_A);   // free(NULL) is a no-op
        return 1;
    }

    for (int k = 0; k < nnzb; k++) memcpy(h_bsrValA + k * blockElems, h_block, blockElems * sizeof(float));

    // --- Block row r holds a block in block column r - 1 (if it exists) and
    //     one in block column r + 1 (if it exists). For Mb = 5 this yields
    //     row pointers {0, 1, 3, 5, 7, 8} and column indices {1, 0, 2, 1, 3, 2, 4, 3}.
    int nextBlock = 0;
    for (int r = 0; r < Mb; r++) {
        h_bsrRowPtrA[r] = nextBlock;
        if (r > 0)      h_bsrColIndA[nextBlock++] = r - 1;
        if (r < Mb - 1) h_bsrColIndA[nextBlock++] = r + 1;
    }
    h_bsrRowPtrA[Mb] = nextBlock;                           // --- End marker of the last block row
    assert(nextBlock == nnzb);

    // --- Device vectors defining the block-sparse matrix
    float *d_bsrValA;       gpuErrchk(cudaMalloc(&d_bsrValA, blockElems * nnzb * sizeof(float)));
    int *d_bsrRowPtrA;      gpuErrchk(cudaMalloc(&d_bsrRowPtrA, (Mb + 1) * sizeof(int)));
    int *d_bsrColIndA;      gpuErrchk(cudaMalloc(&d_bsrColIndA, nnzb * sizeof(int)));

    gpuErrchk(cudaMemcpy(d_bsrValA, h_bsrValA, blockElems * nnzb * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_bsrRowPtrA, h_bsrRowPtrA, (Mb + 1) * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_bsrColIndA, h_bsrColIndA, nnzb * sizeof(int), cudaMemcpyHostToDevice));

    // --- Transforming bsr to csr format
    cusparseDirection_t dir = CUSPARSE_DIRECTION_COLUMN;
    const int nnz = nnzb * blockElems;                      // --- Number of stored elements (explicit zeros included)
    int *d_csrRowPtrC;      gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
    int *d_csrColIndC;      gpuErrchk(cudaMalloc(&d_csrColIndC, nnz     * sizeof(int)));
    float *d_csrValC;       gpuErrchk(cudaMalloc(&d_csrValC, nnz        * sizeof(float)));
    cusparseSafeCall(cusparseSbsr2csr(handle, dir, Mb, Nb, descrA, d_bsrValA, d_bsrRowPtrA, d_bsrColIndA, blockMatrixSize, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));

    // --- Transforming csr to dense format (leading dimension M)
    float *d_A;             gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_A, M));

    gpuErrchk(cudaMemcpy(h_A, d_A, M * N * sizeof(float), cudaMemcpyDeviceToHost));

    // --- m is row index, n column index; element (m, n) is read at h_A[m + n * M]
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            printf("%f ", h_A[m + n * M]);
        }
        printf("\n");
    }

    // --- Release device memory, host memory and cuSPARSE objects
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_csrValC));
    gpuErrchk(cudaFree(d_csrColIndC));
    gpuErrchk(cudaFree(d_csrRowPtrC));
    gpuErrchk(cudaFree(d_bsrColIndA));
    gpuErrchk(cudaFree(d_bsrRowPtrA));
    gpuErrchk(cudaFree(d_bsrValA));

    free(h_A);
    free(h_bsrColIndA);
    free(h_bsrRowPtrA);
    free(h_bsrValA);

    cusparseSafeCall(cusparseDestroyMatDescr(descrC));
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroy(handle));

    return 0;
}