Cuda 用cuSPARSE将密集矩阵转换为稀疏CSR格式
我想使用Cuda 用cuSPARSE将密集矩阵转换为稀疏CSR格式,cuda,Cuda,我想使用Scsrmvcusparse函数 阅读中的文档,我不知道如何定义csrRowPtrA和csrColIndA: csrRowPtrA:m+1元素的整数数组,包含 每行和最后一行的末尾加一 csrColIndA:nnz的整数数组(=csrRowPtrA(m)-csrRowPtrA(0)) 矩阵A的非零元素的列索引 例如: float *devRow; cudaMalloc((void **)&devRow, (m+1)*sizeof(float)); 如果A是矩阵,则: f
Scsrmv
cusparse函数
阅读中的文档,我不知道如何定义csrRowPtrA
和csrColIndA
:
csrRowPtrA:包含m+1个元素的整数数组,存放
每一行的起始位置,以及最后一行的结束位置加一
csrColIndA:nnz的整数数组(=csrRowPtrA(m)-csrRowPtrA(0))
矩阵A的非零元素的列索引
例如:
float *devRow;
cudaMalloc((void **)&devRow, (m+1)*sizeof(float));
如果A
是矩阵,则:
for (int i=0; i<m; i+= n) //m is rows , n is columns
devRow[i] = A[i];
您可以使用自己编写的代码将密集矩阵转换为稀疏矩阵。对于CSR(压缩稀疏行)格式,您也可以使用CUSPARSE来实现这一点。
CSR稀疏矩阵表示的一般格式在许多地方都有文档记录,包括CUSPARSE手册。您可以使用自己编写的代码将密集矩阵转换为稀疏矩阵。对于CSR(压缩稀疏行)格式,您也可以使用CUSPARSE来实现这一点。
CSR稀疏矩阵表示的一般格式在许多地方都有文档记录,包括CUSPARSE手册。根据Robert Crovella的回答,这是一个关于如何将以密集格式存储的稀疏矩阵转换为CSR(压缩行存储)格式的完整示例。我希望它对其他用户有用。
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include "Utilities.cuh"
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
/*
 * Translate a cuSPARSE status code into the spelling of its enumerator.
 *
 * error : status value to describe
 * returns a pointer to a string literal; any code not listed in the switch
 *         yields "<unknown>".
 */
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    /* Fallback used when no case below matches. */
    const char *name = "<unknown>";

    switch (error)
    {
        case CUSPARSE_STATUS_SUCCESS:
            name = "CUSPARSE_STATUS_SUCCESS";
            break;
        case CUSPARSE_STATUS_NOT_INITIALIZED:
            name = "CUSPARSE_STATUS_NOT_INITIALIZED";
            break;
        case CUSPARSE_STATUS_ALLOC_FAILED:
            name = "CUSPARSE_STATUS_ALLOC_FAILED";
            break;
        case CUSPARSE_STATUS_INVALID_VALUE:
            name = "CUSPARSE_STATUS_INVALID_VALUE";
            break;
        case CUSPARSE_STATUS_ARCH_MISMATCH:
            name = "CUSPARSE_STATUS_ARCH_MISMATCH";
            break;
        case CUSPARSE_STATUS_MAPPING_ERROR:
            name = "CUSPARSE_STATUS_MAPPING_ERROR";
            break;
        case CUSPARSE_STATUS_EXECUTION_FAILED:
            name = "CUSPARSE_STATUS_EXECUTION_FAILED";
            break;
        case CUSPARSE_STATUS_INTERNAL_ERROR:
            name = "CUSPARSE_STATUS_INTERNAL_ERROR";
            break;
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            name = "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
            break;
        case CUSPARSE_STATUS_ZERO_PIVOT:
            name = "CUSPARSE_STATUS_ZERO_PIVOT";
            break;
    }

    return name;
}
/*
 * Check the status of a cuSPARSE call and stop the program if it failed.
 *
 * err  : status returned by the cuSPARSE call being checked
 * file : source file name to print in the diagnostic
 * line : source line number to print in the diagnostic
 *
 * Does nothing when err is CUSPARSE_STATUS_SUCCESS. Otherwise prints the
 * location, the numeric status and its enumerator name to stderr, resets the
 * device and terminates.
 */
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS == err) return;

    /* One conversion per argument: file (%s), line (%d), code (%d), name (%s). */
    fprintf(stderr, "CUSPARSE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
            file, line, static_cast<int>(err), _cusparseGetErrorEnum(err));

    cudaDeviceReset();
    assert(0);
    /* assert() compiles to nothing under NDEBUG, so make termination explicit. */
    exit(EXIT_FAILURE);
}
/*
 * C-linkage convenience wrapper around __cusparseSafeCall.
 *
 * NOTE: this is a function, not a macro, so __FILE__ and __LINE__ expand
 * here; the location reported on failure is this wrapper, not the call site.
 */
extern "C" void cusparseSafeCall(cusparseStatus_t err)
{
    __cusparseSafeCall(err, __FILE__, __LINE__);
}
/********/
/* MAIN */
/********/
/*
 * Demo: convert a 4 x 5 dense matrix (column-major, double precision) to CSR
 * with cuSPARSE and print the result.
 *
 * Steps: upload the dense matrix, count non-zeros per row (cusparseDnnz),
 * allocate the CSR arrays from that count, convert (cusparseDdense2csr),
 * download and print, then release every resource.
 *
 * NOTE(review): cusparseDdense2csr belongs to the legacy cuSPARSE API and may
 * be unavailable in recent CUDA toolkits - confirm against the toolkit in use.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails. CUDA and
 * cuSPARSE failures are handled by gpuErrchk / cusparseSafeCall.
 */
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;
    cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;    // --- Number of rows
    const int Ncols = 5;    // --- Number of columns

    // --- Host side dense matrix
    double *h_A_dense = (double *)malloc(Nrows * Ncols * sizeof(*h_A_dense));
    if (h_A_dense == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    // --- Column-major ordering: element (r, c) lives at index r + c * Nrows
    h_A_dense[0] = 1.0; h_A_dense[4] = 4.0; h_A_dense[8]  = 0.0; h_A_dense[12] = 0.0; h_A_dense[16] = 0.0;
    h_A_dense[1] = 0.0; h_A_dense[5] = 2.0; h_A_dense[9]  = 3.0; h_A_dense[13] = 0.0; h_A_dense[17] = 0.0;
    h_A_dense[2] = 5.0; h_A_dense[6] = 0.0; h_A_dense[10] = 0.0; h_A_dense[14] = 7.0; h_A_dense[18] = 8.0;
    h_A_dense[3] = 0.0; h_A_dense[7] = 0.0; h_A_dense[11] = 9.0; h_A_dense[15] = 0.0; h_A_dense[19] = 6.0;

    // --- Device side dense matrix: allocate and upload
    double *d_A_dense;
    gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));

    int nnz = 0;            // --- Number of nonzero elements in dense matrix
    const int lda = Nrows;  // --- Leading dimension of dense matrix

    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector;
    gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));

    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    if (h_nnzPerVector == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i)
        printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side CSR matrix: values, row pointers (Nrows + 1), column indices
    double *d_A;
    gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices;
    gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;
    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side CSR matrix
    double *h_A            = (double *)malloc(nnz * sizeof(*h_A));
    int    *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int    *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    // malloc(0) may legitimately return NULL, so only treat NULL as failure for non-empty requests
    if (h_A_RowIndices == NULL || (nnz > 0 && (h_A == NULL || h_A_ColIndices == NULL))) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i)
        printf("A[%i] = %.0f ", i, h_A[i]);
    printf("\n");

    for (int i = 0; i < (Nrows + 1); ++i)
        printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]);
    printf("\n");

    for (int i = 0; i < nnz; ++i)
        printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    // --- Release host buffers
    free(h_A_ColIndices);
    free(h_A_RowIndices);
    free(h_A);
    free(h_nnzPerVector);
    free(h_A_dense);

    // --- Release device buffers
    gpuErrchk(cudaFree(d_A_ColIndices));
    gpuErrchk(cudaFree(d_A_RowIndices));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_nnzPerVector));
    gpuErrchk(cudaFree(d_A_dense));

    // --- Release cuSPARSE objects
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroy(handle));

    return 0;
}
#包括
#包括
#包括
#包括
#包括
#包括“Utilities.cuh”
/***************************/
/*解析错误检查*/
/***************************/
静态常量字符*\u cusparseGetErrorEnum(cusparseStatus\u t错误)
{
开关(错误)
{
案例分析(状态)(成功):
返回“CUSPARSE\u STATUS\u SUCCESS”;
未初始化状态的情况:
返回“CUSPARSE\u STATUS\u NOT\u INITIALIZED”;
案例CUSPARSE\u STATUS\u ALLOC\u失败:
返回“CUSPARSE\u STATUS\u ALLOC\u FAILED”;
案例库解析\状态\无效\值:
返回“CUSPARSE\u STATUS\u INVALID\u VALUE”;
案例CUSPARSE\u状态\u拱\u不匹配:
返回“CUSPARSE\u STATUS\u ARCH\u MISMATCH”;
案例分析\状态\映射\错误:
返回“CUSPARSE\u STATUS\u MAPPING\u ERROR”;
案例分析\状态\执行\失败:
返回“CUSPARSE\u STATUS\u EXECUTION\u FAILED”;
案例分析\状态\内部\错误:
返回“CUSPARSE\u STATUS\u INTERNAL\u ERROR”;
不支持状态矩阵类型的情况:
返回“CUSPASE\u状态\u矩阵\u类型\u不受支持”;
案例1\u状态\u零\u轴:
返回“CUSPARSE\u STATUS\u ZERO\u PIVOT”;
}
返回“”;
}
内联void\uu cusparseSafeCall(cusparseStatus\u t err,const char*文件,const int行)
{
if(CUSPARSE\u STATUS\u SUCCESS!=错误){
fprintf(stderr,“文件“%s”中的CUSPARSE错误,第%Ndims\Nobjs%s行\n错误%Ndims:%s\n正在删除!\Nobjs“,\uuuu文件\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu\
_cusparseGetErrorEnum(err))\
cudaDeviceReset();断言(0)\
}
}
外部“C”void cusparseSafeCall(cusparseStatus_t err){{uuuu cusparseSafeCall(err,uuuu文件,uuu行uu)}
/********/
/*主要*/
/********/
int main()
{
//---初始化cuSPARSE
cusparseHandle_t handle;cusparseSafeCall(cusparseCreate(&handle));
常量int Nrows=4;//--行数
常量int Ncols=5;//--列数
//---主机侧密集矩阵
double*h_A_densed=(double*)malloc(Nrows*Ncols*sizeof(*h_A_densed));
//---列主要顺序
高密度[0]=1.0f;高密度[4]=4.0f;高密度[8]=0.0f;高密度[12]=0.0f;高密度[16]=0.0f;
高密度[1]=0.0f;高密度[5]=2.0f;高密度[9]=3.0f;高密度[13]=0.0f;高密度[17]=0.0f;
高密度[2]=5.0f;高密度[6]=0.0f;高密度[10]=0.0f;高密度[14]=7.0f;高密度[18]=8.0f;
高密度[3]=0.0f;高密度[7]=0.0f;高密度[11]=9.0f;高密度[15]=0.0f;高密度[19]=6.0f;
//创建设备阵列并将主机复制到其中
双*d_A_稠密;gpuErrchk(Cudamaloc(&d_A_稠密,Nrows*Ncols*sizeof(*d_A_稠密));
gpuErrchk(cudaMemcpy(d_A_密集,h_A_密集,Nrows*Ncols*sizeof(*d_A_密集),cudaMemcpyHostToDevice));
//---稀疏矩阵A的描述符
cusparseMatDescr_t descrA;cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSetMatType(描述、CUSPARSE_矩阵_类型_概述);
CusParseSetMatiIndexBase(描述,CUSPARSE_索引_基数_零);
int nnz=0;/——稠密矩阵中非零元素的个数
const int lda=Nrows;//--稠密矩阵的前导维数
//---每行非零元素的设备端数量
int*d_nnzPerVector;gpuerchk(cudaMalloc(&d_nnzPerVector,Nrows*sizeof(*d_nnzPerVector));
CusParsesSafeCall(句柄、CusParsedNz方向行、Nrows、Ncols、descrA、d_A_稠密、lda、d_nnzPerVector和nnz));
//---每行的主机端非零元素数
int*h_nnzPerVector=(int*)malloc(Nrows*sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector,d_nnzPerVector,Nrows*sizeof(*h_nnzPerVector),cudaMemcpyDeviceToHost));
printf(“密集矩阵中非零元素的数量=%i\n\n”,nnz);
对于(int i=0;i
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include "Utilities.cuh"
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
/*
 * Translate a cuSPARSE status code into the spelling of its enumerator.
 *
 * error : status value to describe
 * returns a pointer to a string literal; any code not listed in the switch
 *         yields "<unknown>".
 */
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    /* Fallback used when no case below matches. */
    const char *name = "<unknown>";

    switch (error)
    {
        case CUSPARSE_STATUS_SUCCESS:
            name = "CUSPARSE_STATUS_SUCCESS";
            break;
        case CUSPARSE_STATUS_NOT_INITIALIZED:
            name = "CUSPARSE_STATUS_NOT_INITIALIZED";
            break;
        case CUSPARSE_STATUS_ALLOC_FAILED:
            name = "CUSPARSE_STATUS_ALLOC_FAILED";
            break;
        case CUSPARSE_STATUS_INVALID_VALUE:
            name = "CUSPARSE_STATUS_INVALID_VALUE";
            break;
        case CUSPARSE_STATUS_ARCH_MISMATCH:
            name = "CUSPARSE_STATUS_ARCH_MISMATCH";
            break;
        case CUSPARSE_STATUS_MAPPING_ERROR:
            name = "CUSPARSE_STATUS_MAPPING_ERROR";
            break;
        case CUSPARSE_STATUS_EXECUTION_FAILED:
            name = "CUSPARSE_STATUS_EXECUTION_FAILED";
            break;
        case CUSPARSE_STATUS_INTERNAL_ERROR:
            name = "CUSPARSE_STATUS_INTERNAL_ERROR";
            break;
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            name = "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
            break;
        case CUSPARSE_STATUS_ZERO_PIVOT:
            name = "CUSPARSE_STATUS_ZERO_PIVOT";
            break;
    }

    return name;
}
/*
 * Check the status of a cuSPARSE call and stop the program if it failed.
 *
 * err  : status returned by the cuSPARSE call being checked
 * file : source file name to print in the diagnostic
 * line : source line number to print in the diagnostic
 *
 * Does nothing when err is CUSPARSE_STATUS_SUCCESS. Otherwise prints the
 * location, the numeric status and its enumerator name to stderr, resets the
 * device and terminates.
 */
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS == err) return;

    /* One conversion per argument: file (%s), line (%d), code (%d), name (%s). */
    fprintf(stderr, "CUSPARSE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
            file, line, static_cast<int>(err), _cusparseGetErrorEnum(err));

    cudaDeviceReset();
    assert(0);
    /* assert() compiles to nothing under NDEBUG, so make termination explicit. */
    exit(EXIT_FAILURE);
}
/*
 * C-linkage convenience wrapper around __cusparseSafeCall.
 *
 * NOTE: this is a function, not a macro, so __FILE__ and __LINE__ expand
 * here; the location reported on failure is this wrapper, not the call site.
 */
extern "C" void cusparseSafeCall(cusparseStatus_t err)
{
    __cusparseSafeCall(err, __FILE__, __LINE__);
}
/********/
/* MAIN */
/********/
/*
 * Demo: convert a 4 x 5 dense matrix (column-major, double precision) to CSR
 * with cuSPARSE and print the result.
 *
 * Steps: upload the dense matrix, count non-zeros per row (cusparseDnnz),
 * allocate the CSR arrays from that count, convert (cusparseDdense2csr),
 * download and print, then release every resource.
 *
 * NOTE(review): cusparseDdense2csr belongs to the legacy cuSPARSE API and may
 * be unavailable in recent CUDA toolkits - confirm against the toolkit in use.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails. CUDA and
 * cuSPARSE failures are handled by gpuErrchk / cusparseSafeCall.
 */
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;
    cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;    // --- Number of rows
    const int Ncols = 5;    // --- Number of columns

    // --- Host side dense matrix
    double *h_A_dense = (double *)malloc(Nrows * Ncols * sizeof(*h_A_dense));
    if (h_A_dense == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    // --- Column-major ordering: element (r, c) lives at index r + c * Nrows
    h_A_dense[0] = 1.0; h_A_dense[4] = 4.0; h_A_dense[8]  = 0.0; h_A_dense[12] = 0.0; h_A_dense[16] = 0.0;
    h_A_dense[1] = 0.0; h_A_dense[5] = 2.0; h_A_dense[9]  = 3.0; h_A_dense[13] = 0.0; h_A_dense[17] = 0.0;
    h_A_dense[2] = 5.0; h_A_dense[6] = 0.0; h_A_dense[10] = 0.0; h_A_dense[14] = 7.0; h_A_dense[18] = 8.0;
    h_A_dense[3] = 0.0; h_A_dense[7] = 0.0; h_A_dense[11] = 9.0; h_A_dense[15] = 0.0; h_A_dense[19] = 6.0;

    // --- Device side dense matrix: allocate and upload
    double *d_A_dense;
    gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));

    int nnz = 0;            // --- Number of nonzero elements in dense matrix
    const int lda = Nrows;  // --- Leading dimension of dense matrix

    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector;
    gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));

    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    if (h_nnzPerVector == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i)
        printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side CSR matrix: values, row pointers (Nrows + 1), column indices
    double *d_A;
    gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices;
    gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;
    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side CSR matrix
    double *h_A            = (double *)malloc(nnz * sizeof(*h_A));
    int    *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int    *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    // malloc(0) may legitimately return NULL, so only treat NULL as failure for non-empty requests
    if (h_A_RowIndices == NULL || (nnz > 0 && (h_A == NULL || h_A_ColIndices == NULL))) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i)
        printf("A[%i] = %.0f ", i, h_A[i]);
    printf("\n");

    for (int i = 0; i < (Nrows + 1); ++i)
        printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]);
    printf("\n");

    for (int i = 0; i < nnz; ++i)
        printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    // --- Release host buffers
    free(h_A_ColIndices);
    free(h_A_RowIndices);
    free(h_A);
    free(h_nnzPerVector);
    free(h_A_dense);

    // --- Release device buffers
    gpuErrchk(cudaFree(d_A_ColIndices));
    gpuErrchk(cudaFree(d_A_RowIndices));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_nnzPerVector));
    gpuErrchk(cudaFree(d_A_dense));

    // --- Release cuSPARSE objects
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroy(handle));

    return 0;
}