CUDA: dense-to-sparse and sparse-to-dense conversion using cuSPARSE


The following program uses cuSPARSE to test dense-to-sparse conversion. It produces garbage in the first several lines of output. However, if I move the line marked (2) to the position right after the line marked (1), the program works fine. Can anyone tell me what the cause is?

EDIT: To make the demonstration clearer, I rewrote the program with Thrust; the same problem persists.

EDIT: As Robert suggested, I changed it back to the version without Thrust and added API-level error-checking code.

#include <iostream>
#include <cusparse_v2.h>

using std::cerr;
using std::cout;
using std::endl;

#define WRAP(x) do {x} while (0)
#define CHKcusparse(x) WRAP(                                        \
  cusparseStatus_t err = (x);                                       \
  if (err != CUSPARSE_STATUS_SUCCESS) {                             \
    cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line "   \
         << __LINE__ << " of " << __FILE__ << ": " << #x << endl;   \
    exit(1);                                                        \
  }                                                                 \
)
#define CHKcuda(x) WRAP(                                             \
  cudaError_t err = (x);                                             \
  if (err != cudaSuccess) {                                          \
    cerr << "Cuda Error #" << int(err) << ", \""                     \
         << cudaGetErrorString(err) << "\" at Line " << __LINE__     \
         << " of " << __FILE__ << ": " << #x << endl;                \
    exit(1);                                                         \
  }                                                                  \
)
#define ALLOC(X, T, N) do {                            \
  h##X = (T*) malloc(sizeof(T) * (N));                 \
  CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \
} while(0)

int main() {
  srand(100);

  cusparseHandle_t g_cusparse_handle;
  CHKcusparse(cusparseCreate(&g_cusparse_handle));

  const int n = 100, in_degree = 10;
  int nnz = n * in_degree, nn = n * n;

  int *dnnz, *dridx, *dcols;
  int *hnnz, *hridx, *hcols;
  float *dvals, *dmat;
  float *hvals, *hmat;

  // (1) The number of non-zeros in each column.
  ALLOC(nnz, int, n);

  // The dense matrix.
  ALLOC(mat, float, nn);

  // The values in sparse matrix.
  ALLOC(vals, float, nnz);

  // (2) The row indices of the sparse matrix.
  ALLOC(ridx, int, nnz);

  // The column offsets of the sparse matrix.
  ALLOC(cols, int, n+1);

  // Fill and copy dense matrix and number of non-zeros.
  for (int i = 0; i < nn; i++) {hmat[i] = rand();}
  for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
  CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
  CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
  CHKcuda(cudaDeviceSynchronize());

  // Perform dense to CSC format
  cusparseMatDescr_t cspMatDesc;
  CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
  CHKcusparse(cusparseSdense2csc(
      g_cusparse_handle, n, n, cspMatDesc, dmat, n,
      dnnz, dvals, dridx, dcols
  ));

  // Copy row indices back.
  CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
  CHKcuda(cudaDeviceSynchronize());
  CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));

  // Display row indices.
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < in_degree; j++) {
      std::cout << hridx[i * in_degree + j] << ", ";
    }
    std::cout << std::endl;
  }

  CHKcuda(cudaFree(dnnz));
  CHKcuda(cudaFree(dvals));
  CHKcuda(cudaFree(dridx));
  CHKcuda(cudaFree(dcols));
  CHKcuda(cudaFree(dmat));
  free(hnnz);
  free(hmat);
  free(hvals);
  free(hridx);
  free(hcols);
  return 0;
}
The basic problem is that you are passing internally inconsistent data to cusparseSdense2csc. You are passing a dense matrix that has 100 non-zero elements per column, but you are telling cuSPARSE that each column has only 10 non-zero elements.

If you run the code with cuda-memcheck, you will see errors coming out of cuSPARSE.
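
For example (assuming the program was built to ./a.out):

cuda-memcheck ./a.out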

For this code, you can fix the issue by changing the in_degree variable to 100.
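
That is, keeping everything else in the program unchanged:

const int n = 100, in_degree = 100;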


For the general case, cuSPARSE provides a convenient routine, cusparse<t>nnz(), to populate the number of non-zero elements per row or column correctly.
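
A minimal sketch of how that routine could be combined with the variables from the question (this assumes cspMatDesc has already been created at this point, which the original program only does later):

int totalNnz = 0;
// Count the actual non-zeros per column of the n x n matrix dmat (leading
// dimension n); dnnz receives the per-column counts expected by dense2csc.
CHKcusparse(cusparseSnnz(g_cusparse_handle, CUSPARSE_DIRECTION_COLUMN,
                         n, n, cspMatDesc, dmat, n, dnnz, &totalNnz));
// totalNnz now holds the true total and can be used to size dvals and dridx.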

As Robert Crovella has already pointed out, the dense-to-sparse conversion can be performed efficiently with cuSPARSE using the cusparse<t>nnz() and cusparse<t>dense2csr() routines. The reverse conversion can be done with the cusparse<t>csr2dense() routine. Below is a fully worked example showing how to convert from dense to sparse and back with cuSPARSE, using the CSR format.
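
The listings below also include a Utilities.cuh that is not reproduced here; it supplies the gpuErrchk and cusparseSafeCall error-checking helpers. As a rough, hypothetical stand-in (the original file may differ), they could be defined like this:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cusparse_v2.h>

// Hypothetical minimal versions of the helpers declared in Utilities.cuh.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(int(code));
    }
}

#define cusparseSafeCall(err) { cusparseAssert((err), __FILE__, __LINE__); }
inline void cusparseAssert(cusparseStatus_t status, const char *file, int line) {
    if (status != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "cuSPARSE error %d at %s:%d\n", int(status), file, line);
        exit(1);
    }
}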

cuSparseUtilities.cuh

#ifndef CUSPARSEUTILITIES_CUH
#define CUSPARSEUTILITIES_CUH

#include "cusparse_v2.h"

void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t);
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
    int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
    const cusparseHandle_t handle, const int Nrows, const int Ncols);

#endif
cuSparseUtilities.cu

#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"

/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}

/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A, 
                   int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA, 
                   const cusparseHandle_t handle, const int Nrows, const int Ncols) {

    const int lda = Nrows;                      // --- Leading dimension of dense matrix

    gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int)));

    // --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz));

    // --- Device side sparse matrix
    gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int)));
    gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0]));

}
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <cusparse_v2.h>

#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"

/********/
/* MAIN */
/********/
int main() {

    cusparseHandle_t    handle;

    // --- Initialize cuSPARSE
    cusparseSafeCall(cusparseCreate(&handle));

    cusparseMatDescr_t  descrA = 0;

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int Nrows = 5;                        // --- Number of rows
    const int Ncols = 4;                        // --- Number of columns
    const int N = Nrows;

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense));

    // --- Column-major storage
    h_A_dense[ 0] = 0.4612f;  h_A_dense[ 5] = -0.0006f;   h_A_dense[10] = 1.3f;     h_A_dense[15] = 0.0f;
    h_A_dense[ 1] = 0.0f;     h_A_dense[ 6] = 1.443f;     h_A_dense[11] = 0.0f;     h_A_dense[16] = 0.0f;
    h_A_dense[ 2] = -0.0006f; h_A_dense[ 7] = 0.4640f;    h_A_dense[12] = 0.0723f;  h_A_dense[17] = 0.0f;
    h_A_dense[ 3] = 0.3566f;  h_A_dense[ 8] = 0.0723f;    h_A_dense[13] = 0.7543f;  h_A_dense[18] = 0.0f;
    h_A_dense[ 4] = 0.f;      h_A_dense[ 9] = 0.0f;       h_A_dense[14] = 0.0f;     h_A_dense[19] = 0.1f;

    // --- Create device array and copy host array to it
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    /*******************************/
    /* FROM DENSE TO SPARSE MATRIX */
    /*******************************/
    // --- Descriptor for sparse matrix A
    setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE);

    int nnz = 0;                                // --- Number of nonzero elements in dense matrix
    int *d_nnzPerVector;                        // --- Device side number of nonzero elements per row

    double *d_A;                                // --- Sparse matrix values - array of size nnz
    int *d_A_RowIndices;                        // --- "Row indices"
    int *d_A_ColIndices;                        // --- "Column indices"

    dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols);

    /*******************************************************/
    /* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */
    /*******************************************************/
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Host side sparse matrix
    double *h_A = (double *)malloc(nnz * sizeof(double));
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix in CSR format\n\n");
    for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    /*******************************/
    /* FROM SPARSE TO DENSE MATRIX */
    /*******************************/
    double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double)));
    cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices,
                                        d_A_denseReconstructed, Nrows));

    /*******************************************************/
    /* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */
    /*******************************************************/
    double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double));
    gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost));

    printf("\nReconstructed dense matrix \n");
    for (int m = 0; m < Nrows; m++) {
        for (int n = 0; n < Ncols; n++) 
            printf("%f\t", h_A_denseReconstructed[n * Nrows + m]);
        printf("\n");
    }

    return 0;
}
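
Assuming the three files above live in the same directory, the example can be built along these lines (the exact command and flags are an assumption, not part of the original post, and depend on your CUDA toolkit):

nvcc kernel.cu cuSparseUtilities.cu -lcusparse -o dense_sparse_demo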