Parallel implementation of multiple SVDs using CUDA


I'm new to parallel programming using GPUs, so I apologize if the question is broad or vague. I'm aware there are some parallel SVD functions in the CULA library, but what should the strategy be if I have a large number of relatively small matrices to factorize? For example, I have n matrices of dimension d, with n large and d small. How can I parallelize this process? Could anyone give me a hint?

You could have a look at the CULA blog post discussing your problem.

EDIT

From what I understand from your comment below, you would like each thread to compute a separate SVD. So, basically, each thread should execute a standard, sequential SVD scheme. For that, here are some possibly useful references:

However, if you use this approach, I'm afraid you will no longer be able to use cuBLAS, since those are host functions that cannot be called from the device (unless you have compute capability >= 3.5; see the simpleDevLibCUBLAS example). But basically, this way you would be implementing the batch concept by yourself, as sketched below.
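To make that concrete, here is a minimal, hypothetical sketch of such do-it-yourself batching: a one-sided (Hestenes) Jacobi kernel in which each thread sequentially orthogonalizes the columns of its own small d x d matrix, after which the singular values are the column norms. The fixed dimension D, the sweep count, and the kernel name are assumptions made for this example, not part of any library.

#include <cuda_runtime.h>
#include <math.h>

#define D      4     // small matrix dimension d (assumed fixed at compile time)
#define SWEEPS 10    // fixed number of one-sided Jacobi sweeps (assumed sufficient)

// --- One thread = one d x d column-major matrix. After the sweeps, the columns
//     of A are mutually orthogonal and the singular values are their norms.
__global__ void batchedJacobiSVD(double *A, double *S, int numMatrices)
{
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= numMatrices) return;
    double *a = A + k * D * D;
    for (int sweep = 0; sweep < SWEEPS; sweep++)
        for (int p = 0; p < D - 1; p++)
            for (int q = p + 1; q < D; q++) {
                // --- Gram entries of the column pair (p, q)
                double alpha = 0., beta = 0., gamma = 0.;
                for (int i = 0; i < D; i++) {
                    alpha += a[i + p * D] * a[i + p * D];
                    beta  += a[i + q * D] * a[i + q * D];
                    gamma += a[i + p * D] * a[i + q * D];
                }
                if (fabs(gamma) < 1.e-14) continue;
                // --- Jacobi rotation annihilating the off-diagonal entry gamma
                double zeta = (beta - alpha) / (2. * gamma);
                double t    = copysign(1. / (fabs(zeta) + sqrt(1. + zeta * zeta)), zeta);
                double c    = rsqrt(1. + t * t), s = c * t;
                for (int i = 0; i < D; i++) {
                    double tmp   = a[i + p * D];
                    a[i + p * D] = c * tmp - s * a[i + q * D];
                    a[i + q * D] = s * tmp + c * a[i + q * D];
                }
            }
    // --- Singular values are the norms of the orthogonalized columns
    for (int p = 0; p < D; p++) {
        double nrm2 = 0.;
        for (int i = 0; i < D; i++) nrm2 += a[i + p * D] * a[i + p * D];
        S[k * D + p] = sqrt(nrm2);
    }
}

A launch such as batchedJacobiSVD<<<iDivUp(numMatrices, 256), 256>>>(d_A, d_S, numMatrices) would process all the matrices at once. Note, however, that the strided per-thread accesses are uncoalesced, so a performant version would first stage each matrix in registers or shared memory.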

If you instead decide to go with a more standard parallel GPU implementation, the references below may be of interest:


My previous answer is now out of date. As of February 2015, CUDA 7 (currently in release candidate) offers full SVD capabilities in its cuSOLVER library. Below, I provide an example of computing the singular value decomposition with CUDA cuSOLVER.

Concerning the specific issue you are raising (computing the SVD of several matrices of small size), you should modify the example I provide below by using streams: associate a stream to each task with cudaStreamCreate() and cusolverDnSetStream(). A sketch of this adaptation follows.
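This is a minimal sketch only, not a complete program: it reuses solver_handle, d_A, d_S, d_U, d_V, work, and devInfo from the example below, assuming they were allocated with room for numMatrices independent d x d problems; the per-task pointer offsets and workspace slicing are assumptions made for the sketch.

// --- Sketch: one gesvd call per small d x d matrix, each on its own
//     stream, so that independent decompositions may overlap
cudaStream_t *streams = (cudaStream_t *)malloc(numMatrices * sizeof(cudaStream_t));
for (int k = 0; k < numMatrices; k++) {
    gpuErrchk(cudaStreamCreate(&streams[k]));
    // --- Subsequent calls on solver_handle are issued on stream k
    cusolveSafeCall(cusolverDnSetStream(solver_handle, streams[k]));
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', d, d,
                    d_A + k * d * d, d,              // --- k-th input matrix
                    d_S + k * d,                     // --- k-th singular values
                    d_U + k * d * d, d,
                    d_V + k * d * d, d,
                    work + k * work_size, work_size, // --- per-task workspace slice
                    NULL, devInfo + k));
}
gpuErrchk(cudaDeviceSynchronize());

Whether the individual calls actually overlap depends on the device and on how much work each small SVD exposes.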

kernel.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"

/********/
/* MAIN */
/********/
int main(){

    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering

    const int Nrows = 7;
    const int Ncols = 5;

    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo;           gpuErrchk(cudaMalloc(&devInfo,          sizeof(int)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolverDnCreate(&solver_handle);

    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    for(int j = 0; j < Nrows; j++)
        for(int i = 0; i < Ncols; i++)
            h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;            gpuErrchk(cudaMalloc(&d_A,      Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    double *h_U = (double *)malloc(Nrows * Nrows     * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols     * sizeof(double));
    double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));

    // --- device side SVD workspace and matrices
    double *d_U;            gpuErrchk(cudaMalloc(&d_U,  Nrows * Nrows     * sizeof(double)));
    double *d_V;            gpuErrchk(cudaMalloc(&d_V,  Ncols * Ncols     * sizeof(double)));
    double *d_S;            gpuErrchk(cudaMalloc(&d_S,  min(Nrows, Ncols) * sizeof(double)));

    // --- CUDA SVD initialization
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA SVD execution
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0;  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout   << "Unsuccessful SVD execution\n\n";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows     * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols     * sizeof(double), cudaMemcpyDeviceToHost));

    std::cout << "Singular values\n";
    for(int i = 0; i < min(Nrows, Ncols); i++) 
        std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;

    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for(int j = 0; j < Nrows; j++) {
        printf("\n");
        for(int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
    }

    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    for(int i = 0; i < Ncols; i++) {
        printf("\n");
        for(int j = 0; j < Ncols; j++)
            printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
    }

    cusolverDnDestroy(solver_handle);

    return 0;

}

Utilities.cu

#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include <cuda.h>

#include <cusolverDn.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) { exit(code); }
   }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_SUCCESS";

        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";

        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";

        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";

        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";

        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";

        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";

        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    }

    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if(CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d, error %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset(); assert(0);
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

The answer above is now out of date as well. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD computation based on the Jacobi method. Below, a fully worked example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

//#define FULLSVD
//#define PRINTRESULTS

/********/
/* MAIN */
/********/
int main() {

    const int           M           = 3;
    const int           N           = 3;
    const int           lda         = M;
    //const int         numMatrices = 3;
    const int           numMatrices = 16384;

    TimingGPU timerGPU;

    // --- Setting the host matrix
    double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
    for (unsigned int k = 0; k < numMatrices; k++)
        for (unsigned int i = 0; i < M; i++){
            for (unsigned int j = 0; j < N; j++){
                h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
                //printf("%d %d %f\n", i, j, h_A[j*M + i]);
            }
        }

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;         gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    double *h_S = (double *)malloc(N *     numMatrices * sizeof(double));
    double *h_U = NULL;
    double *h_V = NULL;
#ifdef FULLSVD
            h_U = (double *)malloc(M * M * numMatrices * sizeof(double));
            h_V = (double *)malloc(N * N * numMatrices * sizeof(double));
#endif

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo;        gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    double *d_S;         gpuErrchk(cudaMalloc(&d_S, N *     numMatrices * sizeof(double)));
    double *d_U = NULL;
    double *d_V = NULL;
#ifdef FULLSVD
                         gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(double)));
                         gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(double)));
#endif

    double *d_work = NULL; /* device workspace for gesvdj */
    int devInfo_h = 0; /* host copy of devInfo */

    // --- Parameters configuration of Jacobi-based SVD
    const double            tol             = 1.e-7;
    const int               maxSweeps       = 15;
          cusolverEigMode_t jobz;                                   // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
        jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
        jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif

    const int               econ            = 0;                            // --- econ = 1 for economy size 

    // --- Numerical result parameters of gesvdj 
    double                  residual        = 0;
    int                     executedSweeps  = 0;

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));

    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));

    // --- Query the SVD workspace 
    cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- min(M, N) singular values per matrix
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M,N) if econ = 1
        lda,                                        // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));

    gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));

    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnDgesvdjBatched(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- min(M, N) singular values per matrix
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,
        gesvdj_params,
        numMatrices));

    printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) *       N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * lda * N * numMatrices, cudaMemcpyDeviceToHost));
#endif

#ifdef PRINTRESULTS
    printf("SINGULAR VALUES \n");
    printf("_______________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int p = 0; p < N; p++)
            printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
        printf("\n");
    }
#ifdef FULLSVD
    printf("SINGULAR VECTORS U \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
            for (int p = 0; p < M; p++)
                printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
        printf("\n");
    }

    printf("SINGULAR VECTORS V \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
            for (int p = 0; p < N; p++)
                printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
        printf("\n");
    }
#endif
#endif

    if (0 == devInfo_h){
        printf("gesvdj converges \n");
    }
    else if (0 > devInfo_h){
        printf("%d-th parameter is wrong \n", -devInfo_h);
        exit(1);
    }
    else{
        printf("WARNING: devInfo_h = %d : gesvdj does not converge \n", devInfo_h);
    }

    // --- Free resources
    if (d_A) gpuErrchk(cudaFree(d_A));
    if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
    if (d_U) gpuErrchk(cudaFree(d_U));
    if (d_V) gpuErrchk(cudaFree(d_V));
#endif
    if (devInfo) gpuErrchk(cudaFree(devInfo));
    if (d_work) gpuErrchk(cudaFree(d_work));
    if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
    if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));

    gpuErrchk(cudaDeviceReset());

    return 0;
}
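Two remarks on the example above. First, residual and executedSweeps are declared but left unused: per the cuSOLVER documentation, cusolverDnXgesvdjGetResidual() and cusolverDnXgesvdjGetSweeps() report those quantities only for the non-batched gesvdj routine, not for gesvdjBatched. Second, the batched routine targets exactly the scenario of the question: again per the documentation, it supports only small matrices, with M, N <= 32.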