Algorithm CUDA中求解线性系统的QR分解_Algorithm_Cuda_Gpgpu_Gpu

Algorithm CUDA中求解线性系统的QR分解

algorithm cuda

Algorithm CUDA中求解线性系统的QR分解,algorithm,cuda,gpgpu,gpu,Algorithm,Cuda,Gpgpu,Gpu,我正在GPU上写一个图像恢复算法，详细内容见求解线性系统的QR分解法 Ax=b 工作如下 min||Ax-b|| ---> ||QRx-b|| ---> ||(Q^T)QRx-(Q^T)b|| ---> ||Rx-(Q^T)b|| 其中R是上三角矩阵。由此产生的上三角线性系统易于求解我想使用CULA工具来实现这个方法。CULA例程GEQRF计算QR分解。手册上说：退出时，数组对角线上及其上方的元素包含 min（M，N）-by-N上部梯形矩阵R（R为上部三角形

我正在GPU上写一个图像恢复算法，详细内容见

求解线性系统的QR分解法

Ax=b

工作如下

min||Ax-b|| ---> ||QRx-b||  ---> ||(Q^T)QRx-(Q^T)b|| ---> ||Rx-(Q^T)b||

其中 R 是上三角矩阵。由此产生的上三角线性系统易于求解

我想使用CULA工具来实现这个方法。CULA例程

GEQRF

计算QR分解。手册上说：

退出时，数组对角线上及其上方的元素包含

min（M，N）-by-N

上梯形矩阵 R（当 m>=n 时，R 为上三角矩阵）

；对角线下方的元素，带有数组

TAU

，将正交/酉矩阵 Q 表示为 min（m，n）个初等反射器的乘积

我不知道 Q 存储在哪里，而且算法对我来说太复杂了。你能给点建议吗？

谢谢

void GEQRF(int M,int N,T* A,int LDA, T* TAU, T* WORK,int LWORK,int &INFO)

在GEQRF之后，R存储在A的上三角部分。然后，可以使用xORGQR生成Q，A和TAU作为输入

更多说明：

自2015年2月起，CUDA 7.0（现为候选发布版本）提供了新的cuSOLVER库，其中包括计算矩阵QR分解的功能。它与cuBLAS库配合使用，能够按照《cuSOLVER用户指南》附录C中给出的指南求解线性系统

您必须遵循以下三个步骤：

1）

geqrf

：它计算矩阵 A 的QR分解，在 A 的上三角部分返回上三角矩阵 R，在 A 的下三角部分返回以Householder向量形式存储的矩阵 Q，而Householder向量的比例因子由 TAU 参数返回

2）

ormqr

：通过覆盖 C，返回分解 A=QR 中的矩阵 Q 与另一个矩阵 C 的乘积

3）

trsm

：它求解一个上三角线性系统

下面，我将提供这些例程使用的完整示例

#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"

#include<iostream>
#include<fstream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>

#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCK_SIZE 32

#define prec_save 10

/***************/
/* COPY KERNEL */
/***************/
// Copies the leading N x N block of d_in into d_out, one element per thread.
// Element (i, j) is read from d_in[j * M + i] and written to d_out[j * N + i],
// i.e. the source uses a stride of M between consecutive j and the destination
// a stride of N. Expects a 2D launch covering at least N x N threads; threads
// outside that range do nothing.
__global__ void copy_kernel(const double * __restrict d_in, double * __restrict d_out, const int M, const int N) {

    const int row = threadIdx.x + blockDim.x * blockIdx.x;
    const int col = threadIdx.y + blockDim.y * blockIdx.y;

    // Surplus threads of the last blocks have no element to copy.
    if (row >= N || col >= N) return;

    d_out[col * N + row] = d_in[col * M + row];
}

/****************************************************/
/* LOAD INDIVIDUAL REAL MATRIX FROM txt FILE TO CPU */
/****************************************************/
// --- Load individual real matrix from txt file
// Reads up to M whitespace-separated real numbers from the text file `filename`
// into h_out[0..M-1], converting each value from double to T.
//
// h_out    : host buffer with room for at least M elements of type T
// filename : path of the text file to read
// M        : number of values to load
//
// If the file cannot be opened, or holds fewer than M readable values, a
// diagnostic is written to std::cerr and the entries that could not be read are
// set to zero, so the caller never receives uninitialized data.
template <class T>
void loadCPUrealtxt(T * __restrict h_out, const char *filename, const int M) {

    std::ifstream infile(filename);
    if (!infile.is_open())
        std::cerr << "loadCPUrealtxt: cannot open " << filename << "\n";

    // Stop at the first failed extraction instead of storing a stale value.
    int count = 0;
    double temp = 0.;
    while (count < M && (infile >> temp)) {
        h_out[count] = (T)temp;
        ++count;
    }

    if (count < M) {
        std::cerr << "loadCPUrealtxt: read " << count << " of " << M
                  << " values from " << filename << "; remaining entries set to 0\n";
        for (int i = count; i < M; i++) h_out[i] = (T)0;
    }

    // infile is closed by its destructor.
}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
// Copies M elements of type T from the device buffer d_in to the host and writes
// them to the text file `filename`, one value per line, using prec_save
// significant digits.
//
// d_in     : device pointer to at least M elements
// filename : path of the output text file (overwritten)
// M        : number of elements to save
//
// The temporary host staging buffer is released before returning.
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));
    // malloc(0) may legitimately return NULL, so only treat NULL as a failure
    // when there is actually something to copy.
    if (h_in == NULL && M > 0) {
        std::cerr << "saveGPUrealtxt: host allocation failed, " << filename << " not written\n";
        return;
    }

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

    // The original leaked this buffer on every call.
    free(h_in);

}

/********/
/* MAIN */
/********/
// Demonstrates solving A x = b through a QR factorization on the GPU:
//   1) cusolverDnDgeqrf factorizes A in place (R in the upper triangle,
//      Householder vectors below the diagonal, scaling factors in d_TAU);
//   2) cusolverDnDormqr applies Q^T to the right-hand side stored in d_C;
//   3) cublasDtrsm solves the resulting upper triangular system.
// Intermediate and final results are written to text files; timings are printed.
// Returns 0 on completion, 1 if a host allocation fails.
int main(){

    // --- Extension of Appendix C.1 of cuSOLVER library User's Guide
    // --- See also http://www.netlib.org/lapack/lug/node40.html

    // --- ASSUMPTION Nrows >= Ncols
    const int Nrows = 500;
    const int Ncols = 500;

    TimingGPU timerGPU;
    double timingQR, timingSolve;

    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo;           gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    int devInfo_h = 0;

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    if (h_A == NULL) { fprintf(stderr, "Host allocation of h_A failed\n"); return 1; }
    loadCPUrealtxt(h_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testMatrix.txt", Nrows * Ncols);

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- Initializing the data matrix C (Of course, this step could be done by a kernel function directly on the device).
    // --- Notice that, in this case, only the first column of C contains actual data, the others being empty (zeroed). However, cuBLAS trsm
    //     has the capability of solving triangular linear systems with multiple right hand sides.
    double *h_C = (double *)calloc(Nrows * Nrows, sizeof(double));
    if (h_C == NULL) { fprintf(stderr, "Host allocation of h_C failed\n"); return 1; }
    loadCPUrealtxt(h_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testVector.txt", Nrows);

    double *d_C;            gpuErrchk(cudaMalloc(&d_C, Nrows * Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_C, h_C, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE QR DECOMPOSITION */
    /**********************************/
    timerGPU.StartCounter();

    // --- CUDA QR GEQRF preliminary operations
    double *d_TAU;      gpuErrchk(cudaMalloc((void**)&d_TAU, min(Nrows, Ncols) * sizeof(double)));
    cusolveSafeCall(cusolverDnDgeqrf_bufferSize(solver_handle, Nrows, Ncols, d_A, Nrows, &work_size));
    double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA GEQRF execution: The matrix R is overwritten in upper triangular part of A, including diagonal 
    //     elements. The matrix Q is not formed explicitly, instead, a sequence of householder vectors are
    //     stored in lower triangular part of A.
    cusolveSafeCall(cusolverDnDgeqrf(solver_handle, Nrows, Ncols, d_A, Nrows, d_TAU, work, work_size, devInfo));
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful gerf execution\n\n";

    timingQR = timerGPU.GetCounter();
    printf("Timing for QR calculation = %f [ms]\n", timingQR);

    /*****************************/
    /* SOLVING THE LINEAR SYSTEM */
    /*****************************/
    timerGPU.StartCounter();

    // --- CUDA ORMQR execution: Computes the multiplication Q^T * C and stores it in d_C
    // --- Only the first column of C carries data, so applying Q^T to the first Ncols columns is sufficient here.
    // --- NOTE(review): the geqrf workspace is reused for ormqr; newer toolkits provide cusolverDnDormqr_bufferSize
    //     to size it explicitly - confirm against the toolkit version in use.
    cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, Nrows, Ncols, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_C, Nrows, work, work_size, devInfo));
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful ormqr execution\n\n";

    // --- Reducing the linear system size
    double *d_R; gpuErrchk(cudaMalloc(&d_R, Ncols * Ncols * sizeof(double)));
    double *d_B; gpuErrchk(cudaMalloc(&d_B, Ncols * sizeof(double)));
    dim3 Grid(iDivUp(Ncols, BLOCK_SIZE), iDivUp(Ncols, BLOCK_SIZE));
    dim3 Block(BLOCK_SIZE, BLOCK_SIZE);
    copy_kernel << <Grid, Block >> >(d_A, d_R, Nrows, Ncols);
    // --- Kernel launches do not return a status: query it explicitly so a bad launch configuration is not missed.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaMemcpy(d_B, d_C, Ncols * sizeof(double), cudaMemcpyDeviceToDevice));

    // --- Solving an upper triangular linear system - compute x = R \ Q^T * B
    const double alpha = 1.;
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
        CUBLAS_DIAG_NON_UNIT, Ncols, 1, &alpha, d_R, Ncols, d_B, Ncols));

    timingSolve = timerGPU.GetCounter();
    printf("Timing for solution of the linear system = %f [ms]\n", timingSolve);
    printf("Overall timing = %f [ms]\n", timingQR + timingSolve);

    /************************/
    /* CHECKING THE RESULTS */
    /************************/
    // --- The upper triangular part of A contains the elements of R. Showing this.
    saveGPUrealtxt(d_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_R.txt", Nrows * Ncols);

    // --- The first Nrows elements of d_C contain the result of Q^T * C
    saveGPUrealtxt(d_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_QTC.txt", Nrows);

    // --- Initializing the output Q matrix (Of course, this step could be done by a kernel function directly on the device)
    double *h_Q = (double *)malloc(Nrows * Nrows * sizeof(double));
    if (h_Q == NULL) { fprintf(stderr, "Host allocation of h_Q failed\n"); return 1; }
    for (int j = 0; j < Nrows; j++)
        for (int i = 0; i < Nrows; i++)
            if (j == i) h_Q[j + i*Nrows] = 1.;
            else        h_Q[j + i*Nrows] = 0.;

    double *d_Q;            gpuErrchk(cudaMalloc(&d_Q, Nrows * Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_Q, h_Q, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));

    // --- Calculation of the Q matrix
    // --- d_Q is Nrows x Nrows, so the number of columns passed to ormqr must be Nrows (the original passed Ncols,
    //     which leaves the trailing columns of Q as identity columns whenever Nrows > Ncols).
    // --- NOTE(review): same workspace caveat as for the first ormqr call.
    cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, Nrows, Nrows, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_Q, Nrows, work, work_size, devInfo));
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful ormqr execution\n\n";

    // --- d_Q contains the elements of Q. Showing this.
    saveGPUrealtxt(d_Q, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_Q.txt", Nrows * Nrows);

    // --- At this point, d_C contains the elements of Q^T * C, where C is the data vector. Showing this.
    // --- According to the above, only the first column of d_C makes sense.
    //gpuErrchk(cudaMemcpy(h_C, d_C, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
    //printf("\n\n");
    //for (int j = 0; j < Nrows; j++)
    //  for (int i = 0; i < Nrows; i++)
    //      printf("C[%i, %i] = %f\n", j, i, h_C[j + i*Nrows]);

    // --- Check final result
    saveGPUrealtxt(d_B, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_B.txt", Ncols);

    /***********/
    /* CLEANUP */
    /***********/
    // --- Release every host/device buffer and both library handles (the original leaked all but the cuSOLVER handle).
    free(h_A);
    free(h_C);
    free(h_Q);

    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_C));
    gpuErrchk(cudaFree(d_TAU));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_R));
    gpuErrchk(cudaFree(d_B));
    gpuErrchk(cudaFree(d_Q));
    gpuErrchk(cudaFree(devInfo));

    cublasSafeCall(cublasDestroy(cublas_handle));
    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    return 0;
}

请根据需要对行进行注释/取消注释

定时

计时（毫秒）（在GTX960显卡上执行的测试，计算能力（cc）5.2）：

以下代码是Jackolanten对一般M-by-K输入RHS矩阵b的回答的略微扩展。基本上你需要复制R和中间b的上矩阵，这样矩阵就有了正确的步幅

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "cublas_test.h"
#include "Eigen/Dense"
#include "gpu_util.h"
//##############################################################################
// Writes `name` followed by " =" on its own line to std::cout, then streams
// `mat` with operator<<, ends the line and flushes the stream.
template<typename T>
void PrintEMatrix(const T &mat, const char *name) {
    std::cout << name << " =\n" << mat << std::endl;
}
//##############################################################################
// Copies element (i, j) for 0 <= i < subM and 0 <= j < N from d_in[j*M + i] to
// d_ou[j*subM + i], one element per thread. The source is therefore addressed
// with a stride of M between consecutive j and the destination with a stride of
// subM. Expects a 2D launch covering at least subM x N threads; threads outside
// that range do nothing.
template<typename T>
__global__
void Ker_CopyUpperSubmatrix(const T *__restrict d_in,
                                  T *__restrict d_ou,
                            const int M, const int N, const int subM) {
    const int row = blockIdx.x*blockDim.x + threadIdx.x;
    const int col = blockIdx.y*blockDim.y + threadIdx.y;
    if (row < subM && col < N) {
        const int src = col*M + row;
        const int dst = col*subM + row;
        d_ou[dst] = d_in[src];
    }
}
//##############################################################################
// Self-check of a QR-based solve of A x = b on the GPU for a random M x N matrix
// A and an N x K reference solution x_ref (b = A * x_ref is built on the host):
//   1) cusolverDnDgeqrf factorizes A in place;
//   2) cusolverDnDormqr overwrites b with Q^T * b;
//   3) the leading N x N block of the factorized A and the leading N rows of
//      Q^T * b are copied to compact buffers;
//   4) cublasDtrsm solves the upper triangular system.
// Prints A, b, x_ref, the computed x_sol and the l2 norm of their difference.
// Any CUDA / cuSOLVER / cuBLAS failure aborts the process. Returns 0 on success.
// The sizes below satisfy M >= N.
int TestQR() {
    typedef double T; // NOTE: don't change this. blas has different func name
    typedef Eigen::Matrix<T,Eigen::Dynamic,Eigen::Dynamic> MatrixXd;

    // define handles
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t cublasH = NULL;

    const int M = 3;
    const int N = 2;
    const int K = 5;
    const int numReflectors = min(M,N);

    MatrixXd A;
    A = MatrixXd::Random(M,N);
    MatrixXd x_ref, x_sol;
    x_sol.resize(N,K);
    x_ref = MatrixXd::Random(N,K);
    MatrixXd b = A*x_ref;

    PrintEMatrix(A, "A");
    PrintEMatrix(b, "b");
    PrintEMatrix(x_ref, "x_ref");

    // The checked expression is evaluated unconditionally. The original wrapped
    // it in assert(), which removes the library call itself when NDEBUG is set.
#undef CUSOLVER_ERRCHK
#define CUSOLVER_ERRCHK(x) \
    do { \
        const cusolverStatus_t cusolver_status_ = (x); \
        if (cusolver_status_ != CUSOLVER_STATUS_SUCCESS) { \
            fprintf(stderr, "cusolver failed (status %d) at %s:%d\n", \
                    (int)cusolver_status_, __FILE__, __LINE__); \
            abort(); \
        } \
    } while (0)
#undef CUBLAS_ERRCHK
#define CUBLAS_ERRCHK(x) \
    do { \
        const cublasStatus_t cublas_status_ = (x); \
        if (cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "cublas failed (status %d) at %s:%d\n", \
                    (int)cublas_status_, __FILE__, __LINE__); \
            abort(); \
        } \
    } while (0)

    CUSOLVER_ERRCHK(cusolverDnCreate(&cusolverH));
    CUBLAS_ERRCHK(cublasCreate(&cublasH));

    T *d_A, *d_b, *d_work, *d_work2, *d_tau;
    int *d_devInfo, devInfo;
    gpuErrchk(cudaMalloc((void**)&d_A, sizeof(T)*M*N));
    gpuErrchk(cudaMalloc((void**)&d_b, sizeof(T)*M*K));
    gpuErrchk(cudaMalloc((void**)&d_tau, sizeof(T)*M));
    gpuErrchk(cudaMalloc((void**)&d_devInfo, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_A, A.data(), sizeof(T)*M*N, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b.data(), sizeof(T)*M*K, cudaMemcpyHostToDevice));
    int bufSize,bufSize2;

    // in-place A = QR
    CUSOLVER_ERRCHK(
        cusolverDnDgeqrf_bufferSize(cusolverH, M, N, d_A, M, &bufSize));
    gpuErrchk(cudaMalloc((void**)&d_work, sizeof(T)*bufSize));
    CUSOLVER_ERRCHK(
        cusolverDnDgeqrf(cusolverH, M, N, d_A, M, d_tau,
                         d_work, bufSize, d_devInfo));
    gpuErrchk(cudaMemcpy(&devInfo, d_devInfo, sizeof(int),
        cudaMemcpyDeviceToHost));
    if (devInfo != 0) {
        fprintf(stderr, "QR factorization failed (devInfo = %d)\n", devInfo);
        abort();
    }

    // Q^T*b
    CUSOLVER_ERRCHK(
        cusolverDnDormqr_bufferSize(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T,
                                    M, K, numReflectors, d_A, M, d_tau,
                                    d_b, M, &bufSize2));
    gpuErrchk(cudaMalloc((void**)&d_work2, sizeof(T)*bufSize2));
    CUSOLVER_ERRCHK(
        cusolverDnDormqr(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T,
                         M, K, numReflectors, d_A, M, d_tau,
                         d_b, M, d_work2, bufSize2, d_devInfo));
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(&devInfo, d_devInfo, sizeof(int),
        cudaMemcpyDeviceToHost));
    if (devInfo != 0) {
        fprintf(stderr, "Q^T b failed (devInfo = %d)\n", devInfo);
        abort();
    }

    // need to explicitly copy submatrix for the triangular solve
    T *d_R, *d_b_;
    gpuErrchk(cudaMalloc((void**)&d_R, sizeof(T)*N*N));
    gpuErrchk(cudaMalloc((void**)&d_b_,sizeof(T)*N*K));
    dim3 thd_size(32,32);
    dim3 blk_size((N+thd_size.x-1)/thd_size.x,(N+thd_size.y-1)/thd_size.y);
    Ker_CopyUpperSubmatrix<T><<<blk_size,thd_size>>>(d_A, d_R, M, N, N);
    // kernel launches return no status; query it explicitly
    gpuErrchk(cudaGetLastError());
    blk_size = dim3((N+thd_size.x-1)/thd_size.x,(K+thd_size.y-1)/thd_size.y);
    Ker_CopyUpperSubmatrix<T><<<blk_size,thd_size>>>(d_b, d_b_, M, K, N);
    gpuErrchk(cudaGetLastError());

    // solve x = R \ (Q^T*B)
    const double one = 1.0;
    CUBLAS_ERRCHK(
        cublasDtrsm(cublasH, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                    CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                    N, K, &one, d_R, N, d_b_, N));
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(x_sol.data(), d_b_, sizeof(T)*N*K,
        cudaMemcpyDeviceToHost));

    PrintEMatrix(x_ref, "x_ref");
    PrintEMatrix(x_sol, "x_sol");
    std::cout << "solution l2 error = " << (x_ref-x_sol).norm()
              << std::endl;

    // Release device memory and library handles. The original called exit(0)
    // here, which left the return statement unreachable and skipped cleanup.
    gpuErrchk(cudaFree(d_R));
    gpuErrchk(cudaFree(d_b_));
    gpuErrchk(cudaFree(d_work2));
    gpuErrchk(cudaFree(d_work));
    gpuErrchk(cudaFree(d_devInfo));
    gpuErrchk(cudaFree(d_tau));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_A));
    CUBLAS_ERRCHK(cublasDestroy(cublasH));
    CUSOLVER_ERRCHK(cusolverDnDestroy(cusolverH));

    return 0;
}
//##############################################################################

#包括
#包括
#包括
#包括
#包括“cuda_runtime.h”
#包括“cublas_v2.h”
#包括“cusolverDn.h”
#包括“cublas_test.h”
#包括“本征/密集”
#包括“gpu_util.h”
//##############################################################################
模板
无效打印矩阵（常量T&mat，常量字符*名称）{
std:：cout Thank现在我知道如何获得Q，使用更新的QR方法可以提供一些建议吗？culaDeviceSgels可以获得Q？那里没有TAU输入？是的，如果你愿意，伪代码是不够的，我真的不明白你打算做什么，因为你想解线性系统（Ax=b）使用QR分解，所以你的输入就是你的b和矩阵A，或者如果你想独立于问题计算Q和R，在这种情况下，你只需要矩阵A作为输入，这就是矩阵A的分解。我正在尝试解决一个称为OMP的图像恢复算法，这意味着Ax=b将被计算至少500在每次计算中，b保持不变，A将添加一列，因此我想使用上一个Q的结果，计算更新列的系数以节省运行时间。2019年，实用程序。cuh
已从您引用的项目中删除。您可以更新您的解决方案吗？（这就是为什么引用可能发生变化的外部代码仓库可能是危险的）。“TimingGPU.cuh”似乎也不见了！double *h_C = (double *)calloc(Nrows * Nrows, sizeof(double))
-Nrows*Nrows

是错误还是实际需要？我希望您的解决方案能够适应我的问题，因为我的问题矩阵（6列，但有数百万行）。这会杀了我。报告的只是一个例子，说明

可以有多个列，但我认为

cusolverDnDormqr

也可以用于单列矩阵

。您可以在此处找到正在搜索的文件。

Size         QR decomposition       Solving system       Overall
100x100      0.89                   1.41                 2.30
200x200      5.97                   3.23                 9.20
500x500      17.08                  21.6                 38.7

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "cublas_test.h"
#include "Eigen/Dense"
#include "gpu_util.h"
//##############################################################################
// Writes `name` followed by " =" on its own line to std::cout, then streams
// `mat` with operator<<, ends the line and flushes the stream.
template<typename T>
void PrintEMatrix(const T &mat, const char *name) {
    std::cout << name << " =\n" << mat << std::endl;
}
//##############################################################################
// Copies element (i, j) for 0 <= i < subM and 0 <= j < N from d_in[j*M + i] to
// d_ou[j*subM + i], one element per thread. The source is therefore addressed
// with a stride of M between consecutive j and the destination with a stride of
// subM. Expects a 2D launch covering at least subM x N threads; threads outside
// that range do nothing.
template<typename T>
__global__
void Ker_CopyUpperSubmatrix(const T *__restrict d_in,
                                  T *__restrict d_ou,
                            const int M, const int N, const int subM) {
    const int row = blockIdx.x*blockDim.x + threadIdx.x;
    const int col = blockIdx.y*blockDim.y + threadIdx.y;
    if (row < subM && col < N) {
        const int src = col*M + row;
        const int dst = col*subM + row;
        d_ou[dst] = d_in[src];
    }
}
//##############################################################################
// Self-check of a QR-based solve of A x = b on the GPU for a random M x N matrix
// A and an N x K reference solution x_ref (b = A * x_ref is built on the host):
//   1) cusolverDnDgeqrf factorizes A in place;
//   2) cusolverDnDormqr overwrites b with Q^T * b;
//   3) the leading N x N block of the factorized A and the leading N rows of
//      Q^T * b are copied to compact buffers;
//   4) cublasDtrsm solves the upper triangular system.
// Prints A, b, x_ref, the computed x_sol and the l2 norm of their difference.
// Any CUDA / cuSOLVER / cuBLAS failure aborts the process. Returns 0 on success.
// The sizes below satisfy M >= N.
int TestQR() {
    typedef double T; // NOTE: don't change this. blas has different func name
    typedef Eigen::Matrix<T,Eigen::Dynamic,Eigen::Dynamic> MatrixXd;

    // define handles
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t cublasH = NULL;

    const int M = 3;
    const int N = 2;
    const int K = 5;
    const int numReflectors = min(M,N);

    MatrixXd A;
    A = MatrixXd::Random(M,N);
    MatrixXd x_ref, x_sol;
    x_sol.resize(N,K);
    x_ref = MatrixXd::Random(N,K);
    MatrixXd b = A*x_ref;

    PrintEMatrix(A, "A");
    PrintEMatrix(b, "b");
    PrintEMatrix(x_ref, "x_ref");

    // The checked expression is evaluated unconditionally. The original wrapped
    // it in assert(), which removes the library call itself when NDEBUG is set.
#undef CUSOLVER_ERRCHK
#define CUSOLVER_ERRCHK(x) \
    do { \
        const cusolverStatus_t cusolver_status_ = (x); \
        if (cusolver_status_ != CUSOLVER_STATUS_SUCCESS) { \
            fprintf(stderr, "cusolver failed (status %d) at %s:%d\n", \
                    (int)cusolver_status_, __FILE__, __LINE__); \
            abort(); \
        } \
    } while (0)
#undef CUBLAS_ERRCHK
#define CUBLAS_ERRCHK(x) \
    do { \
        const cublasStatus_t cublas_status_ = (x); \
        if (cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "cublas failed (status %d) at %s:%d\n", \
                    (int)cublas_status_, __FILE__, __LINE__); \
            abort(); \
        } \
    } while (0)

    CUSOLVER_ERRCHK(cusolverDnCreate(&cusolverH));
    CUBLAS_ERRCHK(cublasCreate(&cublasH));

    T *d_A, *d_b, *d_work, *d_work2, *d_tau;
    int *d_devInfo, devInfo;
    gpuErrchk(cudaMalloc((void**)&d_A, sizeof(T)*M*N));
    gpuErrchk(cudaMalloc((void**)&d_b, sizeof(T)*M*K));
    gpuErrchk(cudaMalloc((void**)&d_tau, sizeof(T)*M));
    gpuErrchk(cudaMalloc((void**)&d_devInfo, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_A, A.data(), sizeof(T)*M*N, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b.data(), sizeof(T)*M*K, cudaMemcpyHostToDevice));
    int bufSize,bufSize2;

    // in-place A = QR
    CUSOLVER_ERRCHK(
        cusolverDnDgeqrf_bufferSize(cusolverH, M, N, d_A, M, &bufSize));
    gpuErrchk(cudaMalloc((void**)&d_work, sizeof(T)*bufSize));
    CUSOLVER_ERRCHK(
        cusolverDnDgeqrf(cusolverH, M, N, d_A, M, d_tau,
                         d_work, bufSize, d_devInfo));
    gpuErrchk(cudaMemcpy(&devInfo, d_devInfo, sizeof(int),
        cudaMemcpyDeviceToHost));
    if (devInfo != 0) {
        fprintf(stderr, "QR factorization failed (devInfo = %d)\n", devInfo);
        abort();
    }

    // Q^T*b
    CUSOLVER_ERRCHK(
        cusolverDnDormqr_bufferSize(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T,
                                    M, K, numReflectors, d_A, M, d_tau,
                                    d_b, M, &bufSize2));
    gpuErrchk(cudaMalloc((void**)&d_work2, sizeof(T)*bufSize2));
    CUSOLVER_ERRCHK(
        cusolverDnDormqr(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T,
                         M, K, numReflectors, d_A, M, d_tau,
                         d_b, M, d_work2, bufSize2, d_devInfo));
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(&devInfo, d_devInfo, sizeof(int),
        cudaMemcpyDeviceToHost));
    if (devInfo != 0) {
        fprintf(stderr, "Q^T b failed (devInfo = %d)\n", devInfo);
        abort();
    }

    // need to explicitly copy submatrix for the triangular solve
    T *d_R, *d_b_;
    gpuErrchk(cudaMalloc((void**)&d_R, sizeof(T)*N*N));
    gpuErrchk(cudaMalloc((void**)&d_b_,sizeof(T)*N*K));
    dim3 thd_size(32,32);
    dim3 blk_size((N+thd_size.x-1)/thd_size.x,(N+thd_size.y-1)/thd_size.y);
    Ker_CopyUpperSubmatrix<T><<<blk_size,thd_size>>>(d_A, d_R, M, N, N);
    // kernel launches return no status; query it explicitly
    gpuErrchk(cudaGetLastError());
    blk_size = dim3((N+thd_size.x-1)/thd_size.x,(K+thd_size.y-1)/thd_size.y);
    Ker_CopyUpperSubmatrix<T><<<blk_size,thd_size>>>(d_b, d_b_, M, K, N);
    gpuErrchk(cudaGetLastError());

    // solve x = R \ (Q^T*B)
    const double one = 1.0;
    CUBLAS_ERRCHK(
        cublasDtrsm(cublasH, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                    CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                    N, K, &one, d_R, N, d_b_, N));
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(x_sol.data(), d_b_, sizeof(T)*N*K,
        cudaMemcpyDeviceToHost));

    PrintEMatrix(x_ref, "x_ref");
    PrintEMatrix(x_sol, "x_sol");
    std::cout << "solution l2 error = " << (x_ref-x_sol).norm()
              << std::endl;

    // Release device memory and library handles. The original called exit(0)
    // here, which left the return statement unreachable and skipped cleanup.
    gpuErrchk(cudaFree(d_R));
    gpuErrchk(cudaFree(d_b_));
    gpuErrchk(cudaFree(d_work2));
    gpuErrchk(cudaFree(d_work));
    gpuErrchk(cudaFree(d_devInfo));
    gpuErrchk(cudaFree(d_tau));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_A));
    CUBLAS_ERRCHK(cublasDestroy(cublasH));
    CUSOLVER_ERRCHK(cusolverDnDestroy(cusolverH));

    return 0;
}
//##############################################################################