如何在C和CUDA中用改进的Gram-Schmidt方法进行QR分解

如何在C和CUDA中用改进的Gram-Schmidt方法进行QR分解,c,cuda,qr-decomposition,C,Cuda,Qr Decomposition,任何人都知道如何在C和CUDA中通过改进的Gram-Schmidt方法进行QR分解。一些例子/来源/论文或其他?非常感谢 编辑:我无法回答我的问题,因为有人关闭了它,所以我决定更新我的问题 /* * QR decomposition via modified Gram-Schmidt algorithm * * @Package = QR-decomposition * @Program = QR_gpu * @Version = 13.0928 */ // Libraries

任何人都知道如何在C和CUDA中通过改进的Gram-Schmidt方法进行QR分解。一些例子/来源/论文或其他?非常感谢

编辑:我无法回答我的问题,因为有人关闭了它,所以我决定更新我的问题

/*
 * QR decomposition via modified Gram-Schmidt algorithm
 * 
 * @Package = QR-decomposition
 * @Program = QR_gpu
 * @Version = 13.0928
 */

// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <cuda.h>

// Constant
#define PROG "QR_cpu"
#define VERSION "13.1003"
#define PACKAGE "QR-Decomposition"

// Threads per block
#define THREAD_P_BLOCK 512
// Blocks per grid
#define BLOCKS_P_GRID  512

// macro
/* wrap each API call with the gpuErrchk macro, which will process the return 
status of the API call it wraps: http://bit.ly/1dTD0ZE */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prototypes
__global__ void xTA (float *, float *, int, int, int, int, int);
__global__ void scale (float *, int, int, float *);
__global__ void r1_update(float *, float *, int, int, int, int);
__host__ void gpuAssert(cudaError_t, char *, int);
__host__ void print_matrix(float *, int, int, int);
__host__ void print_help (int);
__host__ int option_parser(int, char **, int *, int *);

// Host code
int main (int argc, char **argv) {

    int m, n, lda, ldr, i;
    float *A_d, *R_d;
    //cudaEvent_t t_start, t_stop;
    // Get "m" and "n" from command line
    if (0 != option_parser(argc, argv, &m, &n)) {
        fprintf(stderr, "Can\'t continue, exiting now!\n"); 
        exit(EXIT_FAILURE);
    }
    size_t A_size = m * n * sizeof(float);
    size_t R_size = n * n * sizeof(float);

    lda = n; ldr = n;

    // Allocate input matrices A_h and R_h in host memory
    float *A_h = (float *) malloc(A_size);
    float *R_h = (float *) malloc(R_size);
    memset(R_h, 0, R_size);

    // Initialize input matrix
    for (i = 0; i < n; i++)
        A_h[i*lda + i] = i + 1;

    // Allocate matrices in device memory
    gpuErrchk (cudaMalloc(&A_d, A_size));
    gpuErrchk (cudaMalloc(&R_d, R_size));
    // Copy the A matrix from host memory to device memory
    gpuErrchk (cudaMemcpy(A_d, A_h, A_size, cudaMemcpyHostToDevice));
    // Set R matrix to 0 
    gpuErrchk (cudaMemset(R_d, 0, R_size));

    /**** Invoke kernel ****/
    dim3 dimBlock (THREAD_P_BLOCK, 1, 1);
    // dimGrid 'monodimensional' (just x value)
    dim3 dimGrid_M ((m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK, 1, 1);
    // dimGrid 'bidimensional' (x and y values)
    dim3 dimGrid_B (BLOCKS_P_GRID, (m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK,1);
    // Gram-Schmidt algorithm step by step
    for (i = 0; i < n; i++) {
        // Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
        xTA <<< dimBlock, dimGrid_B >>> (R_d, A_d, m, n, lda, ldr, i);
        // Step #3 (Is the scale of a column vector)
        scale <<< dimBlock, dimGrid_M >>> (A_d + i, m, lda, R_d + i*ldr + i);
        // Step #4 (Is the scale of a row)
        scale <<< dimBlock, dimGrid_M >>> (R_d + ldr*i, m, 1, R_d + i*ldr + i);
        // Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
        r1_update <<< dimBlock, dimGrid_B >>> (A_d, R_d + i*lda, m, n, lda, i);
    }
    // Copy the results from device memory to host memory
    gpuErrchk (cudaMemcpy(A_h, A_d, A_size, cudaMemcpyDeviceToHost));
    // Free device memory
    cudaFree(A_d); cudaFree(R_d);
    // Free host memory
    free(A_h); free(R_h);

    return 0;
}

/**
 * ## Kernel 1
 * 
 * Rank 1 update of columns of A
 */
__global__ void r1_update (float *A, float *R, int m, int n, int lda, int k) {
    // get x,y cordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.x;

    if (x < m && y < n-k-1)
        A[x*lda + y + k + 1] -= A[x*lda + k] * R[y + k + 1];
}

/**
 * ## Kernel 2
 * 
 * matrix vector product
 * Performs R[i] =  x'A where x' is a row of A
 * 
 * How leading dimension is used for matrices: http://ibm.co/19PLtIX
 */
__global__ void xTA (float *R, float *A, int m, int n, int lda, int ldr, int k){
    // block column * block dim + column (computed by each thread)
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j;
    // upper triangular matrix 
    if (i < n - k) {
        for (j = 0; j < m; j++)
            R[k*ldr + k + i] += A[k*lda + j] * A[j*lda + k + i];
    }
}

/**
 * ## Kernel 3
 * 
 * mult. for constant s
 * d vector
 * ld leading dimension (distance from elements)
 */
__global__ void scale (float *d, int m, int ld, float *R_x) {
    // block colum * block dim + column (computed by each thread)
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    // s = sqrt(R(i,i))
    // Static initialization of shared variables is illegal in CUDA. 
    // The problem is that the semantics of how every thread should treat 
    // static initialization of shared memory is undefined in the 
    // programming model. Which thread should do the write? What happens if 
    // the value is not uniform between threads? How should the compiler 
    // emit code for such a case and how should the hardware run it?
    __shared__ float s; s = sqrt(*R_x);
    // and scale
    if (i < m) d[i*ld] /= s;
}

/*
 * GPU Error Handler (CUDA_SAFE_CALL deprecated from CUDA 5.0)
 */
__host__ void gpuAssert(cudaError_t code, char *file, int line) {

    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", 
            cudaGetErrorString(code), file, line);
        exit(code);
    }
}

/* 
 * Print matrix
 * 
 * Print a matrix passed as argument
 */
__host__ void print_matrix (float * matrix, int m, int n, int ld) {

    int i, j;

    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++)
            printf("%0.5f ", matrix[i*ld + j]);
        printf("\n");
    }
}

/*
 * The option parser
 *
 * The function parses the parameters passed from the command line and run 
 * their own procedures.
 *
 * Return value:
 *   0 on success
 *  -1 on failure
 *
 * Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
 * for further informations. (thanks to Frodo Looijaard)
 */
__host__ int option_parser (int argc, char **argv, int * m, int * n) {

    int opt;

    if (argc < 2) {
        fprintf(stderr, "The program needs arguments...\n\n");
        print_help(1);
    }

    opterr = 0;

    while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
        switch (opt) {
            case 'h': 
                print_help(0);
            case 'r': 
                printf("optarg: %s\n", optarg); 
                if ((*m = atoi(optarg)) < 2) return -1; 
                break;
            case 'c': 
                printf("optarg: %s\n", optarg);
                if ((*n = atoi(optarg)) < 2 || *n > *m) return -1; 
                break;
            case '?':
                if (optopt == 'r' || optopt == 'c')
                    fprintf(stderr,"Option -%c requires an argument.\n",optopt);
                else if (isprint (optopt))
                    fprintf(stderr,"Unknown option `-%c'.\n", optopt);
                else
                    fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
                return -1;
            default:
                fprintf(stderr, "default switch-case statement reached\n");
                return -1;
        }
        //for (ii = optind; ii < argc; ii++)
        //  printf ("non-option argument %s\n", argv[ii]);
    }
    return 0;
}

/*
 * The helper
 *
 * Show the info to run the program in the correct way
 */
__host__ void print_help (int exit_code) {

    printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);

    printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
    printf("  -h    print this help and exit\n");
    printf("  -r    provide the number of rows\n");
    printf("  -c    provide the number of colums\n\n");

    printf("  Example: ./qr_gpu -r 800 -c 600\n\n");

    exit_code == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}
/*
*基于改进Gram-Schmidt算法的QR分解
* 
*@Package=QR分解
*@Program=QR\gpu
*@Version=13.0928
*/
//图书馆
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//不变的
#定义程序“QR\U cpu”
#定义版本“13.1003”
#定义包“QR分解”
//每个块的线程数
#定义线程P_块512
//每格块数
#定义块和网格512
//宏
/*使用gpuerchk宏包装每个API调用,该宏将处理返回
它所包装的API调用的状态:http://bit.ly/1dTD0ZE */
#定义gpuerchk(ans){gpuAssert((ans),_文件_,_行__)}
//原型
__全局无效xTA(浮点*,浮点*,int,int,int,int,int,int,int);
__全局无效标度(浮点*,整数,整数,浮点*);
__全局无效r1更新(浮点*,浮点*,整数,整数,整数,整数,整数);
__主机无效gpuAssert(cudaError\u t,char*,int);
__主机无效打印矩阵(浮点*,整数,整数,整数);
__主机\无效打印\帮助(int);
__主机\ int选项\解析器(int,char**,int*,int*);
//主机代码
int main(int argc,字符**argv){
int m,n,lda,ldr,i;
浮动*A\u d,*R\u d;
//cudaEvent\t\u启动、停止;
//从命令行获取“m”和“n”
if(0!=选项_解析器(argc、argv、m和n)){
fprintf(stderr,“无法继续,正在退出!\n”);
退出(退出失败);
}
尺寸=m*n*sizeof(浮动);
大小\u t R\u size=n*n*sizeof(浮动);
lda=n;ldr=n;
//在主机内存中分配输入矩阵A_h和R_h
浮动*A_h=(浮动*)malloc(A_大小);
浮动*R\u h=(浮动*)malloc(R\u大小);
memset(R_-h,0,R_-size);
//初始化输入矩阵
对于(i=0;iR(i,i:n-1)=A'(:,i)*A(:,i:n-1)
xTA>(R_d,A_d,m,n,lda,ldr,i);
//步骤#3(是列向量的比例)
比例>(A_d+i,m,lda,R_d+i*ldr+i);
//步骤4(是一行的比例)
比例>(R\u d+ldr*i,m,1,R\u d+i*ldr+i);
//步骤5-->A(:,i+1:n−1) =A(:,i+1:n−1) − A(:,i)∗ R(i,i+1:n)−1)
r1\u更新>(A\u d,R\u d+i*lda,m,n,lda,i);
}
//将结果从设备内存复制到主机内存
gpuErrchk(cudaMemcpy(A_h,A_d,A_size,cudaMemcpyDeviceToHost));
//可用设备内存
cudaFree(A_d);cudaFree(R_d);
//空闲主机内存
自由(A_h);自由(R_h);
返回0;
}
/**
*##内核1
* 
*第1列的列更新
*/
__全局无效r1更新(浮点*A,浮点*R,整数m,整数n,整数lda,整数k){
//得到x,y-科迪酸盐
无符号整数x=blockIdx.x*blockDim.x+threadIdx.x;
无符号整数y=blockIdx.y*blockDim.y+threadIdx.x;
if(x/*
 * QR decomposition via modified Gram-Schmidt algorithm
 */

// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>

// Constant
#define PROG "QR_cpu"
#define VERSION "28.0913"
#define PACKAGE "QR-Decomposition"

// Prototypes
void r1_update(double *, double *, int);
void xTA (double *, double *, int);
void scale (double *, int, double);
void gram (double *, double *);
void print_matrix(double *);
int option_parser(int, char **);
void print_help (int);

// Number of rows
int M = -1; 
// Number of columns
int N = -1;
// Leading Dimension of A and R
int LDA = -1;
int LDR = -1;
// The clock
clock_t start, stop;

/*
 *
 */
int main (int argc, char **argv) {

    int i;

    // Get M, N from command line
    if (0 != option_parser(argc, argv)) {
        fprintf(stderr, "Bad option man!!!\n");
        exit(EXIT_FAILURE);
    }
    // Check the size of M and N 
    if (M > 5000 || N > 5000) {
        fprintf(stderr, "Too big man!!!\n");
        exit(EXIT_FAILURE);
    }

    // Set the leading dimension of A and R
    LDA = N; LDR = N;

    // Allocate memory for A and R 
    double *A = calloc(M * N, sizeof(*A));
    double *R = calloc(N * N, sizeof(*R));

    // Set the diagonal of A as A(i,i) = i + 1 with i from 0 to N−1
    for (i = 0; i < N; i++) 
        A[i*LDA + i] = i + 1;

    start = clock();
    gram(A, R);
    stop = clock();

    printf("\nTime: %0.4f\n\n",(stop - start) / (double)(CLOCKS_PER_SEC));
    // print_matrix(A);
    // print_matrix(R);

    free(A); free(R);

    return 0;
}

/**
 * Rank 1 update of columns of A
 */
void r1_update (double *A, double *R, int k) {

    int i, j;

    for(i = 0; i < M; i++)
        for(j = k + 1; j < N; j++)
            A[i*LDA + j] -= A[i*LDA + k] * R[j];
}

/**
 * Matrix vector product
 * Performs R[i] =  x'A where x' is a row of A
 * A : m x k, leading dimebsion, lda
 * 
 * How leading dimension is used for matrices: http://ibm.co/19PLtIX
 */
void xTA (double *R, double *A, int k) {

    int i, j;
    // upper triangular matrix 
    for (i = 0; i < N-k; i++)
        for (j = 0; j < M; j++)
            R[k*LDR + k + i] += A[k*LDA + j] * A[j*LDA + k + i];
}

/**
 * Mult. for constant s
 * d vector
 * ld leading dimension (distance from elements)
 */
void scale (double *d, int ld, double s) {

    int i;

    for (i = 0; i < M; i++) d[i*ld] *= s;
}

/**
 * Performs Modified Gram Schmidt
 * ortogonalization of columns of A
 * A m x n
 * R n x n
 */ 
void gram (double *A, double *R) {

    int i;
    double s;
    // Modified Gram Schmidt algorithm step by step
    for (i = 0; i < N; i++) {
        // Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
        xTA(R, A, i);
        // Step #2 (Normalizing) --> s = sqrt(R(i,i))
        s = 1 / sqrt(R[i*LDR + i]);
        // Step #3 (Is the scale of a column vector)
        scale(A + i, LDA, s);
        // Step #4 (Is the scale of a row)
        scale(R + LDR*i, 1, s);
        // Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
        r1_update(A, R + i*LDA, i);
    }
}

/* 
 * Print Matrix
 * 
 * Print a matrix passed as argument
 */
void print_matrix (double * matrix) {

    int i, j;

    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++)
            printf("%0.2f ", matrix[i*LDA + j]);
        printf("\n");
    }
}

/*
 * The option parser
 *
 * The function parses the parameters passed from the command line and run 
 * their own procedures.
 *
 * Return value:
 *   0 on success
 *  -1 on failure
 *
 * Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
 * for further informations. (thanks to Frodo Looijaard)
 */
int option_parser (int argc, char **argv) {

    int opt;

    if (argc < 2) {
        fprintf(stderr, "This program needs arguments...\n\n");
        print_help(1);
    }

    opterr = 0;

    while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
        switch (opt) {
            case 'h': 
                print_help(0);
            case 'r': 
                printf("optarg: %s\n", optarg); 
                if ((M = atoi(optarg)) < 2) return -1; 
                break;
            case 'c': 
                printf("optarg: %s\n", optarg);
                if ((N = atoi(optarg)) < 2 || N > M) return -1; 
                break;
            case '?':
                if (optopt == 'r' || optopt == 'c')
                    fprintf(stderr,"Option -%c requires an argument.\n",optopt);
                else if (isprint (optopt))
                    fprintf(stderr,"Unknown option `-%c'.\n", optopt);
                else
                    fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
                return -1;
            default:
                fprintf(stderr, "default switch-case statement reached\n");
                return -1;
        }
        //for (ii = optind; ii < argc; ii++)
        //  printf ("Non-option argument %s\n", argv[ii]);
    }
    return 0;
}

/*
 * The helper
 *
 * Shows the info to run the program in the correct way
 */
void print_help (int exit_val) {

    printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);

    printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
    printf("  -h    print this help and exit\n");
    printf("  -r    provide the number of rows\n");
    printf("  -c    provide the number of colums\n\n");

    printf("  Example: ./qr_cpu -r 800 -c 600\n\n");

    exit_val == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}