用CUDA进行行列式计算_Cuda_Gpu_Linear Algebra_Gpgpu_Nvidia

用CUDA进行行列式计算

cuda

用CUDA进行行列式计算,cuda,gpu,linear-algebra,gpgpu,nvidia,Cuda,Gpu,Linear Algebra,Gpgpu,Nvidia,是否有任何库或免费提供的代码可以完全在GPU上计算小型（6x6）双精度矩阵的行列式？您可以使用OpenCL或CUDA作为库并编写一个短程序（OpenCL内核）在GPU上计算行列式库达 OpenCL 这篇论文应该包含CUDA的伪代码。这是一个计划，你需要缓冲100个这些小矩阵，然后启动一次内核，一次计算所有矩阵的行列式我不打算编写实际的代码，但这应该会有所帮助 1）启动#块=#矩阵。每个块计算每个矩阵的行列式 2） det（A）=det（A11*A22-A21*A12）；其中A为6x6

是否有任何库或免费提供的代码可以完全在GPU上计算小型（

6x6

）双精度矩阵的行列式？

您可以使用OpenCL或CUDA作为库并编写一个短程序（OpenCL内核）在GPU上计算行列式

CUDA

OpenCL

这篇论文应该包含CUDA的伪代码。这是一个计划，你需要缓冲100个这些小矩阵，然后启动一次内核，一次计算所有矩阵的行列式

我不打算编写实际的代码，但这应该会有所帮助

1）启动#块=#矩阵。每个块计算每个矩阵的行列式

2） det（A）=det（A11*A22-A21*A12）；其中A为6x6，A11、A12、A21、A22为A的3x3子矩阵

3）编写一个设备函数，对3x3矩阵进行矩阵乘法

4） 3x3矩阵的行列式易于计算，按第一行展开即可：对于矩阵 [a b c; d e f; g h i]，$\det = a(ei - fh) - b(di - fg) + c(dh - eg)$

编辑：显然（2）仅在A21*A12==A12*A21时有效

另一种选择是：

1）对每个6x6矩阵进行LU分解（高斯消元），得到上三角矩阵U

2）将U的对角线元素相乘得到行列式。

正如巴特在上述评论中已经指出的那样，使用GPU计算小矩阵（即使是其中的许多矩阵）的行列式并不能确保比其他计算平台更高的收益

我认为计算矩阵行列式的问题是一个有趣的问题，在应用中可能会出现多次。目前，我不知道有任何库提供使用CUDA进行行列式计算的例程（无论是

cuBLAS

还是

cuSOLVER

都没有这样的例程），因此您有两种可能性：

正如帕万所指出的，实施自己的方法

设法使用其他可用的例程

关于最后一点，一种可能性是使用Cholesky分解，然后将行列式计算为Cholesky因子对角线元素乘积的平方（该方法要求矩阵对称正定）。在Matlab中，它可以写成：

prod(diag(chol(A)))^2

下面，我提供了一个利用这个想法的代码。特别地，通过使用

cuSOLVER

的

potrf

函数来执行Cholesky分解，而Cholesky因子对角线元素的乘积则通过Thrust的跨步归约（strided reduction）来计算。

下面的代码适用于大型矩阵，因此对于需要计算大型矩阵行列式的人来说，它将非常有用。但是如何使它适应几个小矩阵呢？一种可能是使用

cuSOLVER

的流进行Cholesky分解，然后使用Thrust 1.8的新动态并行特性。请注意，截至CUDA 7.0，

cuSOLVER

不允许使用动态并行

代码如下：

#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"

#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<ostream>

#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>

#include <thrust/fill.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <thrust/copy.h>
#include <thrust/reduce.h>

/*************************/
/* STRIDED RANGE FUNCTOR */
/*************************/
// View over every `stride`-th element of the iterator range [first, last).
//
// begin() points at the element at offset 0 and successive positions map to
// offsets stride, 2 * stride, ...; end() lies ceil((last - first) / stride)
// positions past begin(). The view is built by composing three Thrust
// iterators: a counting iterator (0, 1, 2, ...), a transform iterator that
// scales each count by the stride, and a permutation iterator that uses the
// scaled counts as offsets into the underlying range.
template <typename Iterator>
class strided_range
{
public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    // Unary functor mapping a logical position k to the offset stride * k.
    struct stride_functor : public thrust::unary_function<difference_type, difference_type>
    {
        difference_type stride;

        stride_functor(difference_type step) : stride(step) {}

        __host__ __device__
        difference_type operator()(const difference_type& k) const
        {
            return stride * k;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator, TransformIterator>    PermutationIterator;

    // Iterator type exposed by the strided view.
    typedef PermutationIterator iterator;

    // Builds a strided view over [range_begin, range_end) with the given step.
    strided_range(Iterator range_begin, Iterator range_end, difference_type step)
        : first(range_begin), last(range_end), stride(step) {}

    // Iterator to the first strided element (offset 0 of the underlying range).
    iterator begin(void) const
    {
        const CountingIterator  positions(0);
        const TransformIterator offsets(positions, stride_functor(stride));
        return PermutationIterator(first, offsets);
    }

    // One-past-the-last strided element; the element count is rounded up so
    // that a trailing partial stride still contributes its first element.
    iterator end(void) const
    {
        const difference_type count = ((last - first) + (stride - 1)) / stride;
        return begin() + count;
    }

protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

// Computes the determinant of a small dense matrix on the GPU through its
// Cholesky factorization: if A = L * L^T, then det(A) = (prod_i L_ii)^2.
//
// Preconditions: the matrix must be symmetric positive definite, otherwise
// potrf fails and no determinant is reported. Because the matrix is symmetric,
// the row-major host array can be handed to cuSOLVER unchanged.
//
// Returns EXIT_SUCCESS when the determinant was computed and printed, and
// EXIT_FAILURE when the factorization did not succeed. gpuErrchk and
// cusolveSafeCall come from "Utilities.cuh"; presumably they report and abort
// on a failing status -- confirm against that header.
int main(void)
{
    const int Nrows = 5;
    const int Ncols = 5;

    // Consecutive diagonal elements of a dense matrix with leading dimension
    // Nrows are Nrows + 1 elements apart in linear memory.
    const int STRIDE = Nrows + 1;

    // --- Setting the host, Nrows x Ncols matrix
    double h_A[Nrows][Ncols] = {
        { 2.,    -2.,    -2.,    -2.,    -2.,},
        {-2.,     4.,     0.,     0.,     0.,},
        {-2.,     0.,     6.,     2.,     2.,},
        {-2.,     0.,     2.,     8.,     4.,},
        {-2.,     0.,     2.,     4.,     10.,}
    };

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A = NULL;     gpuErrchk(cudaMalloc(&d_A,      Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- cuSOLVER input/output parameters/arrays
    int work_size = 0;
    int *devInfo = NULL;    gpuErrchk(cudaMalloc(&devInfo,          sizeof(int)));

    // --- CUDA solver initialization (status is now checked like every other call)
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- CUDA CHOLESKY initialization: query the workspace size, in elements
    cusolveSafeCall(cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_LOWER, Nrows, d_A, Nrows, &work_size));

    // --- CUDA POTRF execution: d_A is overwritten in place by the Cholesky factor
    double *work = NULL;    gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
    cusolveSafeCall(cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_LOWER, Nrows, d_A, Nrows, work, work_size, devInfo));

    // The blocking copy also guarantees the factorization has finished.
    int devInfo_h = 0;      gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));

    int exit_code = EXIT_SUCCESS;
    if (devInfo_h != 0)
    {
        // A nonzero devInfo means the factor in d_A is not valid, so a
        // determinant derived from it would be meaningless: report and skip.
        std::cout << "Unsuccessful potrf execution\n\n";
        exit_code = EXIT_FAILURE;
    }
    else
    {
        // --- Strided reduction of the elements of d_A: calculating the product of the diagonal of the Cholesky factorization
        thrust::device_ptr<double> dev_ptr = thrust::device_pointer_cast(d_A);
        typedef thrust::device_vector<double>::iterator Iterator;
        strided_range<Iterator> pos(dev_ptr, dev_ptr + Nrows * Ncols, STRIDE);

        double det = thrust::reduce(pos.begin(), pos.end(), 1., thrust::multiplies<double>());
        det  = det * det;

        printf("determinant = %f\n", det);
    }

    // --- Releasing the solver handle and every device allocation on both paths
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));

    return exit_code;
}

#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括“Utilities.cuh”
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
/*************************/
/*步距函子*/
/*************************/
模板
类步距
{
公众：
typedef typename推力：：迭代器_差异：：类型差异_类型；
结构跨步函数：公共推力：：一元函数
{
差异式步幅；
步幅函子（差分型步幅）
：步幅（步幅）{}
__主机设备__
差分类型运算符（）（常数差分类型&i）常数
{
返回步幅*i；
}
};
typedef typename推力：：计数迭代器计数迭代器；
typedef typename推力：：transform_迭代器TransformIterator；
typedef typename推力：：置换迭代器置换迭代器；
//跨步范围迭代器的类型
typedef置换迭代器；
//为范围[第一个，最后一个]构建跨步范围
步幅范围（迭代器优先、迭代器最后、差分类型步幅）
：第一（第一）、最后（最后）、大步（大步）{}
迭代器开始（void）常量
{
返回置换迭代器（第一个，TransformIterator（CountingIterator（0），stride_函子（stride））；
}
迭代器结束（void）常量
{
return begin（）+（（last-first）+（stride-1））/stride；
}
受保护的：
迭代器优先；
迭代器last；
差异式步幅；
};
内部主（空）
{
常数int Nrows=5；
常数int Ncols=5；
常量int步长=Nrows+1；
//---设置主机，Nrows x Ncols矩阵
双h_A[Nrows][Ncols]={
{ 2.,    -2.,    -2.,    -2.,    -2.,},  
{-2.,     4.,     0.,     0.,     0.,}, 
{-2.,     0.,     6.,     2.,     2.,}, 
{-2.,     0.,     2.,     8.,     4.,}, 
{-2.,     0.,     2.,     4.,     10.,}
};
//---设置设备矩阵并将主机矩阵移动到设备
双倍*d_A；gpuerchk（Cudamaloc（&d_A，Nrows*Ncols*sizeof（双倍））；
gpuErrchk（cudaMemcpy（d_A，h_A，Nrows*Ncols*sizeof（double），cudaMemcpyHostToDevice））；
//---cuSOLVE输入/输出参数/数组
int work_size=0；
int*devInfo；gpuerchk（cudaMalloc（&devInfo，sizeof（int））；
//---CUDA解算器初始化
cusolverDnHandle\u t solver\u手柄；
cusolverDnCreate（&solver\u handle）；
//---CUDA CHOLESKY初始化
cusolveSafeCall（CUSOLVERNDPOTRF_缓冲大小（解算器_手柄、立方体_填充_模式_下、Nrows、d_A、Nrows和工作_大小））；
//---CUDA POTRF执行
双*工作；gpuerchk（cudaMalloc（&work，工作尺寸*尺寸（双））；
cusolveSafeCall（CUSOLVERNDPOTRF（解算器手柄、立方体填充模式、下部、Nrows、d_A、Nrows、工作、工作大小、设备信息））；
int-devInfo_h=0；gpuerchk（cudaMemcpy（&devInfo_h，devInfo，sizeof（int），cudaMemcpyDeviceToHost））；