Reduce matrix columns with CUDA (C++)


I have a matrix and I would like to use CUDA to compute the column-wise means (which boil down to simple column sums) in the fastest possible way, i.e., to return a row vector containing the mean of every column of the matrix. A sum-reduction implementation for computing the sum of a single column vector looks like this:

template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
    extern __shared__ T sdata[];

    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    T x = 0.0;
    if (tid < n) {
        x = input[tid];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}
It is invoked like this:

int n = ... // vector size
const int BLOCK_SIZE = 1024;
int number_of_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
double* per_block_results = NULL;
cudaMalloc((void**) &per_block_results, sizeof(double)*(number_of_blocks + 1));
// launch one kernel to compute, per-block, a partial sum
kernelSum<double> <<<number_of_blocks, BLOCK_SIZE, BLOCK_SIZE*sizeof(double)>>>(a, per_block_results, n);
// launch a single block to compute the sum of the partial sums
kernelSum<double> <<<1, number_of_blocks, number_of_blocks*sizeof(double)>>>(per_block_results, per_block_results + number_of_blocks, number_of_blocks);
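The grand total then ends up in the extra element allocated at the end of per_block_results. A minimal readback sketch (not part of the original post; error checking omitted):

// Copy the grand total, written by the second launch into the extra
// slot at index number_of_blocks, back to the host.
double sum = 0.0;
cudaMemcpy(&sum, per_block_results + number_of_blocks, sizeof(double),
           cudaMemcpyDeviceToHost);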
I can generalize this kernel to a matrix with any number of columns, but I am limited by shared memory. My GPU has compute capability 3.5, so it has 48KB of shared memory and a maximum block size of 1024 threads. Since I am interested in double precision, that gives a maximum of 48*1024/8 = 6144 doubles in shared memory. Because the reduction is done per block, I can have at most 6144 (doubles in shared memory) / 1024 (block size) = 6 columns whose sum reductions I can compute simultaneously. Shrinking the block size would then allow more columns to be computed simultaneously, e.g. 6144 (doubles in shared memory) / 512 (block size) = 12.

Would this more complex strategy beat a simple CPU loop over every column of the matrix that invokes the sum reduction? Is there another, better way to do this?

What is stopping you doing something like this:

template<typename T>
__global__ void kernelSum(const T* __restrict__ input, 
                          T* __restrict__ per_block_results, 
                          const size_t lda, const size_t n)
{
    extern __shared__ T sdata[];

    // Accumulate per thread partial sum
    T x = 0.0;
    const T * p = &input[blockIdx.x * lda];
    for(int i=threadIdx.x; i < n; i += blockDim.x) {
        x += p[i];
    }

    // load partial sum into __shared__ memory
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

You should experiment with the block size relative to the matrix dimensions for optimal performance, but in general, the more work per thread the kernel does, the better the overall performance (because the shared-memory reduction is quite expensive). You can see one approach to block and grid size heuristics for a similarly memory-bandwidth-bound problem in this answer.
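For completeness, here is a hedged sketch of how the kernel above might be launched for an Nrows x Ncols column-major matrix d_A with leading dimension lda, one block per column (the variable names and block size are illustrative, not from the answer):

// One block per column: block b strides down column b, i.e. the n = Nrows
// elements starting at d_A + b * lda. BLOCK_SIZE must be a power of two
// for the tree reduction to be exact.
const int BLOCK_SIZE = 256;
kernelSum<double><<<Ncols, BLOCK_SIZE, BLOCK_SIZE * sizeof(double)>>>(
    d_A, d_col_sums, lda, Nrows);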

As an alternative to the answer already provided by talonmies, here I report 4 other approaches to column reduction, 3 of them based on CUDA Thrust and 1 based on cublas<t>gemv() with a column of 1's, as I suggested in my comment above.

The CUDA Thrust approaches operate on an implicit transposition of the matrix, obtained by:

thrust::make_permutation_iterator(d_matrix.begin(),                 
        thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                (_1 % Nrows) * Ncols + _1 / Nrows))
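Since the matrix is stored row-major, sending linear index k to (k % Nrows) * Ncols + k / Nrows visits element (k % Nrows, k / Nrows), so consecutive values of k walk down one column at a time. A minimal host-side illustration of the index map (this snippet is not part of the answer; the values are made up):

#include <cstdio>

int main() {
    const int Nrows = 2, Ncols = 3;
    // Row-major storage of [[1, 2, 3], [4, 5, 6]]
    const float A[Nrows * Ncols] = { 1, 2, 3, 4, 5, 6 };
    for (int k = 0; k < Nrows * Ncols; k++) {
        int perm = (k % Nrows) * Ncols + k / Nrows;  // same map as the iterator
        int key  = k / Nrows;                        // column index used as reduction key
        printf("k=%d -> A[%d] = %g (key %d)\n", k, perm, A[perm], key);
    }
    // The columns come out in contiguous runs (1,4 then 2,5 then 3,6),
    // which is why reduce_by_key over these keys yields the column sums.
    return 0;
}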
Here is the full code:

#include <cublas_v2.h>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>

#include <stdio.h>
#include <iostream>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

using namespace thrust::placeholders;

// --- Required for approach #2
__device__ float *vals;

/**************************************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
/**************************************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {

    T Ncols; // --- Number of columns (of the implicitly transposed matrix: instantiated with Nrows at the call sites)

    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

    __host__ __device__ T operator()(T i) { return i / Ncols; }
};

/******************************************/
/* COL_REDUCTION - NEEDED FOR APPROACH #2 */
/******************************************/
struct col_reduction {

    const int Nrows;    // --- Number of rows
    const int Ncols;    // --- Number of cols

    col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {}

    __device__ float operator()(float& x, int& y ) {
        float temp = 0.f;
        for (int i = 0; i<Nrows; i++) {
            temp += vals[y + (i*Ncols)];
        }
        return temp;
    }
};

/**************************/
/* NEEDED FOR APPROACH #3 */
/**************************/
template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
    T C;
    __host__ __device__ MulC(T c) : C(c) { }
    __host__ __device__ T operator()(T x) { return x * C; }
};

/********/
/* MAIN */
/********/
int main()
{
    const int Nrows = 5;     // --- Number of rows
    const int Ncols = 8;     // --- Number of columns

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_matrix(Nrows * Ncols);
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);

    TimingGPU timerGPU;

    /***************/
    /* APPROACH #1 */
    /***************/
    timerGPU.StartCounter();
    // --- Allocate space for column sums and indices
    thrust::device_vector<float> d_col_sums(Ncols);
    thrust::device_vector<int> d_col_indices(Ncols);

    // --- Compute column sums by summing values with equal column indices
    thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)),
                          thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                          thrust::make_permutation_iterator(
                                d_matrix.begin(),
                                thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                          d_col_indices.begin(),
                          d_col_sums.begin(),
                          thrust::equal_to<int>(),
                          thrust::plus<float>());

    //thrust::reduce_by_key(
 //               thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
 //               thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
 //               thrust::make_permutation_iterator(
    //              d_matrix.begin(),
    //              thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
 //               thrust::make_discard_iterator(),
 //               d_col_sums.begin());

    printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());

    // --- Print result
    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums[j] << "\n";
    }

    /***************/
    /* APPROACH #2 */
    /***************/
    timerGPU.StartCounter();
    thrust::device_vector<float> d_col_sums_2(Ncols, 0);
    float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
    gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
    thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator<int>(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols));

    printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());

    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums_2[j] << "\n";
    }

    /***************/
    /* APPROACH #3 */
    /***************/

    timerGPU.StartCounter();
    thrust::device_vector<float> d_col_sums_3(Ncols, 0);
    thrust::device_vector<float> d_temp(Nrows * Ncols);
    thrust::inclusive_scan_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                thrust::make_permutation_iterator(
                        d_matrix.begin(),
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                d_temp.begin());
    thrust::copy(
                thrust::make_permutation_iterator(
                        d_temp.begin() + Nrows - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))),
                thrust::make_permutation_iterator(
                        d_temp.begin() + Nrows - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))) + Ncols,
                d_col_sums_3.begin());

    printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());

    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums_3[j] << "\n";
    }

    /***************/
    /* APPROACH #4 */
    /***************/
    cublasHandle_t handle;

    timerGPU.StartCounter();
    cublasSafeCall(cublasCreate(&handle));

    thrust::device_vector<float> d_col_sums_4(Ncols);
    thrust::device_vector<float> d_ones(Nrows, 1.f);

    float alpha = 1.f;
    float beta  = 0.f;
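    // --- Note: cuBLAS assumes column-major storage, so it reads the
    //     row-major Nrows x Ncols matrix as its transpose, a column-major
    //     Ncols x Nrows matrix with leading dimension Ncols. gemv with
    //     CUBLAS_OP_N therefore computes A^T * ones, i.e. the column sums.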
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols, 
                               thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1));

    printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());

    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums_4[j] << "\n";
    }

    return 0;
}
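Note that Utilities.cuh and TimingGPU.cuh appear to be the answerer's own helper headers (supplying the gpuErrchk, cublasSafeCall, and TimingGPU utilities used above); they are not part of the CUDA toolkit and must be obtained separately.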
A simple alternative would be to use cublas<t>gemv() to set the problem up as a matrix-vector product between the matrix and a vector of all 1's.

Indeed, one solution is to compute A^T * 1, which returns the column sums. Please elaborate that into an answer and I'll accept it. On the downside, though, doing so many useless FLOPs feels like a waste; as written, I would be misusing the GEMV kernel, since no multiplication is actually needed.

@talonmies has given an answer to your specific question. You are indeed somewhat misusing cublas<t>gemv(), since you are loading a dummy vector of 1's and multiplying the elements of the matrix A by them. But the cuBLAS routines are highly optimized, and it would be interesting to evaluate whether your own implementation is faster than the "naive" cuBLAS one we are talking about. Perhaps you could also compare against talonmies' solution and post the results here...

Is your matrix data stored in row-major (e.g. C-style) or column-major (e.g. Fortran-style) order? A matrix-column-sums kernel for row-major storage is pretty trivial and doesn't need shared memory.

@RobertCrovella: Indeed, although unless the matrix has many columns, it might be hard to get close to...