减少CUDA中的矩阵行或列_Cuda_Cublas

减少CUDA中的矩阵行或列

cuda

减少CUDA中的矩阵行或列,cuda,cublas,Cuda,Cublas,我正在使用CUDA和cuBLAS执行矩阵运算我需要对矩阵的行（或列）求和。目前，我是通过将矩阵乘以一个1向量来实现的，但这似乎不是很有效还有更好的办法吗？在cuBLAS中找不到任何东西使用cuBLAS\u gemv（）将矩阵与一个向量相乘是一种非常有效的方法，除非您考虑手工编写自己的内核您可以轻松地分析cublas\u gemv（）的mem带宽。它非常接近于一次读取整个矩阵数据，这可以看作是矩阵行/列求和的理论峰值性能额外操作“x1.0”不会导致性能降低，因为： cublas_gemv（

我正在使用CUDA和

cuBLAS

执行矩阵运算

我需要对矩阵的行（或列）求和。目前，我是通过将矩阵乘以一个1向量来实现的，但这似乎不是很有效

还有更好的办法吗？在

cuBLAS

中找不到任何东西使用

cublas_gemv()

将矩阵与同一个全1向量相乘是一种非常有效的方法，除非您打算手工编写自己的内核

您可以轻松地分析

cublas_gemv()

的mem带宽。它非常接近于一次读取整个矩阵数据，这可以看作是矩阵行/列求和的理论峰值性能

额外操作“x1.0”不会导致性能降低，因为：

cublas_gemv（）

基本上是一个内存带宽受限的操作，额外的算术指令不会成为瓶颈

FMA（融合乘加）指令进一步降低了对指令吞吐量的需求

全1向量占用的内存通常比矩阵占用的内存小得多，并且很容易被GPU缓存，从而降低内存带宽占用

cublas_gemv（）

还可以帮助您处理矩阵布局问题。它适用于行主序/列主序以及任意填充（padding）

我也问过这件事。我的实验表明

cublas_gemv（）

比使用

thrust::reduce_by_key

的分段reduce要好，这是矩阵行求和的另一种方法

与此相关的帖子，包含关于同一主题的有用答案，可在

及

这里我只想指出，通过用一个行向量乘以矩阵来对矩阵各列求和的方法，可以推广为计算一组向量的线性组合。换句话说，如果要计算以下向量基展开式

$$f(x_m) = \sum_{n=1}^{N} c_n \, \psi_n(x_m), \qquad m = 1, \ldots, M$$

其中
f（x_m）
是函数
f（x）
的样本，而
\psi_n
是基函数，
c_n
是展开系数，然后可以将
\psi_n
组织在
N x M
矩阵中，将系数
c_n
组织在行向量中，然后使用
cublas<t>gemv
计算向量x矩阵乘法
下面，我将报告一个完整的示例：

#include <cublas_v2.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>

#include <stdio.h>
#include <iostream>

#include "Utilities.cuh"

/********************************************/
/* LINEAR COMBINATION FUNCTION - FLOAT CASE */
/********************************************/
/**
 * Computes d_linear_combination = A * d_coeff with a single cublasSgemv call.
 *
 * A is the matrix stored at d_basis_functions_real. It is handed to cuBLAS with
 * CUBLAS_OP_N, m = N_sampling_points, n = N_basis_functions and leading
 * dimension N_sampling_points, i.e. cuBLAS (column-major) sees element
 * (j, i) at d_basis_functions_real[i * N_sampling_points + j].
 *
 * @param d_coeff                 N_basis_functions coefficients (stride 1).
 * @param d_basis_functions_real  N_basis_functions * N_sampling_points matrix values.
 * @param d_linear_combination    output, N_sampling_points values (stride 1);
 *                                overwritten because beta is 0.
 * @param N_basis_functions       number of basis functions (gemv "n").
 * @param N_sampling_points       number of sampling points (gemv "m" and lda).
 * @param handle                  an initialized cuBLAS handle.
 *
 * NOTE(review): the pointers are passed straight to cuBLAS, so they are
 * expected to be device pointers; cublasSafeCall comes from Utilities.cuh and
 * presumably reports a non-success cublasStatus_t -- confirm there.
 */
void linearCombination(const float * __restrict__ d_coeff, const float * __restrict__ d_basis_functions_real, float * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle)
{
    // alpha = 1, beta = 0  ->  y = A * x, previous contents of y are ignored.
    const float alpha = 1.f;
    const float beta  = 0.f;

    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha,
                               d_basis_functions_real, N_sampling_points,
                               d_coeff, 1, &beta, d_linear_combination, 1));
}

/*********************************************/
/* LINEAR COMBINATION FUNCTION - DOUBLE CASE */
/*********************************************/
/**
 * Double-precision overload: same contract as the float version, implemented
 * with cublasDgemv.
 */
void linearCombination(const double * __restrict__ d_coeff, const double * __restrict__ d_basis_functions_real, double * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle)
{
    // alpha = 1, beta = 0  ->  y = A * x, previous contents of y are ignored.
    const double alpha = 1.;
    const double beta  = 0.;

    cublasSafeCall(cublasDgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha,
                               d_basis_functions_real, N_sampling_points,
                               d_coeff, 1, &beta, d_linear_combination, 1));
}

/***********************/
/* DISPLAY (HOST SIDE) */
/***********************/
/**
 * Prints, for every sampling point j, the N_basis_functions matrix entries
 * h_basis_functions[i * N_sampling_points + j] followed by the combined value
 * h_linear_combination[j]. Works on host copies so that printing does not
 * trigger one device->host transfer per element.
 */
template <typename T>
static void printLinearCombination(const thrust::host_vector<T> &h_basis_functions,
                                   const thrust::host_vector<T> &h_linear_combination,
                                   const int N_basis_functions, const int N_sampling_points)
{
    for (int j = 0; j < N_sampling_points; j++) {
        std::cout << "Column " << j << " - [ ";
        for (int i = 0; i < N_basis_functions; i++)
            std::cout << h_basis_functions[i * N_sampling_points + j] << " ";
        std::cout << "] = " << h_linear_combination[j] << "\n";
    }
}

/********/
/* MAIN */
/********/
int main()
{
    const int N_basis_functions = 5;     // --- Number of rows              -> Number of basis functions
    const int N_sampling_points = 8;     // --- Number of columns           -> Number of sampling points of the basis functions

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization.
    //     The values are generated on the host and uploaded with one bulk copy:
    //     writing device_vector elements one by one from host code issues a
    //     separate transfer per element. The float matrix is filled before the
    //     double one so the RNG sequence is consumed in the same order as before.
    thrust::host_vector<float> h_basis_functions_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < h_basis_functions_real.size(); i++) h_basis_functions_real[i] = (float)dist(rng);
    thrust::device_vector<float> d_basis_functions_real = h_basis_functions_real;

    thrust::host_vector<double> h_basis_functions_double_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < h_basis_functions_double_real.size(); i++) h_basis_functions_double_real[i] = (double)dist(rng);
    thrust::device_vector<double> d_basis_functions_double_real = h_basis_functions_double_real;

    /************************************/
    /* COMPUTING THE LINEAR COMBINATION */
    /************************************/
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    thrust::device_vector<float>  d_linear_combination_real(N_sampling_points);
    thrust::device_vector<double> d_linear_combination_double_real(N_sampling_points);

    // --- All coefficients equal to 1: the combination reduces to a plain sum.
    thrust::device_vector<float>  d_coeff_real(N_basis_functions, 1.f);
    thrust::device_vector<double> d_coeff_double_real(N_basis_functions, 1.);

    linearCombination(thrust::raw_pointer_cast(d_coeff_real.data()),
                      thrust::raw_pointer_cast(d_basis_functions_real.data()),
                      thrust::raw_pointer_cast(d_linear_combination_real.data()),
                      N_basis_functions, N_sampling_points, handle);

    linearCombination(thrust::raw_pointer_cast(d_coeff_double_real.data()),
                      thrust::raw_pointer_cast(d_basis_functions_double_real.data()),
                      thrust::raw_pointer_cast(d_linear_combination_double_real.data()),
                      N_basis_functions, N_sampling_points, handle);

    // --- Bring the results back with one bulk copy per vector.
    thrust::host_vector<float>  h_linear_combination_real        = d_linear_combination_real;
    thrust::host_vector<double> h_linear_combination_double_real = d_linear_combination_double_real;

    // --- The handle is no longer needed; release the resources cuBLAS holds for it.
    cublasSafeCall(cublasDestroy(handle));

    /*************************/
    /* DISPLAYING THE RESULT */
    /*************************/
    std::cout << "Real case \n\n";
    printLinearCombination(h_basis_functions_real, h_linear_combination_real,
                           N_basis_functions, N_sampling_points);

    std::cout << "\n\nDouble real case \n\n";
    printLinearCombination(h_basis_functions_double_real, h_linear_combination_double_real,
                           N_basis_functions, N_sampling_points);

    return 0;
}

#包括 #包括 #包括 #包括 #包括 #包括“Utilities.cuh” /********************************************/ /*线性组合函数-浮点型*/ /********************************************/ 无效线性组合（常量浮点*\uuuu限制\uuuu系数、常量浮点*\uuuu限制\uuuuu基础\uu函数\uu实数、浮点*\uuu限制\uuuu线性组合、，常量int N_基函数、常量int N_采样点、常量立方体句柄）{ 浮动α=1.f；浮动β=0.f； cublasSafeCall（cublasSgemv（句柄、CUBLAS_OP_N、N_采样点、N_基函数、α、d_基函数、实数、N_采样点、， d_系数，1和β，d_线性组合，1）； } 无效线性组合（常数双*\uuuuu限制\uuuu系数、常数双*\uuuu限制\uuuuu基函数\uu实、双*\uuuu限制\uuuu线性组合，常量int N_基函数、常量int N_采样点、常量立方体句柄）{ 双α=1。；双β=0。； cublasSafeCall（cublasDgemv（手柄、CUBLAS_OP_N、N_采样点、N_基函数、α、d_基函数、实数、N_采样点、， d_系数，1和β，d_线性组合，1）； } /********/ /*主要*/ /********/ int main（） { 常量int N_基函数=5；//---行数->基函数数 const int N_sampling_points=8；//--列数->基函数的采样点数 //---10到99之间的随机均匀整数分布推力：默认随机发动机转速；推力：均匀分布区（10，99）； //---矩阵分配和初始化推力：：设备向量d基函数实数（N基函数*N采样点）；对于（size_t i=0；i