Python 执行大量4x4矩阵求逆-PyCuda
我正在寻找用Python执行矩阵求逆的解决方案。我认为CUBLAS或MAGMA应该有一种方法以批处理或并发模式执行这些操作,因为每个矩阵独立于所有其他矩阵 因此,我正在寻找这个具体问题的反馈,看看CUBLAS或MAGMA是否有解决方案来执行这个批处理或并行执行 我认为这里提出的计算应该是GPU的理想选择 我必须找到一个范围Python 执行大量4x4矩阵求逆-PyCuda,python,matrix,matrix-inverse,Python,Matrix,Matrix Inverse,我正在寻找用Python执行矩阵求逆的解决方案。我认为CUBLAS或MAGMA应该有一种方法以批处理或并发模式执行这些操作,因为每个矩阵独立于所有其他矩阵 因此,我正在寻找这个具体问题的反馈,看看CUBLAS或MAGMA是否有解决方案来执行这个批处理或并行执行 我认为这里提出的计算应该是GPU的理想选择 我必须找到一个范围(integ_prec,integ_prec)的2D范围内核,其中内核对给定的全局项执行4x4矩阵求逆 如何实现这个内核代码?我已经测试了NVIDIA Developers提供
(integ_prec,integ_prec)
的2D范围内核,其中内核对给定的全局项执行4x4矩阵求逆
如何实现这个内核代码?我已经测试了NVIDIA Developers提供的批处理解算器
,但我无法使其正常工作
更新1
为了回答@Robert Crovella,我尝试使用批处理求解器
来自NVIDIA开发人员(版本BatchedSolver\u v1\u 1
)
您可以在下面看到我在编译过程中收到的警告:
$ make
nvcc -O3 -arch=sm_35 -DKEPLER2 -o example_batch_solver example.c solve.cu inverse.cu
In file included from solve.cu:41:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
^~~~~~~~~~~~~~~~~~~
OPERATIONS_H_
1 warning generated.
In file included from solve.cu:41:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
^~~~~~~~~~~~~~~~~~~
OPERATIONS_H_
1 warning generated.
In file included from inverse.cu:44:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
^~~~~~~~~~~~~~~~~~~
OPERATIONS_H_
1 warning generated.
In file included from inverse.cu:44:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
^~~~~~~~~~~~~~~~~~~
OPERATIONS_H_
1 warning generated.
不幸的是,执行会产生不好的结果:
Non-batched matrix inversion
3.000000 1.000000 1.000000 nan -19945373249087470322107824313046586886748897396355850773313316907920980812816123986073723926411981165664742747916855157931798956499818437291518879567207778108249202114071816066955302634366146096749274721347289725502062211559628338200162202651585616465674552041292175081655027073691104118308864.000000 -25949369271932562088528097628985580835309378491979298170251656488819244813241392783541154149164125403081303093429316785499097407170772831834462454013755392.000000
etc ...
因此,为了避免这些警告,我将宏OPERATIONS\u SOLVE\u H
替换为OPERATIONS\u H
到OPERATIONS.H
文件中。编译过程中没有更多警告,但执行时仍然会出现错误结果(与上面相同)
任何人对此
Batchsolver
(在MacOS 10.13.5
上使用NVIDIA驱动程序387.10.10.35.106
和CUDA-10.0)都有同样的问题吗?如评论中所述,通常不能从pycuda内核代码(或CUDA内核代码,或NUBA CUDA内核)使用numpy函数
CUBLAS提供了一种新的技术,但它目前没有暴露在或中
我们可以继续实现自己的接口(例如,使用pythonctypes
),但由于已知要反转的矩阵是4x4,我认为talonmies评论中的建议很有趣。参考答案,有一个相当简洁的C代码可以对4x4矩阵进行直接求逆
接下来首先是在CUDA中实现这一点。inv4x4功能是对先前代码的改编,为每个矩阵分配16个线程(每个矩阵元素一个),并将该代码用作模型。每个线程负责计算一个结果矩阵元素。首先,我们将其与CUBLASmatinvBatched
进行性能比较:
$ cat t411.cu
#include <iostream>
#include <cublas_v2.h>
#include <cstdlib>
// 4x4 matrix inversion
// https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
// assumes warp size is 32
// assumes block size is multiple of warp size
// therefore assumes number of matrices to be inverted (n) is even
// 16 threads per matrix to invert
const unsigned block_size = 256;
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__device__ unsigned pat[3][16];
const unsigned hpat[3][16] = {
{ 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50},
{ 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14},
{ 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618}};
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off = off >> 4;
return ret;
}
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
template <typename T>
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < n*16){
si[threadIdx.x] = in[idx];
unsigned lane = threadIdx.x & 15;
unsigned sibase = threadIdx.x & 0x03F0;
__syncwarp();
unsigned off = pat[0][lane];
T a,b;
a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
if (!getoff(off)) a = -a;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[1][lane];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[2][lane];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
T det = si[sibase + (lane>>2)]*a;
det += __shfl_down_sync(tmsk, det, 4, 16); // first add
det += __shfl_down_sync(tmsk, det, 8, 16); // second add
det = __shfl_sync(tmsk, det, 0, 16); // broadcast
out[idx] = a / det;
}
}
size_t nr = 2048;
int main(int argc, char *argv[]){
if (argc > 1) nr = atoi(argv[1]);
const mt m1[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0};
const mt i1[] = {-3.0, -0.5, 1.5, 1.0, 1.0, 0.25, -0.25, -0.5, 3.0, 0.25, -1.25, -0.5, -3.0, 0.0, 1.0, 1.0};
const mt m2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};
const mt i2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};
mt *h_d, *d_d;
h_d = (mt *)malloc(nr*2*16*sizeof(mt));
cudaMalloc(&d_d, nr*2*16*sizeof(mt));
cudaMemcpyToSymbol(pat, hpat, 3*16*sizeof(unsigned));
for (int i = 0; i < nr; i++){
memcpy(h_d+i*16*2, m1, sizeof(m1));
memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
long long t = dtime_usec(0);
inv4x4<<<nr*2*16/block_size, block_size>>>(d_d, d_d, nr*2);
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, d_d, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "kernel time: " << t << " microseconds" << std::endl;
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
//cublas
for (int i = 0; i < nr; i++){
memcpy(h_d+i*16*2, m1, sizeof(m1));
memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
cublasHandle_t h;
cublasStatus_t cs = cublasCreate(&h);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
mt **A, **Ai, *Aid, **Ap, **Aip;
A = (mt **)malloc(nr*2*sizeof(mt *));
Ai = (mt **)malloc(nr*2*sizeof(mt *));
cudaMalloc(&Aid, nr*2*16*sizeof(mt));
cudaMalloc(&Ap, nr*2*sizeof(mt *));
cudaMalloc(&Aip, nr*2*sizeof(mt *));
for (int i = 0; i < nr*2; i++) A[i] = d_d + 16*i;
for (int i = 0; i < nr*2; i++) Ai[i] = Aid + 16*i;
cudaMemcpy(Ap, A, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
cudaMemcpy(Aip, Ai, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
int *info;
cudaMalloc(&info, nr*2*sizeof(int));
t = dtime_usec(0);
cs = cublasSmatinvBatched(h, 4, Ap, 4, Aip, 4, info, nr*2);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, Aid, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "cublas time: " << t << " microseconds" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
return 0;
}
$ nvcc -o t411 t411.cu -lcublas
$ ./t411
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,-0,1,1,
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
kernel time: 49 microseconds
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
cublas time: 95 microseconds
$
$cat t411.cu
#包括
#包括
#包括
//4x4矩阵求逆
// https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
//假设扭曲大小为32
//假设块大小是扭曲大小的倍数
//因此,假设要反转的矩阵数(n)为偶数
//每个矩阵16个螺纹,以反转
常量无符号块大小=256;
类型定义浮动mt;
#包括
#包括
#定义USECPSEC 10000000ull
long long dtime\u usec(无符号长启动){
蒂梅瓦尔电视;
gettimeofday(&tv,0);
返回((tv.tv_sec*USECPSEC)+tv.tv_usec)-开始;
}
__装置u uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu;
const unsigned hpat[3][16]={
{0x0EB51FA5、0x1EB10FA1、0x0E711F61、0x1A710B61、0x1EB40FA4、0x0EB01FA0、0x1E700F60、0x0A701B60、0x0DB41F94、0x1DB00F90、0x0D701F50、0x19700B50、0x1DA40E94、0x0DA01E90、0x1D600E50、0x09601A50},
{0x1E790F69、0x0E391F29、0x1E350F25、0x0A351B25、0x0E781F68、0x1E380F28、0x0E341F24、0x1A340B24、0x1D780F58、0x0D381F18、0x1D340F14、0x09341B14、0x0D681E58、0x1D280E18、0x0D241E14、0x19240A14},
{0x0A7D1B6D、0x1A3D0B2D、0x063D172D、0x16390729、0x1A7C0B6C、0x0A3C1B2C、0x163C072C、0x06381728、0x097C1B5C、0x193C0B1C、0x053C171C、0x15380718、0x196C0A5C、0x092C1A1C、0x152C061C、0x05281618};
__设备\未签名关闭(未签名关闭){
无符号ret=off&0x0F;
关=关>>4;
返回ret;
}
常量无符号tmsk=0xFFFFFFFF;
//就地可接受,即out==in)
//T=仅浮动或双精度
模板
__全局无效inv4x4(常数T*\Uuu限制\Uuuuuuuuuuuuuuuuuuuuuuuuuuuu限制\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuun,常数大小\uun){
__共享块大小;
size_t idx=threadIdx.x+blockDim.x*blockIdx.x;
如果(idx>2)]*a;
det+=\uuuuuSHFL\uDOWN\uSYNC(tmsk,det,4,16);//第一次添加
det+=\uuuuuSHFL\uDOWN\uSYNC(tmsk,det,8,16);//第二次添加
det=uuu shfl_sync(tmsk,det,0,16);//广播
out[idx]=a/det;
}
}
尺寸=2048;
int main(int argc,char*argv[]){
如果(argc>1)nr=atoi(argv[1]);
常数mt m1[]={1.0,1.0,1.0,0.0,0.0,3.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,2.0,1.0};
常数mt i1[]={3.0,-0.5,1.5,1.0,1.0,0.25,-0.25,-0.5,3.0,0.25,-1.25,-0.5,-3.0,0.0,1.0,1.0};
常数mt m2[]={1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0};
常数mti2[]={1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0};
mt*h_d,*d_d;
h_d=(mt*)马洛克(nr*2*16*sizeof(mt));
库达马洛克公司
$ cat t10.py
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off = off >> 4;
return ret;
}
const int block_size = 256;
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < n*16){
si[threadIdx.x] = in[idx];
unsigned lane = threadIdx.x & 15;
unsigned sibase = threadIdx.x & 0x03F0;
__syncwarp();
unsigned off = pat[lane];
T a,b;
a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
if (!getoff(off)) a = -a;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+16];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+32];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
T det = si[sibase + (lane>>2)]*a;
det += __shfl_down_sync(tmsk, det, 4, 16); // first add
det += __shfl_down_sync(tmsk, det, 8, 16); // second add
det = __shfl_sync(tmsk, det, 0, 16); // broadcast
out[idx] = a / det;
}
}
""")
# python function for inverting 4x4 matrices
# n should be an even number
def gpuinv4x4(inp, n):
# internal constants not to be modified
hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
inpd = np.array(inp, dtype=np.float32)
hpatd = np.array(hpat, dtype=np.uint32)
output = np.empty((n*16), dtype= np.float32)
# Get kernel function
matinv4x4 = kernel.get_function("inv4x4")
# Define block, grid and compute
blockDim = (256,1,1) # do not change
gridDim = ((n/16)+1,1,1)
# Kernel function
matinv4x4 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv4x4(inp, n)
print(result)
$ python t10.py
[-3. -0.5 1.5 1. 1. 0.25 -0.25 -0.5 3. 0.25 -1.25 -0.5 -3.
-0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
1. 0. 0. 0. 0. 1. ]
$