Cuda CUBLAS库中cublasDgetrfBatched()允许的batchsize最大值
CUBLAS库中的Cuda CUBLAS库中cublasDgetrfBatched()允许的batchsize最大值,cuda,gpu,gpgpu,cublas,Cuda,Gpu,Gpgpu,Cublas,CUBLAS库中的cublasDgetrfBatched()是否有最大批量大小限制?我正在做一个比较CPU和GPU之间时间的基准问题。对于1000的批量大小,我得到的GPU时间比CPU时间长。但是,对于batchsize为100的情况,我在CPU上获得了一些加速 我在下面发布了我用于基准测试的代码 1。main.cpp /*main.cpp goes below*/ #include<stdio.h> #include <time.h> #include <cuda
cublasDgetrfBatched()
是否有最大批量大小限制?我正在做一个比较CPU和GPU之间时间的基准问题。对于1000的批量大小,我得到的GPU时间比CPU时间长。但是,对于batchsize为100的情况,我在CPU上获得了一些加速
我在下面发布了我用于基准测试的代码
1。main.cpp
/*main.cpp goes below*/
#include<stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "mathlib_blas.h"
int main(){
double**mat;
double**mat_scratch1;
int *ipvt;
double *fVec;
double *fVecSave;
double *fVec_scratch;
double *A;
double *B;
double **devPtrA;
double **devPtrB;
double **devPtrA_dev;
double **devPtrB_dev;
double *d_x;
double *x;
int *d_pivot_array ;
int *d_info_array;
int *h_info_array;
int batchsize;
int neqn;
cublasHandle_t handle;
cublasStatus_t status;
cudaError_t error;
clock_t start, end, start1, end1;
double rcond;
batchsize = 32;
neqn = 172;
mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
ipvt = (int*) calloc((size_t) neqn, sizeof(int));
fVec = (double*) calloc((size_t) neqn, sizeof(double));
fVecSave = (double*) calloc((size_t) neqn, sizeof(double));
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double));
A = (double*)malloc( neqn*neqn*sizeof(A[0]));
B = (double*)malloc( neqn*neqn*sizeof(B[0]));
devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA));
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB));
for(int b_count =0; b_count<batchsize; b_count++){
cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0]));
cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0]));
}
cudaMalloc((void **)&devPtrA_dev, batchsize*sizeof(*devPtrA));
cudaMalloc((void **)&devPtrB_dev, batchsize*sizeof(*devPtrB));
cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_x, neqn*sizeof(double));
x =(double *)malloc(neqn*sizeof(double));
cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int));
cudaMalloc((void **)&d_info_array, batchsize*sizeof(int));
h_info_array =(int*)malloc(batchsize*sizeof(int));
cublasCreate(&handle);
srand(time(NULL));
/* Fill in the CPU and GPU Matrix */
for (int iRow = 0; iRow < neqn; iRow++) {
double sumCol = 0;
for (int iColumn = 0; iColumn < neqn; iColumn++) {
for(int b_count =0; b_count<batchsize; b_count++){
A[neqn*iColumn + iRow] = rand()%10 ;
mat[iRow][iColumn] = A[neqn*iColumn + iRow];
}
sumCol +=A[neqn*iColumn + iRow];
}
fVec[iRow] = sumCol;
fVecSave[iRow] = sumCol;
}
/*CPU_CODE GOES HERE */
start = clock();
for(int b_count =0; b_count<batchsize; b_count++){
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat_scratch1[iColumn][iRow]= mat[iColumn][iRow];
}
}
dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave);
}
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat[iColumn][iRow]= mat_scratch1[iColumn][iRow];
}
}
for(int b_count =0; b_count<batchsize; b_count++){
for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];
dgesl_blas(mat, neqn, ipvt , fVec_scratch, 0);
}
end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("Time in seconds(CPU) : %lf \n", seconds);
/*CPU_CODE ENDS HERE */
start1 = clock();
for(int b_count =0; b_count<batchsize; b_count++){
status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn);
}
status = cublasDgetrfBatched(handle, neqn, ( double**)devPtrA_dev,neqn,d_pivot_array,d_info_array,batchsize);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);
cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double),cudaMemcpyHostToDevice); /* for testing purpose only */
}
status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev,
neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy( fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost); /* for testing purpose only */
}
end1 = clock();
float seconds1 = (float)(end1 - start1) / CLOCKS_PER_SEC;
printf("Time in seconds(GPU) : %lf \n", seconds1);
printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1);
system("pause");
/* End of the main portion of the code */
free(mat);
free(mat_scratch1);
free(ipvt);
free(fVec);
free(fVecSave);
free(fVec_scratch);
free(A);
free(B);
cudaFree(devPtrA[0]);
cudaFree(devPtrB[0]);
cudaFree(devPtrA_dev);
cudaFree(devPtrB_dev);
free(devPtrA);
free(devPtrB);
cudaFree(d_x);
free(x);
cudaFree(d_pivot_array);
cudaFree(d_info_array);
free(h_info_array);
cublasDestroy_v2(handle);
}
批量大小为100和批量大小为1000之间不应有任何行为差异。(当然会有性能差异-批量大小为1000可能需要更长的时间。) 除了内隐记忆限制外,没有其他限制。事实上,除非GPU返回错误的结果,否则没有理由认为您已经遇到了任何硬限制
(如果您想探索一些行为或性能问题,这个问题没有正确地解决。)“GPU计时大于CPU计时”意味着GPU更慢?无论如何,你应该发布你用作密码的bechmark代码。我已经上传了gpu版本的代码。我已经上传了gpu版本的代码(1)从发布的代码它不清楚是什么时间,以及如何。(2) 解算器的速度在某种程度上取决于数据,因为可能会使用部分旋转。通过为每次运行使用不同的随机数据,测量结果可能是不可重复的。并且没有与CPU版本的比较。我已上传了修订版的代码,并对单个批量大小的CPU和GPU之间的时间进行了比较。我使用8种不同的批大小运行代码。批量大小为1、10、64、100、200、300、500、1000的速度分别为1.80x、13.28x、29.5x、24.91x、17.89x、13.28x、8.58x和4.76x。加速比的最大值约为batchsize 64。64之后,batchsize的增加导致加速比下降,原因是什么?对不起,目前我无法访问dropbox。为什么不在问题中发布你的代码?因此,希望您在问题中发布代码,而不是在外部链接中发布代码。
#include <stdio.h>
#include <math.h>
#define maxm(a,b) (((a) > (b)) ? (a) : (b))
#define minm(a,b) (((a) < (b)) ? (a) : (b))
#define signum(a,b) (((b) < (0)) ? (-a) : (a))
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType);
void dgefa_blas(double **a,int n, int ipvt[],int *info);
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job);
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z);
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType )
{
void** array = nullptr;
array = (void**)calloc(size1, sizeof(void*));
if (array != nullptr) {
if (size2 > 0) {
void* data = calloc(size1*size2, sizeType);
if (data != nullptr) {
char* addr = (char*)data;
for (int index1 = 0; index1 < size1; index1++) {
array[index1] = (void*)addr;
addr += sizeType*size2; /* char is always 1 byte */
}
} else {
free(array);
free(data);
array = nullptr;
}
}
} else {
}
return array;
}
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z)
{
double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm;
int i,info,j,k,kb,kp1,l;
/* Compute 1-norm of a */
anorm = 0.0;
for (j = 0; j < n; j++) {
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(a[i][j]);
anorm = maxm(anorm,vecsum);
}
/* Factor. */
dgefa_blas(a,n,ipvt,&info);
/* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))).
* estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e.
* trans(a) is the transpose of a. The components of e are
* chosen to cause maximum local growth in the elements of
* w, where trans(u)*w=e. The vectors are frequently rescaled
* to avoid overflow.
*/
ek = 1.0;
for (j = 0; j < n; j++)
z[j] = 0.0;
for (k = 0; k < n; k++) {
if (z[k] != 0.0)
ek = signum(ek,-z[k]);
if (fabs(ek-z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(ek-z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ek *= s;
}
wk = ek - z[k];
wkm = -ek - z[k];
s = fabs(wk);
sm = fabs(wkm);
if (a[k][k] != 0.0) {
wk /= a[k][k];
wkm /= a[k][k];
}
else {
wk = 1.0;
wkm = 1.0;
}
kp1 = k + 1;
if (kp1 < n) {
for (j = kp1; j < n; j++) {
sm += (fabs(z[j] + wkm * a[k][j]));
z[j] += (wk * a[k][j]);
s += fabs(z[j]);
}
if (s < sm) {
t = wkm -wk;
wk = wkm;
for (j = kp1; j < n; j++)
z[j] += (t * a[k][j]);
}
}
z[k] = wk;
}
/* dasum(n,s,z,1) */
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
/* Solve trans(l)*y= w
*/
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (k < (n-1)) {
/* sdot(n-k,a(k+1,k),1,z(k+1),1) */
vecdot = 0.0;
for (i = k+1;i < n; i++)
vecdot += (a[i][k] * z[i]);
z[k] += vecdot;
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
}
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
} /* endfor kb */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm = 1.0;
/*
* Solve l * v = y
*/
for (k = 0; k < n; k++) {
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
if (k < (n-1)) {
/* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */
for (i = k+1;i < n; i++)
z[i] += (t * a[i][k]);
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
}
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
/* Solve u * z = v */
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (fabs(z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
if (a[k][k] != 0.0)
z[k] /= a[k][k];
if (a[k][k] == 0.0)
z[k] = 1.0;
t = -z[k];
/* daxpy(k-1,t,a[1][k],1,z[1],1) */
for (i = 0; i < k; i++)
z[i] += (t * a[i][k]);
}
/* Make znorm = 1.0 */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
if (anorm != 0.0) *rcond = ynorm/anorm;
if (anorm == 0.0) *rcond = 0.0;
}
void dgefa_blas(double **a,int n, int ipvt[],int *info)
{
double dmax,t;
int i,j,k,kp1,l,nm1;
*info = 0;
nm1 = n - 1;
if (n > 0) {
for (k = 0; k < nm1; k++) {
kp1 = k + 1;
/* Find l = pivot index. */
dmax = fabs(a[k][k]);
l = k;
for (i = k+1; i < n; i++) {
if (fabs(a[i][k]) <= dmax) continue;
l = i;
}
ipvt[k] = l;
/* Zero pivot implies this column already triangularized. */
if (a[l][k] == 0.0) {
*info = k;
continue;
}
/* Interchange if necessary. */
if (l != k) {
t = a[l][k];
a[l][k] = a[k][k];
a[k][k] = t;
}
/* Compute multipliers. */
if (a[k][k] == 0.0) printf("\n!ERROR. Singular matrix.\n");
t = -1.0/a[k][k];
for (i = k+1; i < n; i++)
a[i][k] *= t;
/* Row elimination with column indexing. */
for (j = kp1; j < n; j++) {
t = a[l][j];
if (l != k) {
a[l][j] = a[k][j];
a[k][j] = t;
}
for (i = k+1; i < n; i++ )
a[i][j] += (t * a[i][k]);
}
}
}
ipvt[n-1] = n-1;
if (a[n-1][n-1] == 0.0) *info = n-1;
}
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job)
{
double t;
int i,k,kb,l,nm1;
nm1 = n - 1;
if (job == 0) {
/* job = 0, solve a * x = b.
* First solve l * y = b.
*/
if (n > 0) {
for (k = 0; k < nm1; k++) {
l = ipvt[k];
t = b[l];
if (l != k) {
b[l] = b[k];
b[k] = t;
}
/* saxpy(n-k,t,a(k+1,k),1,b(k+1),1); */
for (i=k+1;i < n;i++)
b[i] += (t * a[i][k]);
}
}
/* Now solve u * x = y. */
for (kb = 0; kb < n; kb++) {
k = n - kb-1;
b[k] /= a[k][k];
t = -b[k];
/* saxpy(k-1,t,a(1,k),1,b(1),1); */
for (i = 0; i < k ; i++)
b[i] += (t * a[i][k]);
}
return;
}
/* job != 0, solve trans(a) * x = b.
* First solve trans(u) * x = y.
*/
for (k = 0; k < n; k++) {
/* t = ddot(k-1,a(1,k),1,b(1),1); */
t = 0;
for (i = 0; i < k; i++)
t += (a[i][k] * b[i]);
b[k] = (b[k] - t)/a[k][k];
}
/* Now solve trans(l) * x = y. */
if (n > 0) {
for (kb = 0; kb < nm1; kb++) {
k = n - 2 - kb;
/* b[k] = b[k] + ddot(n-k,a(k+1,k),1,b(k+1),1); */
t = 0;
for (i = k+1;i < n; i++)
t += (a[i][k] * b[i]);
b[k] += t;
l = ipvt[k];
if (l != k) {
t = b[l];
b[l] = b[k];
b[k] = t;
}
}
}
}