cublasStrsmBatched — CUBLAS_STATUS_EXECUTION_FAILED

I cannot run cublasStrsmBatched (line 113) without getting CUBLAS_STATUS_EXECUTION_FAILED (13). To simplify, all matrix values and alpha are 1.0, all matrices are square, and lda, ldb, m and n are all equal. I am able to run cublasSgemmBatched and cublasStrsm in the same way without errors, so cublasStrsmBatched should behave the same — but for me it does not. Please tell me if you know what I am doing wrong in this code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
cublasHandle_t handle;
void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);
void TestCublasStrsmBatched(int size, int numOfLinSys);
/* Entry point: create the shared cuBLAS context, run the batched-TRSM
 * test on two 2x2 systems, then tear the context down.  Each step is
 * checked by the matching helper, which exits on failure. */
int main()
{
    cublasStatus_t status;

    status = cublasCreate(&handle);
    CheckCublasCreate(status);

    /* Arguments: size of each square matrix, number of linear systems. */
    TestCublasStrsmBatched(2, 2);

    status = cublasDestroy(handle);
    CheckCublasDestroy(status);
    return 0;
}
/* Solve numOfLinSys triangular systems A*X = alpha*B with a single
 * batched cuBLAS call.  Every matrix is size x size and filled with 1.0
 * so results are easy to inspect; alpha is 1.0 as well.
 *
 * NOTE(review): per the answer below, cublasStrsmBatched is only
 * supported on devices of compute capability >= 2.0 (the kernel needs
 * 48KB of shared memory); on older hardware the call fails with
 * CUBLAS_STATUS_EXECUTION_FAILED (13) regardless of the host code. */
void TestCublasStrsmBatched(int size, int numOfLinSys)
{
    cublasStatus_t status;
    cudaError_t error;
    float **h_A;   /* host copies of the A matrices                    */
    float **d_A;   /* DEVICE array of device pointers to the A data    */
    float **h_B;   /* host copies of the B matrices                    */
    float **d_B;   /* DEVICE array of device pointers to the B data    */
    float **hd_A;  /* host array holding the device pointers to A      */
    float **hd_B;  /* host array holding the device pointers to B      */
    float *alpha;
    const int n = size;
    const int m = size;
    const int lda = m;
    const int ldb = m;
    const int matA_numOfElem = m*m;
    const int matB_numOfElem = m*n;
    int i, j;

    h_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_A);
    h_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_B);
    alpha = (float *)malloc(sizeof(float));
    CheckAllocateHost(alpha);  /* bug fix: alpha was dereferenced unchecked */
    *alpha = 1.0;

    for (j = 0; j < numOfLinSys; j++){
        h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
        CheckAllocateHost(h_A[j]);  /* bug fix: original checked h_A, not h_A[j] */
        for (i = 0; i < matA_numOfElem; i++)
            h_A[j][i] = 1.0;
        h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
        CheckAllocateHost(h_B[j]);  /* bug fix: original checked h_B, not h_B[j] */
        for (i = 0; i < matB_numOfElem; i++)
            h_B[j][i] = 1.0;
    }

    /* One device buffer per system; upload the host data into each. */
    hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_A);
    hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_B);
    for (j = 0; j < numOfLinSys; j++){
        error = cudaMalloc((void **)&hd_A[j],
                           matA_numOfElem * sizeof(float));
        CheckCudaMalloc(error);
        error = cudaMalloc((void **)&hd_B[j],
                           matB_numOfElem * sizeof(float));
        CheckCudaMalloc(error);
        status = cublasSetMatrix(m, m, sizeof(float),
                                 h_A[j], lda, hd_A[j], lda);
        CheckCublasSetGetMatrix(status);
        status = cublasSetMatrix(m, n, sizeof(float),
                                 h_B[j], ldb, hd_B[j], ldb);
        CheckCublasSetGetMatrix(status);
    }

    /* The batched API takes DEVICE arrays of device pointers, so the
       pointer arrays themselves must also be copied to the GPU. */
    error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);
    error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);
    error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*),
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);
    error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*),
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    /* On pre-cc2.0 devices this is where the status becomes
       CUBLAS_STATUS_EXECUTION_FAILED (13). */
    status = cublasStrsmBatched(handle,
                                CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                                m, n, alpha, d_A, lda, d_B, ldb, numOfLinSys);
    CheckKernelExecution(status);

    /* Release all device and host memory (the original code leaked it). */
    for (j = 0; j < numOfLinSys; j++){
        cudaFree(hd_A[j]);
        cudaFree(hd_B[j]);
        free(h_A[j]);
        free(h_B[j]);
    }
    cudaFree(d_A);
    cudaFree(d_B);
    free(hd_A);
    free(hd_B);
    free(h_A);
    free(h_B);
    free(alpha);
}
/* Abort the process if cuBLAS context creation did not succeed. */
void CheckCublasCreate( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr,
            "!!!! CUBLAS initialization error \n");
    exit(EXIT_FAILURE);
}
/* Abort the process if a host allocation returned a null pointer. */
void CheckAllocateHost( void* h_pointer )
{
    if (h_pointer != 0)
        return;
    fprintf(stderr,
            "!!!! host memory allocation error \n");
    exit(EXIT_FAILURE);
}
/* Abort the process, printing the CUDA error string, if cudaMalloc failed. */
void CheckCudaMalloc( cudaError_t error )
{
    if (error == cudaSuccess)
        return;
    fprintf(stderr,
            "!!!! device memory allocation error (error code %s)\n",
            cudaGetErrorString(error));
    exit(EXIT_FAILURE);
}
/* Abort the process, printing the CUDA error string, if cudaMemcpy failed. */
void CheckCudaMemcpy( cudaError_t error )
{
    if (error == cudaSuccess)
        return;
    fprintf(stderr, "!!!! data copy error (error code %s)\n",
            cudaGetErrorString(error));
    exit(EXIT_FAILURE);
}
/* Abort the process if cublasSetMatrix/cublasGetMatrix reported failure. */
void CheckCublasSetGetMatrix( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr, "!!!! device access error \n");
    exit(EXIT_FAILURE);
}
/* Abort the process if a cuBLAS compute routine reported failure. */
void CheckKernelExecution( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr, "!!!! kernel execution error.\n");
    exit(EXIT_FAILURE);
}
/* Abort the process if tearing down the cuBLAS context failed. */
void CheckCublasDestroy( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr, "!!!! shutdown error \n");
    exit(EXIT_FAILURE);
}
(The remainder of the question was a machine-translated duplicate of the code above and has been omitted.)

Answer: A batched triangular solver was something I had not previously tried in CUBLAS, so I was interested to take a look and see what might be going on. Your code is rather complex, so I did not bother trying to understand it in full, but when I ran it, it appeared to be failing with an internal CUBLAS launch failure:
$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
!!!! kernel execution error.
========= Program hit error 8 on CUDA API call to cudaLaunch
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/Library/Frameworks/CUDA.framework/Versions/A/Libraries/libcuda_256.00.35.dylib (cudbgGetAPIVersion + 0x27bd7) [0x4538e7]
========= Host Frame:/usr/local/cuda/lib/libcudart.dylib (cudaLaunch + 0x26c) [0x45c8c]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasZgetrfBatched + 0x1e34) [0x196ae4]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0x64d) [0x1974cd]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0xacb) [0x19794b]
========= Host Frame:/Users/talonmies/./a.out (_Z22TestCublasStrsmBatchedii + 0x3c1) [0x1b28]
========= Host Frame:/Users/talonmies/./a.out (main + 0x3d) [0x1b7d]
========= Host Frame:/Users/talonmies/./a.out (start + 0x35) [0x14e9]
========= Host Frame:[0x1]
(This is an OS X machine with a compute capability 1.2 GPU and CUDA 5.0.) Error 8 is cudaErrorInvalidDeviceFunction, which normally arises only when the library or fatbinary contains no code matching your GPU's architecture and cannot be JIT-recompiled into something the GPU can run.

Out of curiosity, I wrote my own, much simpler reproduction case from scratch:
#include <iostream>
#include <cublas_v2.h>
// Minimal repro for cublasStrsmBatched: solve Nsys lower-triangular systems
// A * X = B, each with Nrhs right-hand sides.  All systems share one device
// copy of A; each system gets its own device copy of B.
// NOTE(review): this demo deliberately omits CUDA/cuBLAS error checking.
int main(void)
{
const int Neq = 5, Nrhs = 2, Nsys = 4;
// cuBLAS is column-major, so each C row below is stored contiguously and
// read by cuBLAS as one COLUMN of the Neq x Neq lower-triangular A.
float Atri[Neq][Neq] =
{ { 1, 6, 11, 16, 21},
{ 0, 7, 12, 17, 22},
{ 0, 0, 13, 18, 23},
{ 0, 0, 0, 19, 24},
{ 0, 0, 0, 0, 25} };
// Right-hand sides: each C row is one RHS column of length Neq.
float B[Nrhs][Neq] =
{ { 1, 27, 112, 290, 595},
{ 2, 40, 148, 360, 710} };
// syslhs/sysrhs: host arrays of per-system device pointers.
// A_/B_: raw device buffers; syslhs_/sysrhs_: device pointer arrays.
float *syslhs[Nsys], *sysrhs[Nsys];
float *A_, *B_, **syslhs_, **sysrhs_;
size_t Asz = sizeof(float) * (size_t)(Neq * Neq);
size_t Bsz = sizeof(float) * (size_t)(Neq * Nrhs);
cudaMalloc((void **)(&A_), Asz);
// One contiguous device allocation holds all Nsys copies of B.
cudaMalloc((void **)(&B_), Bsz * size_t(Nsys));
cudaMemcpy(A_, Atri, Asz, cudaMemcpyHostToDevice);
for(int i=0; i<Nsys; i++) {
// Every system solves against the SAME device copy of A ...
syslhs[i] = A_;
// ... but writes into its own slice of B_ (byte offset i*Bsz).
sysrhs[i] = (float*)((char *)B_ + i*Bsz);
cudaMemcpy(sysrhs[i], B, Bsz, cudaMemcpyHostToDevice);
}
// The batched call needs the pointer arrays themselves in device memory.
size_t syssz = sizeof(float *) * (size_t)Nsys;
cudaMalloc((void **)&syslhs_, syssz);
cudaMalloc((void **)&sysrhs_, syssz);
cudaMemcpy(syslhs_, syslhs, syssz, cudaMemcpyHostToDevice);
cudaMemcpy(sysrhs_, sysrhs, syssz, cudaMemcpyHostToDevice);
// Solve L * X = B in place (X overwrites each system's RHS buffer).
const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
const cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
const cublasFillMode_t ulo = CUBLAS_FILL_MODE_LOWER;
const cublasOperation_t trans = CUBLAS_OP_N;
float alpha = 1.f;
cublasHandle_t handle;
cublasCreate(&handle);
cublasStrsmBatched(
handle,
side, ulo, trans, diag,
Neq, Nrhs,
&alpha,
syslhs_, Neq,
sysrhs_, Neq,
Nsys
);
// Copy each solution back into the host B array and print it.
// (Blocking cudaMemcpy also synchronizes with the batched solve.)
for(int k=0; k<Nsys; k++) {
cudaMemcpy(B, sysrhs[k], Bsz, cudaMemcpyDeviceToHost);
for(int i=0; i<Nrhs; i++) {
for(int j=0; j<Neq; j++) {
std::cout << B[i][j] << ",";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
return 0;
}
(The code above was followed by a machine-translated duplicate, omitted here.)

Comments: Yes, cuda-memcheck reported the same error. I had to calm down before filing a bug report; I have sent a GPU test-drive request. In the meantime, could anyone with access to a compute capability 3.5 card try the code above? — @user2971354: please see my updated answer. I will have the means to confirm this, but it appears the library simply contains no code for either of these GPUs. The documentation/release notes should be changed to reflect this. I will have access to a Kepler card tomorrow to verify this hypothesis; it would be good if you could accept this answer to remove it from the unanswered list. It seems this behaviour is more or less expected and it is a documentation oversight. The documentation will probably be updated in the CUDA 6.0 time frame to reflect that this feature is only supported on compute capability 2.0 or newer architectures. This is mainly because the algorithm requires 48KB of shared memory, which is not available on devices of cc1.x architecture. The CUDA 6 documentation has been updated to reflect that trsmBatched, getrfBatched and getriBatched require cc 2.0 or higher.