为什么我的CUDA内核会因不同的数据集大小而崩溃(未指定的启动失败)?
我有一个内核，根据矩阵元素的位置（对角线或非对角线）计算矩阵的不同元素。计算以下大小的矩阵时，内核按预期工作：
- 14 x 14(我知道这很小,并且没有正确使用GPU资源,但这纯粹是为了测试目的,以确保结果正确)
- 118 x 118,以及
- 300 x 300
// Builds the Y-bus (admittance) matrix: one thread owns one matrix element.
// Expects a 2D grid of 2D blocks covering at least
// numberOfBuses x numberOfBuses threads; excess threads exit immediately.
// R/X/B/fromBus/toBus each hold numberOfBranches entries; y holds
// numberOfBuses*numberOfBuses entries and must be initialized by the caller,
// because diagonal terms are accumulated onto the existing value.
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
    if (rowIdx >= numberOfBuses || colIdx >= numberOfBuses)
        return;

    int index = rowIdx*numberOfBuses + colIdx;
    // Accumulate in a register: each thread owns exactly one element, so a
    // single store at the end replaces one global read-modify-write per
    // matched branch.
    cuComplex acc = y[index];

    for (int i = 0; i < numberOfBranches; ++i)
    {
        int from = fromBus[i];
        int to   = toBus[i];

        // Branch i only touches the four elements (from,from), (to,to),
        // (from,to), (to,from); skip it before loading R/X/B otherwise.
        bool rowHit = (rowIdx == from) || (rowIdx == to);
        bool colHit = (colIdx == from) || (colIdx == to);
        if (!rowHit || !colHit)
            continue;

        // Load R/X/B once and derive the branch admittance once, instead of
        // re-reading global memory inside every condition (the original
        // issued up to 16 redundant loads per branch per thread).
        float r   = R[i];
        float x   = X[i];
        float zsq = r*r + x*x;   // |Z|^2 of the series impedance
        float g   = r / zsq;     // series conductance
        float b   = x / zsq;     // series susceptance magnitude

        // Same condition order as the original, so degenerate branches
        // (from == to) resolve identically.
        if (rowIdx == from && colIdx == from) {   // diagonal element, from side
            acc = cuCaddf(acc, make_cuComplex(g, -b + (B[i]/2)));
        }
        if (rowIdx == to && colIdx == to) {       // diagonal element, to side
            acc = cuCaddf(acc, make_cuComplex(g, -b + (B[i]/2)));
        }
        if (rowIdx == from && colIdx == to) {     // off-diagonal element
            acc = make_cuComplex(-g, b);
        }
        if (rowIdx == to && colIdx == from) {     // off-diagonal element
            acc = make_cuComplex(-g, b);
        }
    }
    y[index] = acc;
}
其中 cudaStatN 为 cudaError_t 类型，用于捕获错误。最后四次分配（cudaStat15–18）是在代码靠后处、针对另一个内核完成的，但仍在调用相关内核之前。
分配与启动参数如下：
// --- Device allocations for the createYBus kernel ---
// Per-branch input arrays: one entry per line of the input data file.
cudaStat1 = cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int));
cudaStat2 = cudaMalloc((void**)&dev_toBus, numLines*sizeof(int));
cudaStat3 = cudaMalloc((void**)&dev_R, numLines*sizeof(float));
cudaStat4 = cudaMalloc((void**)&dev_X, numLines*sizeof(float));
cudaStat5 = cudaMalloc((void**)&dev_B, numLines*sizeof(float));
// Output: full numberOfBuses x numberOfBuses admittance matrix.
// NOTE(review): the kernel reads y[index] via cuCaddf, so dev_y must be
// zero-filled (e.g. cudaMemset) before launch -- not shown here; confirm
// it happens in the omitted code.
cudaStat6 = cudaMalloc((void**)&dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex));
// Per-bus arrays for a later kernel, allocated up front.
cudaStat7 = cudaMalloc((void**)&dev_Pd, numberOfBuses*sizeof(float));
cudaStat8 = cudaMalloc((void**)&dev_Qd, numberOfBuses*sizeof(float));
cudaStat9 = cudaMalloc((void**)&dev_Vmag, numberOfBuses*sizeof(float));
cudaStat10 = cudaMalloc((void**)&dev_theta, numberOfBuses*sizeof(float));
cudaStat11 = cudaMalloc((void**)&dev_Peq, numberOfBuses*sizeof(float));
cudaStat12 = cudaMalloc((void**)&dev_Qeq, numberOfBuses*sizeof(float));
cudaStat13 = cudaMalloc((void**)&dev_Peq1, numberOfBuses*sizeof(float));
cudaStat14 = cudaMalloc((void**)&dev_Qeq1, numberOfBuses*sizeof(float));
...
...
// Jacobian-related buffers for the second kernel.
cudaStat15 = cudaMalloc((void**)&dev_powerMismatch, jacSize*sizeof(float));
cudaStat16 = cudaMalloc((void**)&dev_jacobian, jacSize*jacSize*sizeof(float));
cudaStat17 = cudaMalloc((void**)&dev_stateVector, jacSize*sizeof(float));
cudaStat18 = cudaMalloc((void**)&dev_PQindex, jacSize*sizeof(int));
dim3 dimBlock(16, 16); //number of threads per block (16x16 = 256)
dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16); //number of blocks (ceiling division)
//launch kernel once data has been copied to GPU
createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);
// NOTE(review): no cudaGetLastError()/cudaDeviceSynchronize() after the
// launch, so any kernel fault (including a WDDM TDR reset) is first
// reported by the cudaMemcpy below as "unspecified launch failure".
//copy results back to CPU
// NOTE(review): cudaStat6 is reused here, overwriting the dev_y cudaMalloc
// status -- check that status before this point.
cudaStat6 = cudaMemcpy(y_bus, dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex), cudaMemcpyDeviceToHost);
if (cudaStat6 != cudaSuccess) {
cout<<"Device memcpy failed"<<endl;
cout<<cudaGetErrorString(cudaStat6)<<endl;
return 1;
}
我知道我的内核不是最高效的,因为有很多全局内存访问。为什么这个较大的矩阵的内核会崩溃?是否有我丢失的越界数组访问?非常感谢您的帮助。我试图用下面的完整示例重现您的代码。代码编译、运行时没有错误
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cuComplex.h"
// Builds the Y-bus (admittance) matrix: one thread per matrix element.
// Expects a 2D grid of 2D blocks covering at least
// numberOfBuses x numberOfBuses threads; out-of-range threads do nothing.
// R/X/B/fromBus/toBus each hold numberOfBranches entries; y holds
// numberOfBuses*numberOfBuses entries and should be initialized by the
// caller, since diagonal terms are accumulated onto it with cuCaddf.
// (This is the answerer's byte-for-byte reproduction of the asker's kernel.)
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
// Global 2D thread coordinates -> (row, col) of the owned element,
// flattened row-major into index.
int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
int index = rowIdx*numberOfBuses + colIdx;
if (rowIdx<numberOfBuses && colIdx<numberOfBuses)
{
// Every in-range thread scans all branches; branch i contributes only to
// the four elements (from,from), (to,to), (from,to), (to,from).
for (int i=0; i<numberOfBranches; ++i)
{
if (rowIdx==fromBus[i] && colIdx==fromBus[i]) { //diagonal element: add series admittance plus half the line charging B
y[index] = cuCaddf(y[index], make_cuComplex((R[i]/((R[i]*R[i])+(X[i]*X[i]))), (-(X[i]/((R[i]*R[i])+(X[i]*X[i])))+ (B[i]/2))));
}
if (rowIdx==toBus[i] && colIdx==toBus[i]) { //diagonal element: same contribution on the to-bus side
y[index] = cuCaddf(y[index], make_cuComplex((R[i]/((R[i]*R[i])+(X[i]*X[i]))), (-(X[i]/((R[i]*R[i])+(X[i]*X[i])))+ (B[i]/2))));
}
if (rowIdx==fromBus[i] && colIdx==toBus[i]) { //off-diagonal element: negated series admittance (overwrites, not accumulates)
y[index] = make_cuComplex(-(R[i]/((R[i]*R[i])+(X[i]*X[i]))), X[i]/((R[i]*R[i])+(X[i]*X[i])));
}
if (rowIdx==toBus[i] && colIdx==fromBus[i]) { //off-diagonal element: symmetric entry
y[index] = make_cuComplex(-(R[i]/((R[i]*R[i])+(X[i]*X[i]))), X[i]/((R[i]*R[i])+(X[i]*X[i])));
}
}
}
}
/*
 * Minimal repro for the "unspecified launch failure" question: fills a
 * numberOfBuses x numberOfBuses Y-bus matrix on the device and copies it
 * back.  Improvements over the original repro: every CUDA API call is
 * checked, device buffers are zero-initialized before the kernel reads
 * them, launch errors are surfaced immediately via cudaGetLastError /
 * cudaDeviceSynchronize, and all allocations are released.
 */
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            printf ("failure : (%d) - %s\n", err_, ::cudaGetErrorString(err_)) ; \
            return 1;                                                         \
        }                                                                     \
    } while (0)

int main ()
{
    int numLines = 32 ;          // number of branches
    int numberOfBuses = 2383 ;   // the size that triggered the crash (TDR)

    int* dev_fromBus, *dev_toBus;
    float *dev_R, *dev_X, *dev_B;
    cuComplex* dev_y ;

    // Per-branch inputs and the full output matrix.
    CUDA_CHECK(cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_toBus, numLines*sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_R, numLines*sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_X, numLines*sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_B, numLines*sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex)));

    // Zero-fill: the kernel reads y[index] (via cuCaddf) and every input
    // array, so leaving them uninitialized (as the original repro did)
    // feeds indeterminate device memory into the computation.
    CUDA_CHECK(cudaMemset(dev_fromBus, 0, numLines*sizeof(int)));
    CUDA_CHECK(cudaMemset(dev_toBus, 0, numLines*sizeof(int)));
    CUDA_CHECK(cudaMemset(dev_R, 0, numLines*sizeof(float)));
    CUDA_CHECK(cudaMemset(dev_X, 0, numLines*sizeof(float)));
    CUDA_CHECK(cudaMemset(dev_B, 0, numLines*sizeof(float)));
    CUDA_CHECK(cudaMemset(dev_y, 0, numberOfBuses*numberOfBuses*sizeof(cuComplex)));

    dim3 dimBlock(16, 16); //number of threads per block
    dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16); //number of blocks (ceiling division)

    //launch kernel once data has been copied to GPU
    createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // async kernel faults (incl. a WDDM TDR reset)

    cuComplex* y_bus = new cuComplex[numberOfBuses*numberOfBuses] ;
    //copy results back to CPU
    CUDA_CHECK(cudaMemcpy(y_bus, dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex), cudaMemcpyDeviceToHost));

    // Release everything the original repro leaked.
    delete[] y_bus;
    cudaFree(dev_fromBus);
    cudaFree(dev_toBus);
    cudaFree(dev_R);
    cudaFree(dev_X);
    cudaFree(dev_B);
    cudaFree(dev_y);
    return 0 ;
}
（此处原文是上面那段完整示例代码经机器翻译后产生的乱码重复——"#包括""__全局无效创建总线"等均为 #include、__global__ void createYBus 的误译；正确代码见上文以 #include "cuda_runtime.h" 开头的完整示例。）
这（增大 TDR 延迟）解决了问题。结果显示 WDDM TDR（Timeout Detection and Recovery，超时检测与恢复）已启用，延迟设置为 2 秒。这意味着只要内核执行超过 2 秒，驱动程序就会重置并恢复——该机制是为图形和渲染（GPU 的常规用途）设计的。在本例这种计算场景下，必须禁用 TDR 或增大延迟。将延迟增大到 10 秒后，"未指定的启动失败"崩溃错误不再出现，内核照常执行完毕。
TDR 延迟（以及启用/禁用）可以通过 Nsight Monitor 中的 Nsight 选项设置，也可以通过注册表项 HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers 下的 DWORD 值 TdrDelay 和 TdrLevel 设置。——可能是内存分配大小与参数不匹配，能否把分配调用也贴出来？——你确定内核中 "int index" 的取值是正确的吗？它不应该取决于块的尺寸吗？——@FlorentDUGUET 我已编辑帖子加入了分配调用。我知道全局内存很大，但有没有可能是我分配了太多内存？所有 cudaMalloc() 调用都没有返回错误。——@Taro 我启动的是由 2D 块组成的 2D 网格。先在 x 和 y 方向分别计算全局线程 ID，再用全局线程 ID 公式组合成 index，以展平形式索引整个矩阵。——我很好奇为什么被投反对票，我乐意改进答案。这个例子对较小的规模同样有效。numLines 变量来自存储数据的文本文件的行数，行数为 2896。文本文件的每一列各存为一个数组（fromBus、toBus、R、X、B），然后复制到对应的设备端数组（前缀为 "dev"）。另外，cuComplex 是工具包自带头文件 "cuComplex.h" 中的 CUDA 数据类型，没有必要贴出。我会再试一次 Nsight，谢谢。（不确定反对票的原因。）——不少人认为"在我机器上能跑"式的答案质量不高，几年前我刚开始答题时就被这样提醒过。这当然是见仁见智的问题，但可能是反对票的一个原因。另外，社区普遍认为：请求调试帮助却未提供 MCVE 的问题应当投票关闭——这在 SO 帮助页面上写得很清楚。回答这类问题（或试图回答）也可能被一些人视为不妥；相反，鼓励的做法是在问题评论中说明应提供 MCVE，并通过外部链接（如 pastebin）提供代码或完整可运行的示例。留意这里的问题就能看到这类做法的例子。无论如何，我不想争论；意见不一致也没关系，这正是社区的本质。——感谢 @RobertCrovella 的解释。原问题是：为什么这个更大的矩阵会让内核崩溃？是否存在我遗漏的越界数组访问？我在回答中想表达的是：问题里给出的这些代码并未显示任何问题，问题出在别处；给出完整示例代码正是为了说明这一点。我不知道如何投票关闭这个问题，但既然 danieljovan 既没有提供 MCVE、也没有将该回答标记为有效，我会保留这个回答。再次感谢 Robert。——感谢分享，你是英雄！这正是我的问题所在，非常感谢。我很好奇为什么通过 Google 找到这个答案如此困难？这基本适用于所有 CUDA 使用场景——尤其是 MATLAB。刚刚修复了困扰我好几天的问题！——我已经为 TdrDelay 创建了一个 DWORD 注册表项，还需要为 TdrLevel 再创建一个吗？它的值应该设为多少？
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cuComplex.h"
// Assembles the bus admittance (Y-bus) matrix from branch data.
// One thread owns one matrix element; launch with a 2D grid of 2D blocks
// spanning at least numberOfBuses x numberOfBuses threads.  Diagonal
// contributions accumulate onto y, off-diagonal entries overwrite it.
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside the matrix have nothing to do.
    if (row >= numberOfBuses || col >= numberOfBuses)
        return;

    int idx = row * numberOfBuses + col;   // row-major flattened position

    for (int k = 0; k < numberOfBranches; ++k)
    {
        int f = fromBus[k];
        int t = toBus[k];

        // Series impedance magnitude squared, and the conductance /
        // susceptance of branch k derived from it.
        float denom = (R[k] * R[k]) + (X[k] * X[k]);
        float g = R[k] / denom;
        float b = X[k] / denom;

        if (row == f && col == f) {   // diagonal entry, from-bus side
            y[idx] = cuCaddf(y[idx], make_cuComplex(g, -b + (B[k] / 2)));
        }
        if (row == t && col == t) {   // diagonal entry, to-bus side
            y[idx] = cuCaddf(y[idx], make_cuComplex(g, -b + (B[k] / 2)));
        }
        if (row == f && col == t) {   // off-diagonal entry
            y[idx] = make_cuComplex(-g, b);
        }
        if (row == t && col == f) {   // symmetric off-diagonal entry
            y[idx] = make_cuComplex(-g, b);
        }
    }
}
/*
 * Answerer's repro main: allocates device buffers sized for the failing
 * 2383x2383 case, launches createYBus, and copies the result back.
 * NOTE(review): device buffers are never initialized -- the kernel reads
 * y[] (via cuCaddf) and all input arrays, so the result values are
 * indeterminate; tolerable only because this repro checks for the crash,
 * not for numerical output.
 * NOTE(review): there is no cudaGetLastError()/cudaDeviceSynchronize()
 * after the launch, so a kernel fault surfaces only at the final
 * cudaMemcpy.
 * NOTE(review): the dev_* allocations and y_bus are never freed.
 */
int main ()
{
int numLines = 32 ; // number of branches (placeholder value)
int numberOfBuses = 2383 ; // the size reported to crash
int* dev_fromBus, *dev_toBus;
float *dev_R, *dev_X, *dev_B;
cuComplex* dev_y ;
// Per-branch inputs (allocation results unchecked in this repro).
cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int));
cudaMalloc((void**)&dev_toBus, numLines*sizeof(int));
cudaMalloc((void**)&dev_R, numLines*sizeof(float));
cudaMalloc((void**)&dev_X, numLines*sizeof(float));
cudaMalloc((void**)&dev_B, numLines*sizeof(float));
// Full output matrix: numberOfBuses^2 complex entries (~45 MB here).
cudaMalloc((void**)&dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex));
dim3 dimBlock(16, 16); //number of threads per block (16x16)
dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16); //number of blocks (ceiling division)
//launch kernel once data has been copied to GPU
createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);
cuComplex* y_bus = new cuComplex[numberOfBuses*numberOfBuses] ;
//copy results back to CPU
// This blocking copy is where an earlier kernel fault (e.g. a WDDM TDR
// reset) is first reported as "unspecified launch failure".
cudaError_t cudaStat6 = cudaMemcpy(y_bus, dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex), cudaMemcpyDeviceToHost);
if (cudaStat6 != cudaSuccess) {
printf ("failure : (%d) - %s\n", cudaStat6, ::cudaGetErrorString(cudaStat6)) ;
return 1;
}
return 0 ;
}