为什么我的CUDA内核会因不同的数据集大小而崩溃(未指定的启动失败)?

为什么我的CUDA内核会因不同的数据集大小而崩溃(未指定的启动失败)?,cuda,Cuda,我有一个内核来计算矩阵的不同元素,基于它们的位置(对角线或非对角线)。计算大小矩阵时,内核按预期工作: 14 x 14(我知道这很小,并且没有正确使用GPU资源,但这纯粹是为了测试目的,以确保结果正确) 118 x 118,以及 300 x 300 然而,当我试图计算一个大小为2383x2383的矩阵时,内核崩溃了。具体来说,cudaMemcpy()行上会抛出错误“Unspecified launch failure”,以将结果从设备返回到主机。通过研究,我了解到这种错误通常发生在越界内存访

我有一个内核,它根据矩阵元素的位置(对角线或非对角线)来计算矩阵的各个元素。在计算以下大小的矩阵时,内核按预期工作:

  • 14 x 14(我知道这很小,并且没有正确使用GPU资源,但这纯粹是为了测试目的,以确保结果正确)
  • 118 x 118,以及
  • 300 x 300
然而,当我试图计算一个大小为2383 x 2383的矩阵时,内核崩溃了。具体来说,在将结果从设备复制回主机的cudaMemcpy()这一行上会抛出错误“Unspecified launch failure”。通过研究,我了解到这种错误通常发生在越界内存访问的情况下(例如,在数组中),但是,我不明白为什么它在前面三种情况下可以正常工作,而在2383 x 2383的情况下却不行。内核代码如下所示:

/**
 * Fills the flattened numberOfBuses x numberOfBuses matrix `y` (row-major,
 * index = row * numberOfBuses + col), one thread per matrix element.
 *
 * Launch layout: a 2D grid of 2D blocks that covers at least
 * numberOfBuses threads in both x (columns) and y (rows). Threads that fall
 * outside the matrix exit immediately.
 *
 * For every branch i (0 <= i < numberOfBranches), with
 * d = R[i]^2 + X[i]^2:
 *   - diagonal element of fromBus[i] and of toBus[i]:
 *       y += ( R[i]/d , -X[i]/d + B[i]/2 )
 *   - off-diagonal elements (fromBus[i], toBus[i]) and (toBus[i], fromBus[i]):
 *       y  = ( -R[i]/d , X[i]/d )
 *
 * Preconditions visible from the code:
 *   - R, X, B, fromBus, toBus hold numberOfBranches elements each.
 *   - fromBus/toBus values are compared directly against the zero-based
 *     row/column index.
 *   - Diagonal entries are accumulated onto the value already stored in `y`,
 *     so `y` must be initialized (e.g. zeroed) before the launch.
 *
 * Every thread scans all branches, so the run time grows with
 * numberOfBuses^2 * numberOfBranches. To keep that loop as cheap as possible
 * each branch value is loaded from global memory once, the shared terms are
 * computed once, and the result is kept in a register and stored once.
 */
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    const int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    const int colIdx = blockIdx.x*blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks; skip threads outside the matrix.
    if (rowIdx >= numberOfBuses || colIdx >= numberOfBuses) {
        return;
    }

    const int index = rowIdx*numberOfBuses + colIdx;
    const bool onDiagonal = (rowIdx == colIdx);

    // Only diagonal elements accumulate onto the previous contents of y;
    // off-diagonal elements are only ever overwritten, so no load is needed.
    cuComplex acc = onDiagonal ? y[index] : make_cuComplex(0.0f, 0.0f);
    bool touched = false;

    for (int i = 0; i < numberOfBranches; ++i)
    {
        const int from = fromBus[i];
        const int to = toBus[i];

        const bool diagFrom = onDiagonal && (rowIdx == from);
        const bool diagTo = onDiagonal && (rowIdx == to);
        const bool offDiag = (rowIdx == from && colIdx == to) ||
                             (rowIdx == to && colIdx == from);

        // Most branches do not involve this element: skip before touching R/X/B.
        if (!(diagFrom || diagTo || offDiag)) {
            continue;
        }

        const float r = R[i];
        const float x = X[i];
        const float denom = (r*r) + (x*x);
        const float rOverDenom = r/denom;
        const float xOverDenom = x/denom;

        if (diagFrom || diagTo) {
            const cuComplex selfTerm = make_cuComplex(rOverDenom, (-xOverDenom) + (B[i]/2));
            if (diagFrom) {
                acc = cuCaddf(acc, selfTerm);
            }
            if (diagTo) {
                acc = cuCaddf(acc, selfTerm);
            }
        }
        // Applied after the diagonal updates, matching the original statement order.
        if (offDiag) {
            acc = make_cuComplex(-rOverDenom, xOverDenom);
        }
        touched = true;
    }

    // Elements that no branch refers to are left exactly as they were.
    if (touched) {
        y[index] = acc;
    }
}
其中cudaStatN为cudaError_t类型,用于捕获错误。最后四次分配是在代码后面完成的,并且是针对另一个内核的。不过,这些分配都是在调用相关内核之前完成的。

启动参数如下:

// Device allocations. Each cudaMalloc status is stored in its own cudaStatN.
// NOTE(review): none of these statuses is checked in this excerpt — confirm
// they are tested before the buffers are used.

// Per-branch inputs: numLines elements each. numLines is what the createYBus
// launch passes as numberOfBranches.
cudaStat1 = cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int));
cudaStat2 = cudaMalloc((void**)&dev_toBus, numLines*sizeof(int));
cudaStat3 = cudaMalloc((void**)&dev_R, numLines*sizeof(float));
cudaStat4 = cudaMalloc((void**)&dev_X, numLines*sizeof(float));
cudaStat5 = cudaMalloc((void**)&dev_B, numLines*sizeof(float));
// Kernel output: numberOfBuses x numberOfBuses cuComplex values, flattened.
// NOTE(review): cudaMalloc does not clear memory and createYBus adds onto the
// existing diagonal values of y — confirm dev_y is zeroed before the launch.
cudaStat6 = cudaMalloc((void**)&dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex));
// Per-bus float buffers (numberOfBuses elements each); they are not arguments
// of the createYBus launch.
cudaStat7 = cudaMalloc((void**)&dev_Pd, numberOfBuses*sizeof(float));
cudaStat8 = cudaMalloc((void**)&dev_Qd, numberOfBuses*sizeof(float));
cudaStat9 = cudaMalloc((void**)&dev_Vmag, numberOfBuses*sizeof(float));
cudaStat10 = cudaMalloc((void**)&dev_theta, numberOfBuses*sizeof(float));
cudaStat11 = cudaMalloc((void**)&dev_Peq, numberOfBuses*sizeof(float));
cudaStat12 = cudaMalloc((void**)&dev_Qeq, numberOfBuses*sizeof(float));
cudaStat13 = cudaMalloc((void**)&dev_Peq1, numberOfBuses*sizeof(float));
cudaStat14 = cudaMalloc((void**)&dev_Qeq1, numberOfBuses*sizeof(float));
...
...
// Buffers sized from jacSize. According to the accompanying text these four
// are allocated later in the program and belong to a different kernel.
cudaStat15 = cudaMalloc((void**)&dev_powerMismatch, jacSize*sizeof(float));
cudaStat16 = cudaMalloc((void**)&dev_jacobian, jacSize*jacSize*sizeof(float));
cudaStat17 = cudaMalloc((void**)&dev_stateVector, jacSize*sizeof(float));
cudaStat18 = cudaMalloc((void**)&dev_PQindex, jacSize*sizeof(int));
dim3 dimBlock(16, 16); // threads per block: 16 x 16 = 256
dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16);  // blocks: ceil(numberOfBuses/16) per dimension

// launch kernel once data has been copied to GPU
// NOTE(review): createYBus accumulates onto the diagonal of dev_y — confirm
// dev_y has been zeroed (e.g. cudaMemset) before this point.
createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);

// A kernel launch returns no status of its own. An invalid launch
// configuration is reported by cudaGetLastError().
cudaStat6 = cudaGetLastError();
if (cudaStat6 != cudaSuccess) {
    cout<<"Kernel launch failed"<<endl;
    cout<<cudaGetErrorString(cudaStat6)<<endl;
    return 1;
}

// The kernel runs asynchronously. Wait for it here so that a fault during
// execution (out-of-bounds access, or the display watchdog aborting a
// long-running kernel) is reported as a kernel failure rather than being
// attributed to the cudaMemcpy below.
cudaStat6 = cudaDeviceSynchronize();
if (cudaStat6 != cudaSuccess) {
    cout<<"Kernel execution failed"<<endl;
    cout<<cudaGetErrorString(cudaStat6)<<endl;
    return 1;
}

// copy results back to CPU
cudaStat6 = cudaMemcpy(y_bus, dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex), cudaMemcpyDeviceToHost);
if (cudaStat6 != cudaSuccess) {
    cout<<"Device memcpy failed"<<endl;
    cout<<cudaGetErrorString(cudaStat6)<<endl;
    return 1;
}

我知道我的内核不是最高效的,因为有很多全局内存访问。为什么对于这个较大的矩阵,内核会崩溃?是否存在我遗漏的越界数组访问?非常感谢您的帮助。

我试图用下面的完整示例重现您的代码。代码编译、运行时没有错误

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include "cuComplex.h"

/**
 * Fills the flattened numberOfBuses x numberOfBuses matrix `y` (row-major,
 * index = row * numberOfBuses + col), one thread per matrix element.
 *
 * Launch layout: a 2D grid of 2D blocks that covers at least
 * numberOfBuses threads in both x (columns) and y (rows). Threads that fall
 * outside the matrix exit immediately.
 *
 * For every branch i (0 <= i < numberOfBranches), with
 * d = R[i]^2 + X[i]^2:
 *   - diagonal element of fromBus[i] and of toBus[i]:
 *       y += ( R[i]/d , -X[i]/d + B[i]/2 )
 *   - off-diagonal elements (fromBus[i], toBus[i]) and (toBus[i], fromBus[i]):
 *       y  = ( -R[i]/d , X[i]/d )
 *
 * Preconditions visible from the code:
 *   - R, X, B, fromBus, toBus hold numberOfBranches elements each.
 *   - fromBus/toBus values are compared directly against the zero-based
 *     row/column index.
 *   - Diagonal entries are accumulated onto the value already stored in `y`,
 *     so `y` must be initialized (e.g. zeroed) before the launch.
 *
 * Every thread scans all branches, so the run time grows with
 * numberOfBuses^2 * numberOfBranches. To keep that loop as cheap as possible
 * each branch value is loaded from global memory once, the shared terms are
 * computed once, and the result is kept in a register and stored once.
 */
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    const int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    const int colIdx = blockIdx.x*blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks; skip threads outside the matrix.
    if (rowIdx >= numberOfBuses || colIdx >= numberOfBuses) {
        return;
    }

    const int index = rowIdx*numberOfBuses + colIdx;
    const bool onDiagonal = (rowIdx == colIdx);

    // Only diagonal elements accumulate onto the previous contents of y;
    // off-diagonal elements are only ever overwritten, so no load is needed.
    cuComplex acc = onDiagonal ? y[index] : make_cuComplex(0.0f, 0.0f);
    bool touched = false;

    for (int i = 0; i < numberOfBranches; ++i)
    {
        const int from = fromBus[i];
        const int to = toBus[i];

        const bool diagFrom = onDiagonal && (rowIdx == from);
        const bool diagTo = onDiagonal && (rowIdx == to);
        const bool offDiag = (rowIdx == from && colIdx == to) ||
                             (rowIdx == to && colIdx == from);

        // Most branches do not involve this element: skip before touching R/X/B.
        if (!(diagFrom || diagTo || offDiag)) {
            continue;
        }

        const float r = R[i];
        const float x = X[i];
        const float denom = (r*r) + (x*x);
        const float rOverDenom = r/denom;
        const float xOverDenom = x/denom;

        if (diagFrom || diagTo) {
            const cuComplex selfTerm = make_cuComplex(rOverDenom, (-xOverDenom) + (B[i]/2));
            if (diagFrom) {
                acc = cuCaddf(acc, selfTerm);
            }
            if (diagTo) {
                acc = cuCaddf(acc, selfTerm);
            }
        }
        // Applied after the diagonal updates, matching the original statement order.
        if (offDiag) {
            acc = make_cuComplex(-rOverDenom, xOverDenom);
        }
        touched = true;
    }

    // Elements that no branch refers to are left exactly as they were.
    if (touched) {
        y[index] = acc;
    }
}


// Returns true when `status` is cudaSuccess. Otherwise names the failing step
// on stderr, prints the CUDA error on stdout and returns false.
static bool cudaSucceeded (cudaError_t status, const char *step)
{
    if (status == cudaSuccess) {
        return true ;
    }
    fprintf (stderr, "%s failed\n", step) ;
    printf ("failure : (%d) - %s\n", (int)status, ::cudaGetErrorString(status)) ;
    return false ;
}

// Stand-alone reproduction: runs createYBus on a 2383 x 2383 matrix with a
// small synthetic branch list and copies the result back to the host.
// Returns 0 when every CUDA call succeeds and 1 otherwise.
int main ()
{
    const int numLines = 32 ;
    const int numberOfBuses = 2383 ;

    // Compute sizes in size_t so the element count cannot overflow int.
    const size_t yCount = (size_t)numberOfBuses * (size_t)numberOfBuses ;
    const size_t yBytes = yCount * sizeof(cuComplex) ;

    // Synthetic inputs: branch i connects bus i to bus i+1. The kernel
    // compares these values with row/column indices, so they must be defined
    // and lie inside [0, numberOfBuses).
    int host_fromBus[numLines], host_toBus[numLines] ;
    float host_R[numLines], host_X[numLines], host_B[numLines] ;
    for (int i = 0 ; i < numLines ; ++i) {
        host_fromBus[i] = i ;
        host_toBus[i] = i + 1 ;
        host_R[i] = 0.01f ;
        host_X[i] = 0.1f ;
        host_B[i] = 0.02f ;
    }

    // Pointers start as NULL so the cleanup below is valid on every path
    // (cudaFree and delete[] both accept a null pointer).
    int *dev_fromBus = NULL, *dev_toBus = NULL ;
    float *dev_R = NULL, *dev_X = NULL, *dev_B = NULL ;
    cuComplex *dev_y = NULL ;
    cuComplex *y_bus = NULL ;

    // `ok && ...` short-circuits: after the first failure no further CUDA
    // call is attempted.
    bool ok = true ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int)), "cudaMalloc(dev_fromBus)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_toBus, numLines*sizeof(int)), "cudaMalloc(dev_toBus)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_R, numLines*sizeof(float)), "cudaMalloc(dev_R)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_X, numLines*sizeof(float)), "cudaMalloc(dev_X)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_B, numLines*sizeof(float)), "cudaMalloc(dev_B)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_y, yBytes), "cudaMalloc(dev_y)") ;

    ok = ok && cudaSucceeded (cudaMemcpy(dev_fromBus, host_fromBus, numLines*sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_fromBus)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_toBus, host_toBus, numLines*sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_toBus)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_R, host_R, numLines*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(dev_R)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_X, host_X, numLines*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(dev_X)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_B, host_B, numLines*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(dev_B)") ;

    // The kernel adds onto the diagonal of y, so start from zero
    // (all-zero bytes represent 0.0f for both components).
    ok = ok && cudaSucceeded (cudaMemset(dev_y, 0, yBytes), "cudaMemset(dev_y)") ;

    if (ok) {
        dim3 dimBlock(16, 16); //number of threads
        dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16);  //number of blocks

        //launch kernel once data has been copied to GPU
        createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);

        // Launch-configuration errors are reported by cudaGetLastError().
        // Faults during execution only surface at the next synchronizing
        // call, so synchronize here to attribute them to the kernel.
        ok = cudaSucceeded (cudaGetLastError(), "createYBus launch") ;
        ok = ok && cudaSucceeded (cudaDeviceSynchronize(), "createYBus execution") ;
    }

    if (ok) {
        y_bus = new cuComplex[yCount] ;

        //copy results back to CPU
        ok = cudaSucceeded (cudaMemcpy(y_bus, dev_y, yBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y_bus)") ;
    }

    // Release everything on both the success and the failure path.
    delete[] y_bus ;
    cudaFree(dev_y) ;
    cudaFree(dev_B) ;
    cudaFree(dev_X) ;
    cudaFree(dev_R) ;
    cudaFree(dev_toBus) ;
    cudaFree(dev_fromBus) ;

    return ok ? 0 : 1 ;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cuComplex.h"
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int index = rowIdx*numberOfBuses + colIdx;

    if (rowIdx<numberOfBuses && colIdx<numberOfBuses) // (此处的代码片段在抓取时被截断)

问题解决了。结果发现系统启用了WDDM TDR(超时检测与恢复),并且延迟设置为2秒。这意味着如果内核执行时间超过2秒,驱动程序就会被重置并恢复。这一机制适用于图形和渲染(即GPU的一般用途)。但是,在这种情况下,必须禁用TDR或者增大延迟。将延迟增加到10秒之后,“未指定的启动失败”(unspecified launch failure)错误不再出现,内核也像以前一样继续执行。


TDR延迟(以及启用/禁用)可以通过Nsight Monitor中的Nsight选项进行设置,也可以通过注册表(HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers)中的DWORD值TdrDelay和TdrLevel进行设置。

可能是内存分配大小与参数不匹配。是否也可以共享分配调用?是否确定“int index”的值,在你的内核中,是否正确?它不应该取决于块的尺寸吗?@FlorentDUGUET我编辑了这篇文章以包含分配调用。我知道全局内存很大,但有可能是我分配了太多内存吗?Cudamaloc()没有任何错误调用。@Taro我正在启动一个包含2D块的2D网格。全局线程ID在x和y方向上计算,然后索引结合全局线程ID公式,以平展形式索引整个矩阵。我很好奇为什么要向下投票。我很乐意改进答案。这将与较小的情况一样有效。The numLines变量来自存储数据的文本文件中的行数。行数为2896。文本文件中的每列存储为一个数组(fromBus、toBus、R、X、B),然后复制到其分配的设备对应项(前缀为“dev”)。此外,cuComplex是标头中的CUDA数据类型“CuFix.h”附带工具包,所以没有必要执行。我将再次尝试NVISE,谢谢。(不确定下注)。不同的人认为是“为我工作”。"答案很差。几年前我第一次开始回答问题时就被告知了这一点。当然,这是一个意见问题,但这可能是否决票的一个原因。然而,人们普遍认为,要求调试帮助但未能提供MCVE的问题应该投票了结——这显然是错误的在SO帮助页面上显示。回答此类问题或试图这样做,也可能被一些人认为是不好的形式。相反,鼓励发布MCVE。如果您希望完成您在回答中所做的工作,一种可能的替代方法是在问题的注释中添加足够的描述性文本,建议应提供MCVEided,并在外部链接(如pastebin)中提供代码或完全工作的示例。如果您仔细阅读此处的问题,您会发现此类行为的示例。无论如何,我不想争论。如果我们不同意,也没关系。这是社区的本质。感谢@RobertCrovella的解释。问题是:为什么内核会因为t而崩溃他的更大的矩阵?是否有我缺少的越界数组访问?我试图在回答中表达的是,答案中提供的元素没有显示任何问题,而问题在其他地方。提供函数代码示例是为了说明我的说法。我不知道如何投票支持结束这个问题,但我会得到答案r如果被提议为danieljovan,则既没有提供MCVE,也没有将该答案标记为有效。再次感谢Robert。感谢分享,你是一个英雄!这正是我的问题,非常感谢。我想知道为什么通过Google找到这个答案如此困难?这通常适用于所有CUDA用例-特别是MATLAB。刚刚修复了我遇到的一个问题已经有好几天了!我已经为Tdrdelay创建了一个DWORD注册表项。我应该为Tdrlevel创建另一个注册表项吗?的值应该是多少
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include "cuComplex.h"

/**
 * Fills the flattened numberOfBuses x numberOfBuses matrix `y` (row-major,
 * index = row * numberOfBuses + col), one thread per matrix element.
 *
 * Launch layout: a 2D grid of 2D blocks that covers at least
 * numberOfBuses threads in both x (columns) and y (rows). Threads that fall
 * outside the matrix exit immediately.
 *
 * For every branch i (0 <= i < numberOfBranches), with
 * d = R[i]^2 + X[i]^2:
 *   - diagonal element of fromBus[i] and of toBus[i]:
 *       y += ( R[i]/d , -X[i]/d + B[i]/2 )
 *   - off-diagonal elements (fromBus[i], toBus[i]) and (toBus[i], fromBus[i]):
 *       y  = ( -R[i]/d , X[i]/d )
 *
 * Preconditions visible from the code:
 *   - R, X, B, fromBus, toBus hold numberOfBranches elements each.
 *   - fromBus/toBus values are compared directly against the zero-based
 *     row/column index.
 *   - Diagonal entries are accumulated onto the value already stored in `y`,
 *     so `y` must be initialized (e.g. zeroed) before the launch.
 *
 * Every thread scans all branches, so the run time grows with
 * numberOfBuses^2 * numberOfBranches. To keep that loop as cheap as possible
 * each branch value is loaded from global memory once, the shared terms are
 * computed once, and the result is kept in a register and stored once.
 */
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    const int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    const int colIdx = blockIdx.x*blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks; skip threads outside the matrix.
    if (rowIdx >= numberOfBuses || colIdx >= numberOfBuses) {
        return;
    }

    const int index = rowIdx*numberOfBuses + colIdx;
    const bool onDiagonal = (rowIdx == colIdx);

    // Only diagonal elements accumulate onto the previous contents of y;
    // off-diagonal elements are only ever overwritten, so no load is needed.
    cuComplex acc = onDiagonal ? y[index] : make_cuComplex(0.0f, 0.0f);
    bool touched = false;

    for (int i = 0; i < numberOfBranches; ++i)
    {
        const int from = fromBus[i];
        const int to = toBus[i];

        const bool diagFrom = onDiagonal && (rowIdx == from);
        const bool diagTo = onDiagonal && (rowIdx == to);
        const bool offDiag = (rowIdx == from && colIdx == to) ||
                             (rowIdx == to && colIdx == from);

        // Most branches do not involve this element: skip before touching R/X/B.
        if (!(diagFrom || diagTo || offDiag)) {
            continue;
        }

        const float r = R[i];
        const float x = X[i];
        const float denom = (r*r) + (x*x);
        const float rOverDenom = r/denom;
        const float xOverDenom = x/denom;

        if (diagFrom || diagTo) {
            const cuComplex selfTerm = make_cuComplex(rOverDenom, (-xOverDenom) + (B[i]/2));
            if (diagFrom) {
                acc = cuCaddf(acc, selfTerm);
            }
            if (diagTo) {
                acc = cuCaddf(acc, selfTerm);
            }
        }
        // Applied after the diagonal updates, matching the original statement order.
        if (offDiag) {
            acc = make_cuComplex(-rOverDenom, xOverDenom);
        }
        touched = true;
    }

    // Elements that no branch refers to are left exactly as they were.
    if (touched) {
        y[index] = acc;
    }
}


// Returns true when `status` is cudaSuccess. Otherwise names the failing step
// on stderr, prints the CUDA error on stdout and returns false.
static bool cudaSucceeded (cudaError_t status, const char *step)
{
    if (status == cudaSuccess) {
        return true ;
    }
    fprintf (stderr, "%s failed\n", step) ;
    printf ("failure : (%d) - %s\n", (int)status, ::cudaGetErrorString(status)) ;
    return false ;
}

// Stand-alone reproduction: runs createYBus on a 2383 x 2383 matrix with a
// small synthetic branch list and copies the result back to the host.
// Returns 0 when every CUDA call succeeds and 1 otherwise.
int main ()
{
    const int numLines = 32 ;
    const int numberOfBuses = 2383 ;

    // Compute sizes in size_t so the element count cannot overflow int.
    const size_t yCount = (size_t)numberOfBuses * (size_t)numberOfBuses ;
    const size_t yBytes = yCount * sizeof(cuComplex) ;

    // Synthetic inputs: branch i connects bus i to bus i+1. The kernel
    // compares these values with row/column indices, so they must be defined
    // and lie inside [0, numberOfBuses).
    int host_fromBus[numLines], host_toBus[numLines] ;
    float host_R[numLines], host_X[numLines], host_B[numLines] ;
    for (int i = 0 ; i < numLines ; ++i) {
        host_fromBus[i] = i ;
        host_toBus[i] = i + 1 ;
        host_R[i] = 0.01f ;
        host_X[i] = 0.1f ;
        host_B[i] = 0.02f ;
    }

    // Pointers start as NULL so the cleanup below is valid on every path
    // (cudaFree and delete[] both accept a null pointer).
    int *dev_fromBus = NULL, *dev_toBus = NULL ;
    float *dev_R = NULL, *dev_X = NULL, *dev_B = NULL ;
    cuComplex *dev_y = NULL ;
    cuComplex *y_bus = NULL ;

    // `ok && ...` short-circuits: after the first failure no further CUDA
    // call is attempted.
    bool ok = true ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int)), "cudaMalloc(dev_fromBus)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_toBus, numLines*sizeof(int)), "cudaMalloc(dev_toBus)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_R, numLines*sizeof(float)), "cudaMalloc(dev_R)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_X, numLines*sizeof(float)), "cudaMalloc(dev_X)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_B, numLines*sizeof(float)), "cudaMalloc(dev_B)") ;
    ok = ok && cudaSucceeded (cudaMalloc((void**)&dev_y, yBytes), "cudaMalloc(dev_y)") ;

    ok = ok && cudaSucceeded (cudaMemcpy(dev_fromBus, host_fromBus, numLines*sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_fromBus)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_toBus, host_toBus, numLines*sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_toBus)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_R, host_R, numLines*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(dev_R)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_X, host_X, numLines*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(dev_X)") ;
    ok = ok && cudaSucceeded (cudaMemcpy(dev_B, host_B, numLines*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(dev_B)") ;

    // The kernel adds onto the diagonal of y, so start from zero
    // (all-zero bytes represent 0.0f for both components).
    ok = ok && cudaSucceeded (cudaMemset(dev_y, 0, yBytes), "cudaMemset(dev_y)") ;

    if (ok) {
        dim3 dimBlock(16, 16); //number of threads
        dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16);  //number of blocks

        //launch kernel once data has been copied to GPU
        createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);

        // Launch-configuration errors are reported by cudaGetLastError().
        // Faults during execution only surface at the next synchronizing
        // call, so synchronize here to attribute them to the kernel.
        ok = cudaSucceeded (cudaGetLastError(), "createYBus launch") ;
        ok = ok && cudaSucceeded (cudaDeviceSynchronize(), "createYBus execution") ;
    }

    if (ok) {
        y_bus = new cuComplex[yCount] ;

        //copy results back to CPU
        ok = cudaSucceeded (cudaMemcpy(y_bus, dev_y, yBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y_bus)") ;
    }

    // Release everything on both the success and the failure path.
    delete[] y_bus ;
    cudaFree(dev_y) ;
    cudaFree(dev_B) ;
    cudaFree(dev_X) ;
    cudaFree(dev_R) ;
    cudaFree(dev_toBus) ;
    cudaFree(dev_fromBus) ;

    return ok ? 0 : 1 ;
}