Warning: file_get_contents(/data/phpspider/zhask/data//catemap/6/cplusplus/132.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ CUDA内核在没有任何原因的情况下使用20k+线程崩溃_C++_Visual Studio_Cuda_Gpu_Nvidia - Fatal编程技术网

C++ CUDA内核在没有任何原因的情况下使用20k+线程崩溃

C++ CUDA内核在没有任何原因的情况下使用20k+线程崩溃,c++,visual-studio,cuda,gpu,nvidia,C++,Visual Studio,Cuda,Gpu,Nvidia,我在一个CUDA项目中工作,但我遇到了一些严重的问题,我找不到解决办法 我实施了该项目,并在我的个人电脑pA使用NVIDIA Quadro K2000m,它的工作。但是当我在一个集群上部署这个项目时,它有一个Nvidia Tesla GPU,而在另一台PC pB Nvidia gtx 960m上,它就不会执行了 有趣的是,当我在pB second PC上使用VisualStudio中的Nsight调试器时,它将执行而不显示错误:未指定的启动失败 这是第一个内核的代码: __global__ vo

我在一个CUDA项目中工作,但我遇到了一些严重的问题,我找不到解决办法

我实施了该项目,并在我的个人电脑pA使用NVIDIA Quadro K2000m,它的工作。但是当我在一个集群上部署这个项目时,它有一个Nvidia Tesla GPU,而在另一台PC pB Nvidia gtx 960m上,它就不会执行了

有趣的是,当我在pB second PC上使用VisualStudio中的Nsight调试器时,它将执行而不显示错误:未指定的启动失败

这是第一个内核的代码:

__global__ void calcKernel(float *dev_calcMatrix,

                        int *documentarray,
                        int *documentTermArray,
                        int *distincttermsarray,
                        int *distinctclassarray,
                        int *startingPointOfClassDoc,
                        int *endingPOintOfClassDoc,
                        int sizeOfDistinctClassarray,
                        int sizeOfTerms)
{

 int index = blockIdx.x * blockDim.x + threadIdx.x;

int term = distincttermsarray[index];

if (index <= sizeOfTerms) {

    for (int i = 0; i < sizeOfDistinctClassarray; i++)
    {
        int save = (index * sizeOfDistinctClassarray) + i;
        bool test = false;
        for (int j = startingPointOfClassDoc[i]; j <= endingPOintOfClassDoc[i]; j++)
        {
            if (term == documentarray[j])
            {
                printf("%i \t", index);
                dev_calcMatrix[save] = dev_calcMatrix[save] + documentTermArray[j];

                //printf("TermArray: documentTermArray[j] %d\n", dev_calcMatrix[save], documentTermArray[j]);

                test = true;
            }
        }

        if (!test) dev_calcMatrix[save] = 0;


    }
}
}
这是我用来创建线程和块的代码:

float blockNotFinal = data.sizeOfDistinctTerms / 1024;
int threads = 0;
int  blocks = (int)floor(blockNotFinal);

dim3 dimGrid((blocks + 1), 1, 1);
if (data.sizeOfDistinctTerms < 1024)
{
    threads = data.sizeOfDistinctTerms;
}
else
{
    threads = 1024;
}
dim3 dimBlock(threads, 1, 1);
所以,我需要创建23652个线程。我现在做的是23652/1024=23.09。在得到23.09的值后,我将其四舍五入为23,并添加+1=24个块。所以我创建了24个块*1024个线程:24576个线程

我知道一些线程将被创建,即使它们不会被使用,这就是为什么我在内核的beggining中添加了这个if语句:

int index = blockIdx.x * blockDim.x + threadIdx.x;

if (index <= sizeOfTerms (23,652 is the size)) { .... }
问题是我在IF语句之前和之后添加了一些PRINTF

在IF语句之前,崩溃前线程的最大索引是:24479 在IF语句中,崩溃前线程的最大索引为:23487

因此,从上面的信息来看,线程的数量并没有达到最大值。另外,在集群上,它给了我另一个错误:遇到非法内存访问。我知道这个错误意味着它可能有一个索引越界,但我给出的数组大小和线程数相等

下面是我在GPU中分配内存的代码:

cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

cout << "\n Allocated GPU buffers";
// Allocate GPU buffers for input and output vectors
cudaStatus = cudaMalloc((void**)&dev_calcMatrix, data.sizeOfDistinctTerms * data.sizeOfDistinctClassarray * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_probMatrix, data.sizeOfDistinctTerms * data.sizeOfDistinctClassarray * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&classSummationTerms, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&documentarray, data.sizeOfTotalTermsDocsFreq * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&documentTermArray, data.sizeOfTotalTermsDocsFreq * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&distincttermsarray, data.sizeOfDistinctTerms * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&distinctclassarray, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&startingPointOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&endingPOintOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cout << "\n Copied input vectors from host to GPU";
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(documentarray, data.documentarray, data.sizeOfTotalTermsDocsFreq * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(documentTermArray, data.documentTermArray, data.sizeOfTotalTermsDocsFreq * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(distincttermsarray, data.distincttermsarray, data.sizeOfDistinctTerms * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(classSummationTerms, data.classSummationTerms, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(distinctclassarray, data.distinctclassarray, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(startingPointOfClassDoc, data.startingPointOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(endingPOintOfClassDoc, data.endingPOintOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}


cout << "\n Now we call the CALCKERNL()";
// Launch a kernel on the GPU with one thread for each element.
calcKernel <<<dimGrid, dimBlock >>>(dev_calcMatrix,
                            documentarray, 
                            documentTermArray, 
                            distincttermsarray, 
                            distinctclassarray, 
                            startingPointOfClassDoc, 
                            endingPOintOfClassDoc,
                            sizi,
                            sizeOfTerms);

//// cudaDeviceSynchronize waits for the kernel to finish, and returns
//// any errors encountered during the launch.
//cudaStatus = cudaDeviceSynchronize();
//if (cudaStatus != cudaSuccess) {
//  fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
//  goto Error;
//}

cudaStatus = cudaStreamSynchronize(0);
if (cudaStatus != cudaSuccess) {
    //fprintf(stderr, "calcKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    cout << "\n Synchronization failed: " << cudaGetErrorString(cudaStatus);
    goto Error;
}
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "calcKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}
知道为什么会发生这种情况吗?

如果没有完整的代码,甚至没有完整的代码,就无法回答。但是内核的开头已经有两个可能导致越界内存访问的bug:

    int index = blockIdx.x * blockDim.x + threadIdx.x;

    int term = distincttermsarray[index];

    if (index <= sizeOfTerms) {

首先,在检查索引是否在所需范围内之前,将其用作数组索引是不安全的。其次,检查需要针对index
我的建议是在关闭TDR的情况下使用Nsight+VisualStudio,因此,如果非法错误需要一段时间才能发生,则不会有问题

不,23652还可以。问题是,他正在运行23653个线程。我想你将很难制造一个。GPU上的线程数量有限吗?对于不同的GPU,此限制是否不同?您可以使用描述的方法将非法内存访问错误定位为一行代码。如果有必要,您可以在内核中使用printf或其他方法,例如调试器,以帮助理解为什么这行代码会生成非法访问。哇,我确实犯了一些错误:。我按照您告诉我的方式解决了这些问题,但我仍然有相同的问题:我对Visual Studio进行了清理,因为这有时会给我带来非常棘手的问题,但它仍然不起作用。我编译了它并在我提到的集群中运行了它,但仍然存在相同的问题:在cuda memcheck下运行代码并修复指出的问题。准备一个MCVE。那我们拭目以待。