Memory CUDA“；未指明的发射失败“；存取存储器_Memory_Cuda_Runtime Error

Memory CUDA“；未指明的发射失败“；存取存储器

memory cuda

Memory CUDA“；未指明的发射失败“；存取存储器,memory,cuda,runtime-error,Memory,Cuda,Runtime Error,我想做的很简单。每个线程从全局内存中存储的全局数组中读取子数组。然后进行一些计算并将结果存储在静态数组中。最后，输出被存储回全局内存中的另一个数组中当我注释将静态数组写入全局数组的行时，内核将运行。如代码所示。有什么想法吗 GPU内核： #ifndef _TEMPLATE_KERNEL_H_ #define _TEMPLATE_KERNEL_H_ #include <stdio.h> __device__ void DecompressBlockGPU(unsigne

我想做的很简单。每个线程从全局内存中存储的全局数组中读取子数组。然后进行一些计算并将结果存储在静态数组中。最后，输出被存储回全局内存中的另一个数组中当我注释将静态数组写入全局数组的行时，内核将运行。如代码所示。有什么想法吗

GPU内核：

#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_

#include <stdio.h>

__device__  void
DecompressBlockGPU(unsigned char *compressed_block,unsigned char *compressed_size,
                    int array_length,unsigned char *decompressed_block)
{       
    int j = 0;

    for(int i = 0 ; i < array_length ;i++)
    {
        for(int idx = 0 ; idx < compressed_size[i]; idx++)
        {
            decompressed_block[j] = compressed_block[i];
            j++;
        }
    }
}
__global__ void

gpu_test(unsigned char *compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,
        unsigned char *output, int BlockSize,int BlockWidth,int BlockHeight,
        int cols,int xTB,int yTB,int xTH,int yTH,unsigned char *aux_array)
{
    int x_max = xBlocks ;
    int y_max = yBlocks ;

    int x_block = blockIdx.x ; 
    int y_block = blockIdx.y ;

    x_max = gridDim.x*blockDim.x ;
    y_max = gridDim.y*blockDim.y ;

    x_block = (blockIdx.x*xTH); 
    y_block = (blockIdx.y*yTH);
    int x_block1 = x_block + threadIdx.x;
    int y_block1 = y_block + threadIdx.y;

    int block_idx = y_block1*xBlocks + x_block1;
    unsigned char *temp_ptr = compressed_data + OffsetsArray[block_idx];        
    int *array_length = (int *)temp_ptr;
    unsigned char *compressed_size = compressed_data + OffsetsArray[block_idx] + 
                               array_length[0] +sizeof(int)/sizeof(unsigned char);
    unsigned char *compressed_block = compressed_data + OffsetsArray[block_idx] + 
                               sizeof(int)/sizeof(unsigned char);

    aux_array = aux_array + (BlockWidth+2)*(BlockHeight+2)*block_idx;
    aux_array[block_idx]=array_length[0];

    unsigned char decompressed_block[72];
    unsigned char extracted_block[32];

    DecompressBlockGPU(compressed_block,compressed_size,array_length[0],
                             &decompressed_block[0]);

    if(block_idx == 0)
    {
        for(int i=0;i<16;i++) aux_array[i]= decompressed_block[i]; //fails  
        for(int i=16;i<16*36;i++) aux_array[i]=1;//works
    }
}
#endif

\ifndef\u模板\u内核\u H_
#定义_模板_内核_H_
#包括
__设备无效
解压缩块GPU（无符号字符*压缩块，无符号字符*压缩块大小，
整数数组（长度，无符号字符*解压块）
{       
int j=0；
for（int i=0；i对于（int i=0；i尝试通过运行程序（或者如果使用并行Nsight，则启用内存检查）.您是否可以编辑问题以包含一些有意义的代码？从您发布的内容中，绝对不可能说出任何可能出错的地方。发布简明、可编译的代码以再现问题，然后您可能会得到一些有用的建议。不是更多细节，而是不同的细节。您询问的是运行时错误，以及然而，您发布的代码无法编译或运行，因此没有人能为您提供太多帮助。我同意Talonmes的观点：解决此类问题需要一个自包含的重新编译案例。在非常通用的术语中，ULF相当于CPU上的SEGFULT，即您可能在内核中的某个地方有一个越界内存访问。到c不管怎样，将问题简化为触发bug的最简单情况是一种很好的调试技术。
unsigned char *runGPU(unsigned char *d_compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,unsigned char *h_output)
{


    printf("xBlocks =%d yBlocks =%d  \n",xBlocks,yBlocks);



    int xTB = 4;
    int yTB = 4;
    int xTH = 1;
    int yTH = 1; 



    unsigned char *d_output;
    unsigned char *d_aux_array;
    unsigned char *h_aux_array;

    int mem_size = image_len*sizeof(unsigned char);
    int big_mem_size = sizeof(unsigned char)*xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2);

    cutilSafeCall( cudaMalloc( (void**) &d_output, mem_size));
    cutilSafeCall( cudaMalloc( (void**) &d_aux_array,big_mem_size));
    h_aux_array = (unsigned char *)malloc(big_mem_size);


    float time = 0;
    float totalTime = 0;
    cudaEvent_t start_event4, stop_event4;
    cutilSafeCall( cudaEventCreate(&start_event4) );
    cutilSafeCall( cudaEventCreate(&stop_event4) );
    cutilSafeCall( cudaEventRecord(start_event4, 0) );

    dim3 grid(xTB,yTB, 1);
    dim3 threads( xTH, yTH, 1);

    gpu_test<<<grid,threads>>>(d_compressed_data,OffsetsArray,xBlocks,yBlocks,d_output,BlockSize,BlockWidth,BlockHeight,cols,xTB,yTB,xTH,yTH,d_aux_array);
    cudaThreadSynchronize();

    cutilSafeCall( cudaEventRecord(stop_event4, 0) );
    cutilSafeCall( cudaEventSynchronize(stop_event4) );
    time = 0;
    cutilSafeCall( cudaEventElapsedTime(&time, start_event4, stop_event4));
    totalTime += time;
    totalTime /= (1.0e3 * 1);
    shrLogEx(LOGBOTH | MASTER, 0, "GPU decompression Time = %.5f \n",totalTime); 

    cutilSafeCall(cudaMemcpy(h_output,d_output, mem_size, cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(h_aux_array,d_aux_array, big_mem_size, cudaMemcpyDeviceToHost));


    cudaFree(d_output);
    cudaFree(d_aux_array);

    return h_aux_array;

}