Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/c/60.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ CUDA结果使用非常大的数组返回垃圾,但不报告任何错误_C++_C_Cuda_Nvidia - Fatal编程技术网

C++ CUDA结果使用非常大的数组返回垃圾,但不报告任何错误

C++ CUDA结果使用非常大的数组返回垃圾,但不报告任何错误,c++,c,cuda,nvidia,C++,C,Cuda,Nvidia,我正在创建一个测试程序,该程序将创建一个设备和一个大小为n的主机阵列,然后启动一个内核,该内核将创建n个线程,这些线程将常量值0.95f分配给设备阵列中的每个位置。完成后,将设备阵列复制到主机阵列,并对所有条目进行合计,并显示最终合计 下面的程序对于大小高达6000万个浮点数的数组似乎工作得很好,并很快返回正确的结果,但当达到7000万个浮点数时,该程序似乎会挂起一段时间,最终返回一个总的NAN结果。在6000万次运行后检查主机阵列,表明已正确填充0.95f,但在7000万次运行后检查主机阵列,

我正在创建一个测试程序,该程序将创建一个设备和一个大小为n的主机阵列,然后启动一个内核,该内核将创建n个线程,这些线程将常量值0.95f分配给设备阵列中的每个位置。完成后,将设备阵列复制到主机阵列,并对所有条目进行合计,并显示最终合计

下面的程序对于大小高达6000万个浮点数的数组似乎工作得很好,并很快返回正确的结果,但当达到7000万个浮点数时,该程序似乎会挂起一段时间,最终返回一个总的NAN结果。在6000万次运行后检查主机阵列,表明已正确填充0.95f,但在7000万次运行后检查主机阵列,表明已填充NAN。据我所知,CUDA调用没有返回错误

我使用的是2GB GT640m(Compute 3.0),最大块大小为1024,最大网格尺寸为2147483647

我相信有更好的方法来实现类似的目标,我希望听到一些建议。但我也想了解这里出了什么问题,以便从中吸取教训

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <fstream>

void cudaErrorHandler(cudaError_t status)
{
    // Cuda call returned an error, just print error for now
    if(status != cudaSuccess)
    {
        printf("Error");
    }
}

__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // Get thread Id
    unsigned int Id = (blockDim.x * blockDim.y * blockIdx.x) + (blockDim.x * threadIdx.y) + threadIdx.x;

    //If the Id is within simulation range, log it
    if(Id < _TotalCombinations)
    {
        _Results[Id] = 0.95f;
    }
}

#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
#define BLOCK_SIZE BLOCK_DIM_X * BLOCK_DIM_Y // Statc block size of 32*32 (1024)
#define CUDA_CALL(x) cudaErrorHandler(x)

int main()
{
    // The number of simulations to run
    unsigned int totalCombinations = 45000000;

    int gridsize = 1;

    // Work out how many blocks of size 1024 are required to perform all of totalCombinations
    for(unsigned int totalsize = gridsize * BLOCK_SIZE; totalsize < totalCombinations; 
        gridsize++, totalsize = gridsize * BLOCK_SIZE)
        ;

    // Allocate host memory
    float* host_results = new float[totalCombinations];
    memset(host_results, 0, sizeof(float) * totalCombinations);
    float *dev_results = 0;

    cudaSetDevice(0);

    // Allocate device memory
    CUDA_CALL(cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float)));

    dim3 grid, block;

    block = dim3(BLOCK_DIM_X, BLOCK_DIM_Y);

    grid = dim3(gridsize);

    // Launch kernel
    addKernel<<<gridsize, block>>>(dev_results, totalCombinations);

    // Wait for synchronize
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy device data back to host
    CUDA_CALL(cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost));

    double total = 0.0;

    // Total the results in the host array
    for(unsigned int i = 0; i < totalCombinations; i++)
        total+=host_results[i];

    // Print results to screen
    printf("Total %f\n", total);

    delete[] host_results;

    return 0;
}
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
无效cudaErrorHandler(cudaError\t状态)
{
//Cuda调用返回了一个错误,请暂时打印错误
如果(状态!=cudaSuccess)
{
printf(“错误”);
}
}
__全局无效addKernel(浮点*结果,整数+总组合)
{
//获取线程Id
无符号int-Id=(blockDim.x*blockDim.y*blockIdx.x)+(blockDim.x*threadIdx.y)+threadIdx.x;
//如果Id在模拟范围内,则记录它
如果(Id<\u总组合)
{
_结果[Id]=0.95f;
}
}
#定义块尺寸×32
#定义块尺寸32
#定义块大小块大小X*块大小Y//Statc块大小为32*32(1024)
#定义CUDA_调用(x)cudaErrorHandler(x)
int main()
{
//要运行的模拟数
无符号整数组合=45000000;
int gridsize=1;
//计算出需要多少大小为1024的块来执行所有TotalCombination
对于(无符号整数totalsize=gridsize*块大小;totalsize
正如您所发现的,您的错误处理方法不起作用。下面,我用我经常使用的错误检查方法粘贴了一个版本的代码。故障点无法正常工作的原因是您的网格大小(您正在启动一个1D网格)超过了X维度的最大网格大小(默认情况下为65535,即计算能力高达2.X)。如果您想利用更大的gridsize维度(2^31-1是计算能力3.0的限制),则需要使用
-arch=sm_30
开关进行编译

这里有一个代码版本,显示了我经常使用的错误检查方法,仅供参考

#include <stdio.h>
#include <fstream>


#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // Get thread Id
    unsigned int Id = (blockDim.x * blockDim.y * blockIdx.x) + (blockDim.x * threadIdx.y) + threadIdx.x;

    //If the Id is within simulation range, log it
    if(Id < _TotalCombinations)
    {
        _Results[Id] = 0.95f;
    }
}

#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
#define BLOCK_SIZE BLOCK_DIM_X * BLOCK_DIM_Y // Statc block size of 32*32 (1024)

int main()
{
    // The number of simulations to run
    unsigned int totalCombinations = 65000000;

    int gridsize = 1;

    // Work out how many blocks of size 1024 are required to perform all of totalCombinations
    for(unsigned int totalsize = gridsize * BLOCK_SIZE; totalsize < totalCombinations;
        gridsize++, totalsize = gridsize * BLOCK_SIZE)
        ;
    printf("gridsize = %d, blocksize = %d\n", gridsize, BLOCK_SIZE);
    // Allocate host memory
    float* host_results = new float[totalCombinations];
    memset(host_results, 0, sizeof(float) * totalCombinations);
    float *dev_results = 0;

    cudaSetDevice(0);

    // Allocate device memory
    cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float));
    cudaCheckErrors("cudaMalloc fail");

    dim3 grid, block;

    block = dim3(BLOCK_DIM_X, BLOCK_DIM_Y);

    grid = dim3(gridsize);

    // Launch kernel
    addKernel<<<gridsize, block>>>(dev_results, totalCombinations);
    cudaCheckErrors("kernel fail");
    // Wait for synchronize
    cudaDeviceSynchronize();
    cudaCheckErrors("sync fail");

    // Copy device data back to host
    cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    double total = 0.0;

    // Total the results in the host array
    for(unsigned int i = 0; i < totalCombinations; i++)
        total+=host_results[i];

    // Print results to screen
    printf("Total %f\n", total);

    delete[] host_results;

    return 0;
}
#包括
#包括
#定义cudaCheckErrors(msg)\
做{\
cudaError\u t\u err=cudaGetLastError()\
如果(_err!=cudaSuccess){\
fprintf(标准,“致命错误:%s(%s位于%s:%d)\n”\
msg,cudaGetErrorString(_err)\
__文件(行)\
fprintf(stderr,“***失败-中止\n”)\
出口(1)\
} \
}而(0)
__全局无效addKernel(浮点*结果,整数+总组合)
{
//获取线程Id
无符号int-Id=(blockDim.x*blockDim.y*blockIdx.x)+(blockDim.x*threadIdx.y)+threadIdx.x;
//如果Id在模拟范围内,则记录它
如果(Id<\u总组合)
{
_结果[Id]=0.95f;
}
}
#定义块尺寸×32
#定义块尺寸32
#定义块大小块大小X*块大小Y//Statc块大小为32*32(1024)
int main()
{
//要运行的模拟数
无符号整数组合=65000000;
int gridsize=1;
//计算出需要多少大小为1024的块来执行所有TotalCombination
对于(无符号整数totalsize=gridsize*块大小;totalsize