CUDA:无法使用maxGridSizes给出的最大网格尺寸

CUDA:无法使用maxGridSizes给出的最大网格尺寸,cuda,Cuda,优化我的代码,以充分利用下一步推出的CUDA卡。 尽管每个信息源都告诉我,对于计算能力2.x,网格最大可以是(65535, 65535, 65535),但我无法使用大于(65535, 8192, 1)的网格。 示例代码显示,即使blockSize等于(1,1,1)且内核为空,使用大于上述尺寸的网格运行时也会导致错误“code=4(cudaErrorLaunchFailure)” 操作系统:Win10Pro HW:GTS 450 SDK:CUDA 8.0,VS2013CE(通过nvcc的-ccbin选项直接指定路径)

优化我的代码,以充分利用下一步推出的CUDA卡。
尽管每个信息源都告诉我,对于计算能力2.x,网格最大可以是(65535, 65535, 65535),但我无法使用大于(65535, 8192, 1)的网格。
示例代码显示,即使blockSize等于(1,1,1)且内核为空,使用大于上述尺寸的网格运行时也会导致错误“code=4(cudaErrorLaunchFailure)”。

操作系统:Win10Pro
HW:GTS 450
SDK:CUDA 8.0,VS2013CE(通过nvcc的-ccbin选项直接指定路径)
测试代码:

#include <helper_cuda.h>

// Empty kernel taking no arguments: it performs no work on the device and is
// used by main() only to exercise different grid/block launch configurations.
__global__ void KernelTest(void)
{
    // deliberately no body
}

/*
 * Reproduction driver: selects device 0, prints driver/runtime versions and a
 * few device properties, then launches the empty KernelTest kernel with a
 * series of increasingly large grids (block size fixed at 1 thread).
 *
 * Every CUDA runtime call is checked. After each launch two separate checks
 * are made:
 *   - cudaGetLastError()      reports launch-time errors (e.g. an invalid
 *                             launch configuration);
 *   - cudaDeviceSynchronize() reports errors raised while the kernel ran.
 * Keeping them apart makes it visible which stage a failure comes from.
 *
 * Exits with EXIT_FAILURE on the first error (checkCudaErrors comes from
 * helper_cuda.h); returns EXIT_SUCCESS if every launch completes.
 */
int main()
{
    int cudaDevice=0;
    int driverVersion = 0, runtimeVersion = 0;
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf ("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf ("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }

    // cudaGetDeviceCount reports 0 if there are no CUDA capable devices.
    // Nothing below can work without a device, so stop here instead of
    // falling through to cudaSetDevice(0).
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
        exit(EXIT_FAILURE);
    }

    printf ("Detected %d CUDA Capable device(s)\n", deviceCount);

    checkCudaErrors(cudaSetDevice(cudaDevice));
    cudaDeviceProp deviceProp;
    // Checked so that deviceProp is never read uninitialized below.
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cudaDevice));

    checkCudaErrors(cudaDriverGetVersion(&driverVersion));
    checkCudaErrors(cudaRuntimeGetVersion(&runtimeVersion));
    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

    // Scratch buffer; kept because the elided deviceQuery code below may use it.
    char msg[256];
    …
    //Code from deviceQuery
    …


    // Human-readable names indexed by deviceProp.computeMode.
    const char *sComputeMode[] =
    {
        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
        "Unknown",
        NULL
    };
    printf("  Compute Mode:\n");
    printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);


    // All launches below use a single-thread block; only the grid changes.
    //dim3 gridtest(deviceProp.maxGridSize[0]-1, deviceProp.maxGridSize[1]-1, deviceProp.maxGridSize[2]-1);
    dim3 gridtest(deviceProp.maxGridSize[0], 1, 1);
    dim3 blocktest(1);
    KernelTest<<<gridtest,blocktest>>>();
    checkCudaErrors(cudaGetLastError());        // launch-time errors
    checkCudaErrors(cudaDeviceSynchronize());   // execution-time errors

    dim3 gridtest2(deviceProp.maxGridSize[0]/2, 2, 1);
    KernelTest<<<gridtest2,blocktest>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    dim3 gridtest3(deviceProp.maxGridSize[0]/4, 4, 1);
    KernelTest<<<gridtest3,blocktest>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    dim3 gridtest4(deviceProp.maxGridSize[0], 2, 1);
    KernelTest<<<gridtest4,blocktest>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    dim3 gridtest5(deviceProp.maxGridSize[0], 4, 1);
    KernelTest<<<gridtest5,blocktest>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    dim3 gridtest6(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/16, 1);//4096
    KernelTest<<<gridtest6,blocktest>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    dim3 gridtest7(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/8, 1);//8192
    KernelTest<<<gridtest7,blocktest>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // 16384 - reported to fail with cudaErrorLaunchFailure on the author's card.
    // NOTE(review): if the error now surfaces from cudaDeviceSynchronize()
    // rather than cudaGetLastError(), the launch configuration itself was
    // accepted and the failure happened during execution. On a display GPU
    // with a kernel run-time limit, a driver watchdog timeout is a plausible
    // cause - TODO confirm on the target machine.
    dim3 gridtest8(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/4, 1);
    KernelTest<<<gridtest8,blocktest>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

//    dim3 gridtest9(deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], 1);
//    KernelTest<<<gridtest9,blocktest>>>();
//    checkCudaErrors(cudaGetLastError());
//    checkCudaErrors(cudaDeviceSynchronize());


    checkCudaErrors(cudaDeviceReset());
    return EXIT_SUCCESS;
}
#包括
__全局_uuu无效内核测试()
{}
int main()
{
int cudaDevice=0;
int-driverVersion=0,runtimeVersion=0;
int deviceCount=0;
cudaError\u t error\u id=cudaGetDeviceCount(&deviceCount);
如果(错误\u id!=cudaSuccess)
{
printf(“cudaGetDeviceCount返回%d\n->%s\n”,(int)错误\u id,cudaGetErrorString(错误\u id));
printf(“结果=失败\n”);
退出(退出失败);
}
//如果没有支持CUDA的设备,此函数调用将返回0。
如果(deviceCount==0)
{
printf(“没有支持CUDA的可用设备\n”);
}
其他的
{
printf(“检测到%d个支持CUDA的设备)\n”,设备计数);
}
cudaSetDevice(cudaDevice);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(和deviceProp、CudAgetDevice);
cudaDriverGetVersion(和驱动服务器版本);
cudaRuntimeGetVersion(&runtimeVersion);
printf(“CUDA驱动程序版本/运行时版本%d.%d/%d.%d\n”,驱动服务器版本/1000,(驱动服务器版本%100)/10,运行时版本/1000,(运行时版本%100)/10);
printf(“CUDA能力主要/次要版本号:%d.%d\n”,deviceProp.Major,deviceProp.Minor);
char-msg[256];
…
//来自deviceQuery的代码
…
常量字符*scomputermode[]=
{
“默认情况下(多个主机线程可以将::cudaSetDevice()与设备同时使用)”,
“独占(一个进程中只有一个主机线程能够将::cudaSetDevice()用于此设备)”,
“禁止(没有主机线程可以将::cudaSetDevice()用于此设备)”,
“独占进程(一个进程中的多个线程能够将::cudaSetDevice()用于此设备)”,
“未知”,
无效的
};
printf(“计算模式:\n”);
printf(“<%s>\n”,SCOComputerMode[deviceProp.computeMode]);
//dim3网格测试(deviceProp.maxGridSize[0]-1,deviceProp.maxGridSize[1]-1,deviceProp.maxGridSize[2]-1);
dim3网格测试(deviceProp.maxGridSize[0],1,1);
dim3块体试验(1);
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
dim3 gridtest2(deviceProp.maxGridSize[0]/2,2,1);
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
dim3 gridtest3(deviceProp.maxGridSize[0]/4,4,1);
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
dim3 gridtest4(deviceProp.maxGridSize[0],2,1);
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
dim3 gridtest5(deviceProp.maxGridSize[0],4,1);
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
dim3 gridtest6(deviceProp.maxGridSize[0],(deviceProp.maxGridSize[1]+1)/16,1);/4096
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
dim3 gridtest7(deviceProp.maxGridSize[0],(deviceProp.maxGridSize[1]+1)/8,1);/8192
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
dim3 gridtest8(deviceProp.maxGridSize[0],(deviceProp.maxGridSize[1]+1)/4,1);//16384-导致错误
核测试();
cudaDeviceSynchronize();
检查cudaErrors(cudaPeekAtLastError());
//dim3 gridtest9(deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],1);
//核测试();
//cudaDeviceSynchronize();
//检查cudaErrors(cudaPeekAtLastError());
cudaDeviceReset();
}
deviceQuery部分的输出:

CUDA Driver Version / Runtime Version          9.1 / 8.0
  CUDA Capability Major/Minor version number:    2.1
  Total amount of global memory:                 1024 MBytes (1073741824 bytes)
  ( 4) Multiprocessors, ( 48) CUDA Cores/MP:     192 CUDA Cores
  GPU Max Clock rate:                            1566 MHz (1.57 GHz)
  Memory Clock rate:                             1804 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 262144 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (65535, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
CUDA驱动程序版本/运行时版本9.1/8.0
CUDA能力主要/次要版本号:2.1
全局内存总量:1024 MB(1073741824字节)
(4)多处理器,(48)CUDA核/MP:192个CUDA核
GPU最大时钟频率:1566MHz(1.57GHz)
内存时钟频率:1804 Mhz
内存总线宽度:128位
二级缓存大小:262144字节
最大纹理尺寸大小(x,y,z)1D=(65536),2D=(65536,65535),3D=(2048,2048,2048)
最大分层1D纹理大小,(num)层1D=(16384),2048层
最大分层2D纹理大小,(num)层2D=(16384,16384),2048层
恒定内存总量:65536字节
每个块的共享内存总量:49152字节
每个块可用的寄存器总数:32768
线程束(Warp)大小:32
每个多处理器的最大线程数:1536
每个块的最大线程数:1024
线程块各维度的最大尺寸(x,y,z):(1024,1024,64)
网格各维度的最大尺寸(x,y,z):(65535,65535,65535)
最大内存间距:2147483647字节
纹理对齐:512字节
并发复制和内核执行:是,使用1个复制引擎
内核的运行时间限制:是
集成GPU共享主机内存:否
支持主机页锁定内存映射:是
表面对齐要求:是
设备具有ECC支持:已禁用
CUDA设备驱动程序模式(TCC或WDDM):WDDM(Windows显示驱动程序型号)
设备支持统一寻址(UVA):是
设备PCI域ID/总线ID/位置ID:0/1/0
计算模式:
<默认(多个主机线程可以同时对该设备使用::cudaSetDevice())>
CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*           Device   Context    Stream  Name
235.86ms  139.29us          (65535 1 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [106]
236.03ms  138.49us          (32767 2 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [109]
236.19ms  138.46us          (16383 4 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [112]
236.35ms  275.58us          (65535 2 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [115]
236.65ms  550.09us          (65535 4 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [118]
237.22ms  504.49ms       (65535 4096 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [121]
741.79ms  924.72ms       (65535 8192 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [124]
1.66659s  1.84941s      (65535 16384 1)         (1 1 1)         8        0B        0B  GeForce GTX 970         1         7  KernelTest(void) [127]