CUDA:无法使用 maxGridSize 所给出的最大网格尺寸
优化我的代码,以充分利用下一步推出的CUDA卡。CUDA:无法使用 maxGridSize 所给出的最大网格尺寸,cuda,Cuda,优化我的代码,以充分利用下一步推出的CUDA卡。 尽管每个信息源都告诉我,使用2.x计算能力,网格可能是(65535, 65535, 65535)大小,但我无法使用大于(65535, 8192, 1)大小的网格。 示例代码显示,即使使用等于(1,1,1)的blockSize和空内核,在使用大于所述大小的网格运行时也会导致错误“code=4(cudaErrorLaunchFailure)” 操作系统:Win10Pro HW:GTS 450 SDK:CUDA 8.0,VS2013CE(使用 nvcc 的 -ccbin 选项
尽管所有资料都说,在计算能力 2.x 的设备上,网格大小最大可达 (65535, 65535, 65535),但我无法使用大于 (65535, 8192, 1) 的网格。
示例代码显示,即使 blockSize 等于 (1,1,1) 且内核为空,只要使用大于上述大小的网格运行,就会导致错误“code=4(cudaErrorLaunchFailure)”。
操作系统:Win10Pro
HW:GTS 450
SDK:CUDA 8.0,VS2013CE(通过 nvcc 的 -ccbin 选项直接指定编译器路径)
测试代码:
#include <helper_cuda.h>
// Empty kernel: takes no arguments and performs no work.
// main() launches it with varying grid dimensions and a single-thread block,
// so any failure comes from the launch itself rather than from kernel code.
__global__ void KernelTest(void)
{
}
// Launches KernelTest with the given configuration and checks the result.
//
// grid  - grid dimensions to test
// block - block dimensions to use for the launch
//
// The grid is printed before the launch because every check now lives on the
// same source line; without the print a failure could not be attributed to a
// particular configuration.
// checkCudaErrors comes from helper_cuda.h; presumably it reports the error
// and terminates the process on failure - confirm against that header.
static void launchAndCheck(const dim3 &grid, const dim3 &block)
{
    printf("Launching KernelTest with grid (%u, %u, %u)\n", grid.x, grid.y, grid.z);
    KernelTest<<<grid, block>>>();
    // Invalid launch configurations are reported immediately after the launch.
    checkCudaErrors(cudaGetLastError());
    // Errors raised while the kernel runs surface at the next synchronizing call.
    checkCudaErrors(cudaDeviceSynchronize());
}

// Prints basic properties of CUDA device 0, then launches the empty kernel
// with progressively larger grids (block size 1) to find the first grid size
// that fails.
//
// NOTE(review): the deviceQuery output accompanying this sample reports
// "Run time limit on kernels: Yes" with the WDDM driver model, and the larger
// grids run for a long time even with an empty kernel. The failure at
// 65535 x 16384 may therefore be a display-watchdog timeout rather than a
// grid-size limit - confirm on the target machine.
int main()
{
    int cudaDevice = 0;
    int driverVersion = 0, runtimeVersion = 0;
    int deviceCount = 0;

    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess)
    {
        printf ("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf ("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }

    // cudaGetDeviceCount reports 0 devices if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
        // Nothing below can work without a device, so stop here instead of
        // continuing with device 0.
        exit(EXIT_FAILURE);
    }
    printf ("Detected %d CUDA Capable device(s)\n", deviceCount);

    checkCudaErrors(cudaSetDevice(cudaDevice));
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cudaDevice));
    checkCudaErrors(cudaDriverGetVersion(&driverVersion));
    checkCudaErrors(cudaRuntimeGetVersion(&runtimeVersion));
    printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
    printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
    char msg[256];
…
//Code from deviceQuery
…
    const char *sComputeMode[] =
    {
        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
        "Unknown",
        NULL
    };
    printf(" Compute Mode:\n");
    printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);

    const int maxX = deviceProp.maxGridSize[0];
    const int maxY = deviceProp.maxGridSize[1];

    // Grid configurations in increasing total block count. The trailing
    // numbers are the y extents when maxGridSize[1] is 65535.
    const dim3 grids[] =
    {
        //dim3(deviceProp.maxGridSize[0]-1, deviceProp.maxGridSize[1]-1, deviceProp.maxGridSize[2]-1),
        dim3(maxX, 1, 1),
        dim3(maxX/2, 2, 1),
        dim3(maxX/4, 4, 1),
        dim3(maxX, 2, 1),
        dim3(maxX, 4, 1),
        dim3(maxX, (maxY+1)/16, 1), //4096
        dim3(maxX, (maxY+1)/8, 1),  //8192
        dim3(maxX, (maxY+1)/4, 1),  //16384 - Causes Error
        // dim3(maxX, maxY, 1),
    };
    const dim3 blocktest(1);

    const int gridCount = (int)(sizeof(grids) / sizeof(grids[0]));
    for (int i = 0; i < gridCount; ++i)
    {
        launchAndCheck(grids[i], blocktest);
    }

    checkCudaErrors(cudaDeviceReset());
}
#include <helper_cuda.h>
__global__ void KernelTest()
{}
int main()
{
int cudaDevice=0;
int driverVersion = 0, runtimeVersion = 0;
int deviceCount = 0;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess)
{
printf ("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
printf ("Result = FAIL\n");
exit(EXIT_FAILURE);
}
// This function call returns 0 if there are no CUDA capable devices.
if (deviceCount == 0)
{
printf("There are no available device(s) that support CUDA\n");
}
else
{
printf ("Detected %d CUDA Capable device(s)\n", deviceCount);
}
cudaSetDevice(cudaDevice);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, cudaDevice);
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
char msg[256];
…
//Code from deviceQuery
…
const char *sComputeMode[] =
{
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
"Unknown",
NULL
};
printf(" Compute Mode:\n");
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
//dim3 gridtest(deviceProp.maxGridSize[0]-1, deviceProp.maxGridSize[1]-1, deviceProp.maxGridSize[2]-1);
dim3 gridtest(deviceProp.maxGridSize[0], 1, 1);
dim3 blocktest(1);
KernelTest<<<gridtest,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
dim3 gridtest2(deviceProp.maxGridSize[0]/2, 2, 1);
KernelTest<<<gridtest2,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
dim3 gridtest3(deviceProp.maxGridSize[0]/4, 4, 1);
KernelTest<<<gridtest3,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
dim3 gridtest4(deviceProp.maxGridSize[0], 2, 1);
KernelTest<<<gridtest4,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
dim3 gridtest5(deviceProp.maxGridSize[0], 4, 1);
KernelTest<<<gridtest5,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
dim3 gridtest6(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/16, 1);//4096
KernelTest<<<gridtest6,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
dim3 gridtest7(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/8, 1);//8192
KernelTest<<<gridtest7,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
dim3 gridtest8(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/4, 1);//16384 - Causes Error
KernelTest<<<gridtest8,blocktest>>>();
cudaDeviceSynchronize();
checkCudaErrors(cudaPeekAtLastError ( ));
// dim3 gridtest9(deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], 1);
// KernelTest<<<gridtest9,blocktest>>>();
// cudaDeviceSynchronize();
// checkCudaErrors(cudaPeekAtLastError ( ));
cudaDeviceReset() ;
}
deviceQuery部件的输出:
CUDA Driver Version / Runtime Version 9.1 / 8.0
CUDA Capability Major/Minor version number: 2.1
Total amount of global memory: 1024 MBytes (1073741824 bytes)
( 4) Multiprocessors, ( 48) CUDA Cores/MP: 192 CUDA Cores
GPU Max Clock rate: 1566 MHz (1.57 GHz)
Memory Clock rate: 1804 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 262144 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
CUDA驱动程序版本/运行时版本9.1/8.0
CUDA能力主要/次要版本号:2.1
全局内存总量:1024 MB(1073741824字节)
(4)多处理器,(48)CUDA核/MP:192个CUDA核
GPU最大时钟频率:1566MHz(1.57GHz)
内存时钟频率:1804 Mhz
内存总线宽度:128位
二级缓存大小:262144字节
最大纹理尺寸大小(x,y,z)1D=(65536),2D=(65536, 65535),3D=(2048, 2048, 2048)
最大分层1D纹理大小,(num)层1D=(16384),2048层
最大分层2D纹理大小,(num)层2D=(16384,16384),2048层
恒定内存总量:65536字节
每个块的共享内存总量:49152字节
每个块可用的寄存器总数:32768
线程束(Warp)大小:32
每个多处理器的最大线程数:1536
每个块的最大线程数:1024
线程块各维度的最大尺寸(x、y、z):(1024、1024、64)
网格各维度的最大尺寸(x、y、z):(65535、65535、65535)
最大内存间距:2147483647字节
纹理对齐:512字节
并发复制和内核执行:是,使用1个复制引擎
内核的运行时间限制:是
集成GPU共享主机内存:否
支持主机页锁定内存映射:是
表面对齐要求:是
设备具有ECC支持:已禁用
CUDA设备驱动程序模式(TCC或WDDM):WDDM(Windows显示驱动程序型号)
设备支持统一寻址(UVA):是
设备PCI域ID/总线ID/位置ID:0/1/0
计算模式:
<默认(多个主机线程可以同时对该设备使用 ::cudaSetDevice())>
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Device Context Stream Name
235.86ms 139.29us (65535 1 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [106]
236.03ms 138.49us (32767 2 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [109]
236.19ms 138.46us (16383 4 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [112]
236.35ms 275.58us (65535 2 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [115]
236.65ms 550.09us (65535 4 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [118]
237.22ms 504.49ms (65535 4096 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [121]
741.79ms 924.72ms (65535 8192 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [124]
1.66659s 1.84941s (65535 16384 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [127]