Crash OpenCl程序崩溃视频卡驱动程序
我已经写了一个程序,用于在GPU上计算矩阵积 我的问题是,对于大型矩阵,催化剂驱动程序会崩溃。我知道超时检测会导致长时间的计算,但我的计算速度相当快,所以我认为这不是问题 本质上,我有三种不同的代码迭代。第一个是矩阵多重化的简单实现,它使用OpenCL中的内置功能来确定工作组大小(WGS)。这个版本工作正常,但由于带宽的模仿(我猜)而效率低下 第二个版本手动指定WGS。在这种情况下,工作组是尺寸为{256,1,1}的薄片(以便能够乘以由素数指定尺寸的矩阵)。WGS是clGetKernelWorkGroupInfo()返回的CL\u内核\u首选\u工作\u组大小\u倍数的倍数。此版本在使用大于约4000 x 4000的矩阵时崩溃。它也比第一个版本慢很多 第三个版本与第二个版本类似,只是它使用本地内存。对于大于约2000 x 2000的矩阵,此版本将崩溃。就其实际工作的矩阵大小而言,它是迄今为止最快的 我在Windows 8.1上使用带有gcc的MinGW64(如果需要,可以检查版本,不记得了)。我使用AMD R9 290和CCC 14.9驱动程序 有用的链接 内核(本地内存版本):Crash OpenCl程序崩溃视频卡驱动程序,crash,driver,opencl,matrix-multiplication,Crash,Driver,Opencl,Matrix Multiplication,我已经写了一个程序,用于在GPU上计算矩阵积 我的问题是,对于大型矩阵,催化剂驱动程序会崩溃。我知道超时检测会导致长时间的计算,但我的计算速度相当快,所以我认为这不是问题 本质上,我有三种不同的代码迭代。第一个是矩阵多重化的简单实现,它使用OpenCL中的内置功能来确定工作组大小(WGS)。这个版本工作正常,但由于带宽的模仿(我猜)而效率低下 第二个版本手动指定WGS。在这种情况下,工作组是尺寸为{256,1,1}的薄片(以便能够乘以由素数指定尺寸的矩阵)。WGS是clGetKernelWork
__kernel void matMult(__global float* A,
__global float* B,
__global float* C,
int m, int p)
{
int a, b, k, group_idx, group_idz, tx, ty;
const int wgsize = get_local_size(0);
float value;
group_idx = get_group_id(0);
group_idz = get_group_id(2);
tx = get_local_id(0);
ty = get_local_id(2);
if(tx >= p) {
//printf("Thread %d exiting ...\n", tx);
return;
}
// Index of the first sub-matrix of A processed
// group_idz the block
int aBegin = m * wgsize * group_idz;
// Index of the last sub-matrix of A processed
// group_idz the block
int aEnd = aBegin + m - 1;
// Step size used to iterate through the
// sub-matrices of A
int aStep = wgsize;
// Index of the first sub-matrix of B processed
// group_idz the block
int bBegin = wgsize * group_idx;
// Step size used to iterate through the
// sub-matrices of B
int bStep = wgsize * p;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep)
{
// Declaration of the local memory array As
// used to store the sub-matrix of A
__local float As[256][1];
// Declaration of the local memory array Bs
// used to store the sub-matrix of B
__local float Bs[256][1];
// Load the matrices from global memory
// to local memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + m * ty + tx];
Bs[ty][tx] = B[b + p * ty + tx];
// Synchronize to make sure the matrices
// are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (k = 0; k < wgsize; k++)
value += As[ty][k] * Bs[k][tx];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
//printf("value: %f\n", value);
int c = p * wgsize * group_idz + wgsize * group_idx;
C[c + p * ty + tx] = value;
}
int main(int argc, const char * argv[])
{
...
//Allocate memory for and generate test data on host for matrix multiplication
float* hostA = allocMatrix(n, m);
float* hostB = allocMatrix(m, p);
//Allocate results array on host
float* hostC = (float *)malloc(sizeof(float) * p * n);
//Setup the objects OpenCL needs in order to function
if(SetupCL(&context, properties, &kernel, &command_queue, &program, &platform_id, &device_id, suppressoutp, usecpu)) {
printf("Failed to setup OpenCL\n");
return -1;
}
//10. Allocate memory on device
cl_mem devA = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * m * n, NULL, &err);
cl_mem devB = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * p * m, NULL, &err);
cl_mem devC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * p * n, NULL, &err);
//Load data into the input buffer
clEnqueueWriteBuffer(command_queue, devA, CL_TRUE, 0,
sizeof(float) * m * n, hostA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, devB, CL_TRUE, 0,
sizeof(float) * m * p, hostB, 0, NULL, NULL);
//11. Set the argument list for the kernel command
int wa = m;
int wb = p;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
clSetKernelArg(kernel, 3, sizeof(int), &wa);
clSetKernelArg(kernel, 4, sizeof(int), &wb);
//Fetch information about compute device
unsigned int pref_workg_size_mult;
const unsigned int max_workg_size;
const unsigned int max_workit_sizes[3];
clGetKernelWorkGroupInfo(kernel, device_id,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof(size_t), (void*) &pref_workg_size_mult, NULL);
clGetDeviceInfo(device_id,
CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(size_t), (void*) &max_workg_size, NULL);
clGetDeviceInfo(device_id,
CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t) * 3, (void*) max_workit_sizes, NULL);
//Determine work group size
int k = 1, s = 1;
if (pref_workg_size_mult == 0)
pref_workg_size_mult = 1;
while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
k++;
while(k *s * pref_workg_size_mult < n)
s++;
const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};
const size_t global_work_size[3] = {k * s * pref_workg_size_mult, 1, p};
//12. Enqueue the kernel command for execution
cl_event event0;
cl_int enqueue_error = clEnqueueNDRangeKernel(command_queue, kernel, 3, NULL, global_work_size,
work_group_size, 0, NULL, &event0);
if (enqueue_error != CL_SUCCESS)
{
printf("Kernel launch failed, error %d\n", enqueue_error);
return enqueue_error;
}
clWaitForEvents(1, &event0);
//Call profiling function to obtain starting and ending times of kernel execution
clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end, NULL);
duration = end - start;
//13. Copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, devC, CL_TRUE, 0,
sizeof(float) * p * n, hostC, 0, NULL, NULL);
//14. Cleanup - release OpenCL resources
clReleaseMemObject(devA);
clReleaseMemObject(devB);
clReleaseMemObject(devC);
clReleaseEvent(event0);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
//Release host memory
free(hostA);
free(hostB);
free(hostC);
return 0;
}
\uuuuu内核无效matMult(\uuuu全局浮点*A,
__全球浮动*B,
__全球浮动*C,
整数m,整数p)
{
int a,b,k,group_idx,group_idz,tx,ty;
const int wgsize=获取本地大小(0);
浮动值;
group\u idx=获取\u组\u id(0);
group_idz=get_group_id(2);
tx=获取本地id(0);
ty=获取本地id(2);
如果(tx>=p){
//printf(“线程%d正在退出…\n”,tx);
返回;
}
//已处理数据的第一个子矩阵的索引
//组_idz块
int aBegin=m*wgsize*group_idz;
//已处理数据的最后一个子矩阵的索引
//组_idz块
int aEnd=aBegin+m-1;
//用于迭代整个过程的步长
//A的子矩阵
int aStep=wgsize;
//处理的B的第一个子矩阵的索引
//组_idz块
int bBegin=wgsize*group_idx;
//用于迭代整个过程的步长
//B的子矩阵
int bStep=wgsize*p;
//循环A和B的所有子矩阵
//需要计算块子矩阵
对于(a=aBegin,b=bBegin;
a如果你的计算时间太长,你肯定会使Windows watchdog计时器出错。你能发布你的内核和缩写的主机代码吗?这样我们就可以确保你不会调用4000×4000个内核,每个内核都在4000×4000矩阵上运行?谢谢你的回复。我已经发布了内核和主机代码(略缩写)。我有一个额外的函数setupCL,它设置了所有需要的对象,我可以根据请求发布。我必须说,你的建议对我来说似乎不太可能。当代码完成时,你在任何场景中都能得到正确的答案吗?是的,据我所知,代码会产生正确的结果。-你为什么使用get\local_id(2),如果您的工作大小都定义为1D?-为什么要这样声明您的本地内存:[256][1]
?为什么要这样访问它:作为[ty][tx]
,特别是如果tx并不总是0?