Crash OpenCl程序崩溃视频卡驱动程序

OpenCL 程序导致显卡驱动程序崩溃(标签:crash, driver, opencl, matrix-multiplication)。我编写了一个在 GPU 上计算矩阵乘积的程序。我的问题是,对于大型矩阵,Catalyst 驱动程序会崩溃。我知道长时间的计算会触发超时检测,但我的计算相当快,所以我认为这不是问题所在。本质上,我有三个不同的代码迭代版本。第一个是矩阵乘法的简单实现,它使用 OpenCL 的内置功能来确定工作组大小(WGS)。这个版本工作正常,但(我猜)由于带宽限制而效率低下。第二个版本手动指定 WGS,使用尺寸为 {256,1,1} 的薄片工作组,WGS 是 clGetKernelWorkGroupInfo() 返回的 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 的倍数。

我已经写了一个程序,用于在GPU上计算矩阵积

我的问题是,对于大型矩阵,催化剂驱动程序会崩溃。我知道超时检测会导致长时间的计算,但我的计算速度相当快,所以我认为这不是问题

本质上,我有三个不同的代码迭代版本。第一个是矩阵乘法的简单实现,它使用OpenCL中的内置功能来确定工作组大小(WGS)。这个版本工作正常,但由于带宽的限制(我猜)而效率低下

第二个版本手动指定WGS。在这种情况下,工作组是尺寸为{256,1,1}的薄片(以便能够乘以由素数指定尺寸的矩阵)。WGS是clGetKernelWorkGroupInfo()返回的CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE的倍数。此版本在使用大于约4000 x 4000的矩阵时崩溃。它也比第一个版本慢很多

第三个版本与第二个版本类似,只是它使用本地内存。对于大于约2000 x 2000的矩阵,此版本将崩溃。就其实际工作的矩阵大小而言,它是迄今为止最快的

我在Windows 8.1上使用带有gcc的MinGW64(如果需要,可以检查版本,不记得了)。我使用AMD R9 290和CCC 14.9驱动程序

有用的链接

内核(本地内存版本):

    // Tiled matrix multiplication kernel: C = A * B, where A is (n x m),
    // B is (m x p), C is (n x p), all row-major. Launched by the host with
    // local size {256, 1, 1}, so get_local_id(2) / get_local_size(2) are
    // always 0 / 1 for every work-item.
    __kernel void matMult(__global float* A,
                    __global float* B,
                    __global float* C,
                    int m, int p)
{
    int a, b, k, group_idx, group_idz, tx, ty;
    const int wgsize = get_local_size(0);
    // BUG(review): 'value' is never initialized before it is accumulated
    // into below ('value += ...'); it must start at 0.0f. As written, the
    // result contains whatever garbage was in the register.
    float value;

    group_idx = get_group_id(0);
    group_idz = get_group_id(2);
    tx = get_local_id(0);
    ty = get_local_id(2);

    // BUG(review): work-items that return here never reach the barrier()
    // calls below. If any other work-item in the same work-group DOES reach
    // a barrier, behavior is undefined (OpenCL requires all work-items of a
    // group to execute every barrier). This alone can hang/crash the GPU.
    // The guard should instead predicate the loads/stores while still
    // letting every work-item execute both barriers.
    if(tx >= p)  {
        //printf("Thread %d exiting ...\n", tx);
        return;
    }

    // Index of the first sub-matrix of A processed 
    // group_idz the block
    int aBegin = m * wgsize * group_idz;

    // Index of the last sub-matrix of A processed 
    // group_idz the block
    int aEnd   = aBegin + m - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = wgsize;

    // Index of the first sub-matrix of B processed 
    // group_idz the block
    int bBegin = wgsize * group_idx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = wgsize * p;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) 
    {

        // Declaration of the local memory array As 
        // used to store the sub-matrix of A
        // BUG(review): with a {256,1,1} work-group, ty == 0 and tx ranges
        // 0..255, but these arrays are declared [256][1] — the second index
        // may only be 0. As[ty][tx] below is therefore out of bounds for
        // every tx > 0, and As[ty][k] / Bs[k][tx] in the inner loop are
        // likewise out of bounds. These out-of-bounds __local accesses are
        // the most plausible cause of the driver crash in the local-memory
        // version. Note the tiling was ported from a square-tile (NxN)
        // sample; with 1-high tiles the Bs[k][tx] access pattern would need
        // a [256][256] tile, so the algorithm needs redesign, not just a
        // dimension swap.
        __local float As[256][1];

        // Declaration of the local memory array Bs 
        // used to store the sub-matrix of B
        __local float Bs[256][1];

        // Load the matrices from global memory
        // to local memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + m * ty + tx];
        Bs[ty][tx] = B[b + p * ty + tx];

        // Synchronize to make sure the matrices 
        // are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (k = 0; k < wgsize; k++)
            value += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //printf("value: %f\n", value);
    // Write this work-item's output element of C.
    // NOTE(review): no bounds check against the row count here; the host
    // pads the global size up to a multiple of the work-group size, so
    // out-of-range work-items can write past the end of C — verify against
    // the host-side NDRange computation.
    int c = p * wgsize * group_idz + wgsize * group_idx;
    C[c + p * ty + tx] = value;
}
int main(int argc, const char * argv[])
{
    ...

    //Allocate memory for and generate test data on host for matrix multiplication
    float* hostA = allocMatrix(n, m);
    float* hostB = allocMatrix(m, p);

    //Allocate results array on host
    float* hostC = (float *)malloc(sizeof(float) * p * n);

    //Setup the objects OpenCL needs in order to function
    if(SetupCL(&context, properties, &kernel, &command_queue, &program, &platform_id, &device_id, suppressoutp, usecpu))  {
        printf("Failed to setup OpenCL\n");
        return -1;
    }


    //10. Allocate memory on device
    cl_mem devA  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * m * n, NULL, &err);

    cl_mem devB  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * p * m, NULL, &err);

    cl_mem devC  = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(cl_float) * p * n, NULL, &err);

    //Load data into the input buffer
    clEnqueueWriteBuffer(command_queue, devA, CL_TRUE, 0,
                         sizeof(float) * m * n, hostA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, devB, CL_TRUE, 0,
                         sizeof(float) * m * p, hostB, 0, NULL, NULL);


    //11. Set the argument list for the kernel command
    int wa = m;
    int wb = p;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
    clSetKernelArg(kernel, 3, sizeof(int), &wa);
    clSetKernelArg(kernel, 4, sizeof(int), &wb);


    //Fetch information about compute device
    unsigned int pref_workg_size_mult;
    const unsigned int max_workg_size;
    const unsigned int max_workit_sizes[3];

    clGetKernelWorkGroupInfo(kernel, device_id,
        CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        sizeof(size_t), (void*) &pref_workg_size_mult, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(size_t), (void*) &max_workg_size, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_ITEM_SIZES,
        sizeof(size_t) * 3, (void*) max_workit_sizes, NULL);


    //Determine work group size
    int k = 1, s = 1;
    if (pref_workg_size_mult == 0)
        pref_workg_size_mult = 1;

    while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
        k++;
    while(k *s * pref_workg_size_mult < n)
        s++;

    const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};
    const size_t global_work_size[3] = {k * s * pref_workg_size_mult, 1, p};


    //12. Enqueue the kernel command for execution
    cl_event event0;

    cl_int enqueue_error = clEnqueueNDRangeKernel(command_queue, kernel, 3, NULL, global_work_size,
                           work_group_size, 0, NULL, &event0);
    if (enqueue_error != CL_SUCCESS)
    {
        printf("Kernel launch failed, error %d\n", enqueue_error);
        return enqueue_error;
    }
    clWaitForEvents(1, &event0);


    //Call profiling function to obtain starting and ending times of kernel execution
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_START,
                                sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_END,
                                sizeof(cl_ulong), &end, NULL);
    duration = end - start;



    //13. Copy the results from out of the output buffer
    clEnqueueReadBuffer(command_queue, devC, CL_TRUE, 0,
                                sizeof(float) * p * n, hostC, 0, NULL, NULL);


    //14. Cleanup - release OpenCL resources
    clReleaseMemObject(devA);
    clReleaseMemObject(devB);
    clReleaseMemObject(devC);
    clReleaseEvent(event0);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);


    //Release host memory
    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}
__kernel void matMult(__global float* A,
                      __global float* B,
                      __global float* C,
                      int m, int p)
{
    int a, b, k, group_idx, group_idz, tx, ty;
    const int wgsize = get_local_size(0);
    float value;

    group_idx = get_group_id(0);
    group_idz = get_group_id(2);
    tx = get_local_id(0);
    ty = get_local_id(2);

    if (tx >= p) {
        //printf("Thread %d exiting ...\n", tx);
        return;
    }

    // Index of the first sub-matrix of A processed by the block
    int aBegin = m * wgsize * group_idz;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + m - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = wgsize;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = wgsize * group_idx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = wgsize * p;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)

评论:如果你的计算耗时太长,肯定会触发 Windows 的看门狗定时器。你能贴出内核和精简后的主机代码吗?这样我们可以确认你不是在调用 4000×4000 个内核、每个内核又作用于 4000×4000 的矩阵。—— 谢谢你的回复。我已经贴出了内核和(略有精简的)主机代码。我还有一个额外的函数 SetupCL 用来创建所有需要的对象,如有需要可以贴出。不过我得说,你的推测在我看来不太可能。—— 当代码运行完成时,在任何场景下你都能得到正确的结果吗?—— 是的,据我所知代码产生的结果是正确的。—— 既然你的工作尺寸实际上都是一维的,为什么要使用 get_local_id(2)?为什么把本地内存声明为 [256][1]?又为什么以 As[ty][tx] 的方式访问它,尤其是在 tx 并不总是 0 的情况下?