Crash OpenCl程序崩溃视频卡驱动程序

OpenCL 程序导致显卡驱动程序崩溃(标签:crash, driver, opencl, matrix-multiplication)。我编写了一个在 GPU 上计算矩阵乘积的程序。我的问题是,对于大型矩阵,Catalyst 驱动程序会崩溃。我知道长时间的计算会触发超时检测,但我的计算相当快,所以我认为这不是问题所在。本质上,我有三个不同的代码迭代版本。第一个是矩阵乘法的简单实现,它使用 OpenCL 的内置功能来确定工作组大小(WGS)。这个版本工作正常,但(我猜)由于带宽限制而效率低下。第二个版本手动指定 WGS,使用尺寸为 {256,1,1} 的薄片工作组,WGS 是 clGetKernelWorkGroupInfo() 返回的 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 的倍数。

我已经写了一个程序,用于在GPU上计算矩阵积

我的问题是,对于大型矩阵,催化剂驱动程序会崩溃。我知道超时检测会导致长时间的计算,但我的计算速度相当快,所以我认为这不是问题

本质上,我有三个不同的代码迭代版本。第一个是矩阵乘法的简单实现,它使用OpenCL中的内置功能来确定工作组大小(WGS)。这个版本工作正常,但由于带宽的限制(我猜)而效率低下

第二个版本手动指定WGS。在这种情况下,工作组是尺寸为{256,1,1}的薄片(以便能够乘以由素数指定尺寸的矩阵)。WGS是clGetKernelWorkGroupInfo()返回的CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE的倍数。此版本在使用大于约4000 x 4000的矩阵时崩溃。它也比第一个版本慢很多

第三个版本与第二个版本类似,只是它使用本地内存。对于大于约2000 x 2000的矩阵,此版本将崩溃。就其实际工作的矩阵大小而言,它是迄今为止最快的

我在Windows 8.1上使用带有gcc的MinGW64(如果需要,可以检查版本,不记得了)。我使用AMD R9 290和CCC 14.9驱动程序

有用的链接

内核(本地内存版本):

    // Tiled matrix multiplication kernel: C = A * B, where A is (n x m),
    // B is (m x p), C is (n x p), all row-major. Launched by the host with
    // local size {256, 1, 1}, so get_local_id(2) / get_local_size(2) are
    // always 0 / 1 for every work-item.
    __kernel void matMult(__global float* A,
                    __global float* B,
                    __global float* C,
                    int m, int p)
{
    int a, b, k, group_idx, group_idz, tx, ty;
    const int wgsize = get_local_size(0);
    // BUG(review): 'value' is never initialized before it is accumulated
    // into below ('value += ...'); it must start at 0.0f. As written, the
    // result contains whatever garbage was in the register.
    float value;

    group_idx = get_group_id(0);
    group_idz = get_group_id(2);
    tx = get_local_id(0);
    ty = get_local_id(2);

    // BUG(review): work-items that return here never reach the barrier()
    // calls below. If any other work-item in the same work-group DOES reach
    // a barrier, behavior is undefined (OpenCL requires all work-items of a
    // group to execute every barrier). This alone can hang/crash the GPU.
    // The guard should instead predicate the loads/stores while still
    // letting every work-item execute both barriers.
    if(tx >= p)  {
        //printf("Thread %d exiting ...\n", tx);
        return;
    }

    // Index of the first sub-matrix of A processed 
    // group_idz the block
    int aBegin = m * wgsize * group_idz;

    // Index of the last sub-matrix of A processed 
    // group_idz the block
    int aEnd   = aBegin + m - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = wgsize;

    // Index of the first sub-matrix of B processed 
    // group_idz the block
    int bBegin = wgsize * group_idx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = wgsize * p;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) 
    {

        // Declaration of the local memory array As 
        // used to store the sub-matrix of A
        // BUG(review): with a {256,1,1} work-group, ty == 0 and tx ranges
        // 0..255, but these arrays are declared [256][1] — the second index
        // may only be 0. As[ty][tx] below is therefore out of bounds for
        // every tx > 0, and As[ty][k] / Bs[k][tx] in the inner loop are
        // likewise out of bounds. These out-of-bounds __local accesses are
        // the most plausible cause of the driver crash in the local-memory
        // version. Note the tiling was ported from a square-tile (NxN)
        // sample; with 1-high tiles the Bs[k][tx] access pattern would need
        // a [256][256] tile, so the algorithm needs redesign, not just a
        // dimension swap.
        __local float As[256][1];

        // Declaration of the local memory array Bs 
        // used to store the sub-matrix of B
        __local float Bs[256][1];

        // Load the matrices from global memory
        // to local memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + m * ty + tx];
        Bs[ty][tx] = B[b + p * ty + tx];

        // Synchronize to make sure the matrices 
        // are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (k = 0; k < wgsize; k++)
            value += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //printf("value: %f\n", value);
    // Write this work-item's output element of C.
    // NOTE(review): no bounds check against the row count here; the host
    // pads the global size up to a multiple of the work-group size, so
    // out-of-range work-items can write past the end of C — verify against
    // the host-side NDRange computation.
    int c = p * wgsize * group_idz + wgsize * group_idx;
    C[c + p * ty + tx] = value;
}
int main(int argc, const char * argv[])
{
    ...

    //Allocate memory for and generate test data on host for matrix multiplication
    float* hostA = allocMatrix(n, m);
    float* hostB = allocMatrix(m, p);

    //Allocate results array on host
    float* hostC = (float *)malloc(sizeof(float) * p * n);

    //Setup the objects OpenCL needs in order to function
    if(SetupCL(&context, properties, &kernel, &command_queue, &program, &platform_id, &device_id, suppressoutp, usecpu))  {
        printf("Failed to setup OpenCL\n");
        return -1;
    }


    //10. Allocate memory on device
    cl_mem devA  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * m * n, NULL, &err);

    cl_mem devB  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * p * m, NULL, &err);

    cl_mem devC  = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(cl_float) * p * n, NULL, &err);

    //Load data into the input buffer
    clEnqueueWriteBuffer(command_queue, devA, CL_TRUE, 0,
                         sizeof(float) * m * n, hostA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, devB, CL_TRUE, 0,
                         sizeof(float) * m * p, hostB, 0, NULL, NULL);


    //11. Set the argument list for the kernel command
    int wa = m;
    int wb = p;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
    clSetKernelArg(kernel, 3, sizeof(int), &wa);
    clSetKernelArg(kernel, 4, sizeof(int), &wb);


    //Fetch information about compute device
    unsigned int pref_workg_size_mult;
    const unsigned int max_workg_size;
    const unsigned int max_workit_sizes[3];

    clGetKernelWorkGroupInfo(kernel, device_id,
        CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        sizeof(size_t), (void*) &pref_workg_size_mult, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(size_t), (void*) &max_workg_size, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_ITEM_SIZES,
        sizeof(size_t) * 3, (void*) max_workit_sizes, NULL);


    //Determine work group size
    int k = 1, s = 1;
    if (pref_workg_size_mult == 0)
        pref_workg_size_mult = 1;

    while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
        k++;
    while(k *s * pref_workg_size_mult < n)
        s++;

    const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};
    const size_t global_work_size[3] = {k * s * pref_workg_size_mult, 1, p};


    //12. Enqueue the kernel command for execution
    cl_event event0;

    cl_int enqueue_error = clEnqueueNDRangeKernel(command_queue, kernel, 3, NULL, global_work_size,
                           work_group_size, 0, NULL, &event0);
    if (enqueue_error != CL_SUCCESS)
    {
        printf("Kernel launch failed, error %d\n", enqueue_error);
        return enqueue_error;
    }
    clWaitForEvents(1, &event0);


    //Call profiling function to obtain starting and ending times of kernel execution
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_START,
                                sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_END,
                                sizeof(cl_ulong), &end, NULL);
    duration = end - start;



    //13. Copy the results from out of the output buffer
    clEnqueueReadBuffer(command_queue, devC, CL_TRUE, 0,
                                sizeof(float) * p * n, hostC, 0, NULL, NULL);


    //14. Cleanup - release OpenCL resources
    clReleaseMemObject(devA);
    clReleaseMemObject(devB);
    clReleaseMemObject(devC);
    clReleaseEvent(event0);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);


    //Release host memory
    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}
__kernel void matMult(__global float* A,
                      __global float* B,
                      __global float* C,
                      int m, int p)
{
    int a, b, k, group_idx, group_idz, tx, ty;
    const int wgsize = get_local_size(0);
    float value;

    group_idx = get_group_id(0);
    group_idz = get_group_id(2);
    tx = get_local_id(0);
    ty = get_local_id(2);

    if (tx >= p) {
        //printf("Thread %d exiting ...\n", tx);
        return;
    }

    // Index of the first sub-matrix of A processed by the block
    int aBegin = m * wgsize * group_idz;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + m - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = wgsize;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = wgsize * group_idx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = wgsize * p;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)

评论:如果你的计算耗时太长,肯定会触发 Windows 的看门狗定时器。你能贴出内核和精简后的主机代码吗?这样我们可以确认你不是在调用 4000×4000 个内核、每个内核又作用于 4000×4000 的矩阵。—— 谢谢你的回复。我已经贴出了内核和(略有精简的)主机代码。我还有一个额外的函数 SetupCL 用来创建所有需要的对象,如有需要可以贴出。不过我得说,你的推测在我看来不太可能。—— 当代码运行完成时,在任何场景下你都能得到正确的结果吗?—— 是的,据我所知代码产生的结果是正确的。—— 既然你的工作尺寸实际上都是一维的,为什么要使用 get_local_id(2)?为什么把本地内存声明为 [256][1]?又为什么以 As[ty][tx] 的方式访问它,尤其是在 tx 并不总是 0 的情况下?