OpenCL 中的段错误(核心转储)

OpenCL 中的段错误(核心转储),opencl,Opencl,我是OpenCL的新手,但我已经做并行编程一年多了。我在做我的第一个openCL代码矩阵乘法。我写了下面的代码 //#include<stdio.h> #include <stdio.h> #include <stdlib.h> #include <assert.h> #include <string.h> #include <SDKCommon.hpp> #include <SDKApplication.hpp>

我是OpenCL的新手,但我已经做并行编程一年多了。我在做我的第一个openCL代码矩阵乘法。我写了下面的代码

//#include<stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <SDKCommon.hpp>
#include <SDKApplication.hpp>
#include <SDKCommandArgs.hpp>
#include <SDKFile.hpp>
#include <CL/cl.h>

#define MAX_SOURCE_SIZE (0x100000)
#define MATSIZE 16


void initmat(float *Aa,float *Bb,float *Cc,int row,int colrow,int col);

/*
 * Fill the three matrices with fixed test values.
 *
 *   Aa     - receives row*colrow elements, each set to 1
 *   Bb     - receives colrow*col elements, each set to 2
 *   Cc     - receives row*col elements, each set to 0
 *   row    - first dimension of Aa and Cc
 *   colrow - second dimension of Aa, first dimension of Bb
 *   col    - second dimension of Bb and Cc
 *
 * A dimension that is zero or negative is treated as zero, so the matrices
 * that depend on it are left untouched.
 */
void initmat(float *Aa,float *Bb,float *Cc,int row,int colrow,int col)
{
    /* Clamp before converting: a negative int compared against an unsigned
     * counter would turn into a huge bound and run far past the buffers. */
    const size_t rows  = row    > 0 ? (size_t)row    : 0;
    const size_t inner = colrow > 0 ? (size_t)colrow : 0;
    const size_t cols  = col    > 0 ? (size_t)col    : 0;

    /* Element counts are formed in size_t so the product cannot overflow int. */
    const size_t a_count = rows * inner;
    const size_t b_count = inner * cols;
    const size_t c_count = rows * cols;

    for (size_t i = 0; i < a_count; i++) {
        Aa[i] = 1.0f;
    }

    for (size_t i = 0; i < b_count; i++) {
        Bb[i] = 2.0f;
    }

    for (size_t i = 0; i < c_count; i++) {
        Cc[i] = 0.0f;
    }
}


int main(void)
{
printf(" Here: 1 \n");
// Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("matmul.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );
printf(" Here: 2 \n");
// matrix declaration
float *A;
float *B;
float *C;

// set dimesions
int Arow,AcolBrow,Bcol;

Arow=AcolBrow=Bcol=MATSIZE;

// no. of elements in matrix
int sizea, sizeb, sizec;

// Error code from opencl

int err;

// Setting up matrices
sizea= Arow*AcolBrow;
sizeb= AcolBrow*Bcol;
sizec= Arow*Bcol;

A = (float *) malloc(sizeof(float)*sizea);
B = (float *) malloc(sizeof(float)*sizeb);
C = (float *) malloc(sizeof(float)*sizec);
printf(" Here: 3 \n");
initmat(A,B,C,Arow,AcolBrow,Bcol);
// Displaying inputs

unsigned long int ii;

printf("Input A: \n");
for(ii=0;ii<sizea;ii++)
printf("%f  ",A[ii]);

printf("\n \n \n \n");


printf("Input B: \n");
for(ii=0;ii<sizeb;ii++)
printf("%f  ",B[ii]);
printf("\n \n \n");

// get platform id & device id

cl_uint numplatform;
cl_platform_id platformid=NULL;
cl_device_id deviceid=NULL;

err= clGetPlatformIDs(1,&platformid,&numplatform);
err=clGetDeviceIDs(platformid,CL_DEVICE_TYPE_GPU,1,&deviceid,NULL);

cl_context_properties properties[]= 
{ 
    CL_CONTEXT_PLATFORM, (cl_context_properties)platformid,0 
};

// create context
cl_context context= clCreateContext(properties,1,&deviceid,NULL,NULL,&err);

/* when more than one gpu is installed on the system than we make use of the approach as we stated in the  notes !! */
printf(" Here: 4 \n");
// create command queue

cl_command_queue queue = clCreateCommandQueue(context,deviceid,0,&err); // I have disabled profiling option


// Allocate buffer object for Ad,Bd,Cd

cl_mem Ad = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(cl_float)*sizea,NULL,NULL);
cl_mem Bd = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(cl_float)*sizeb,NULL,NULL);
cl_mem Cd = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_float)*sizec,NULL,NULL);
printf(" Here: 5 \n");
// We are not explicitely making kernel. We are putting the kernel code here itself (see notes)


cl_program program= clCreateProgramWithSource(context,1,(const char **)&source_str, (const size_t *)&source_size,&err);
printf(" Here: 6 \n");
// Build program using program object just created

err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);

if(err !=CL_SUCCESS)
{

size_t len;
char buffer[2048];
printf("ERROR: Failed to build executable \n ");
clGetProgramBuildInfo(program,deviceid,CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer , &len);
printf("%s \n",buffer);
//return FAILURE;

}
printf(" Here: 7 \n");
// Create kernel object

cl_kernel kernel = clCreateKernel(program,"matmul",NULL);
printf(" Here: 8 \n");
// set kernel argument values

err=0;
err= clSetKernelArg(kernel,0,sizeof(int),&Arow);
err|= clSetKernelArg(kernel,1,sizeof(int),&AcolBrow);
err|= clSetKernelArg(kernel,2,sizeof(int),&Bcol);
err|= clSetKernelArg(kernel,3,sizeof(cl_mem),&Ad);
err|= clSetKernelArg(kernel,4,sizeof(cl_mem),&Bd);
err|= clSetKernelArg(kernel,5,sizeof(cl_mem),&Cd);
printf(" Here: 9 \n");
// Write to device buffers. Ad=A and Bd=B   : Equivalent to CUDAmemcpy

err=clEnqueueWriteBuffer(queue,Ad,CL_TRUE,0,sizeof(cl_float)*sizea,A,0,NULL,NULL);
err=clEnqueueWriteBuffer(queue,Bd,CL_TRUE,0,sizeof(cl_float)*sizeb,B,0,NULL,NULL);
printf(" Here: 10 \n");
// since we have set the copy as synchronous we will be creating event
cl_event event;

// Execute the kernel over entire range of C matrix

size_t global[2];
size_t local[2];
cl_uint * ndim; // no. of dimension in ND range. 3rd parameter in kernel call signifies the dimension.

global[0]=(size_t)Arow;
global[1]=(size_t)Bcol;

* ndim=2; // because we want 2-D multiplication. Gives n

/* no local size declaration cause we are not making work groups ie blocks.We are just make making oneblock where everythread takes one element of A,B and computes C */
printf(" Here: 11 \n");
err = clEnqueueNDRangeKernel(queue,kernel,*ndim,NULL,global,NULL,0,NULL,&event); // the NULL position after global is for passing local dimension. In this case we don't have one.
clFinish(queue); // wait for kernel to finish before we begin copying the result back on host
printf(" Here: 12 \n");
//read back the result

err=clEnqueueReadBuffer(queue,Cd,CL_TRUE,0,sizeof(cl_float)*sizec,C,0,NULL,NULL);


// Checking whether the computations done are on CPU or GPU

cl_device_type dev_type;
clGetDeviceInfo(deviceid, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
if (dev_type == CL_DEVICE_TYPE_GPU) {
    printf("Following code was complied on GPU ! \n \n \n \n");
}
else
    printf("Following code was complied on CPU ! \n \n \n \n");
printf(" Here: 13 \n \n \n");
// Displaying results
printf("Result is: \n");
for(ii=0;ii<sizec;ii++)
printf("%f  ",C[ii]);

printf("\n \n \n");

// free all memory
printf(" Here: 14 \n");
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseMemObject(Ad);
clReleaseMemObject(Bd);
clReleaseMemObject(Cd);
clReleaseCommandQueue(queue);
clReleaseContext(context);
printf(" Here: 15 \n");
return 0;
printf(" Here: 16 \n");

}
我知道这个内核的功能和真正的矩阵乘法内核并没有什么相似之处,我这样写只是为了检查内核本身是否有错误。请给出能帮助我把代码运行起来的意见或建议。


提前感谢。

你需要更多的错误检查。不要只是取得错误代码,出了问题还要打印一条消息!另外,我认为内核参数应该使用 constant 而不是 const——const 并不是地址空间限定符,不过 CLC 编译器应该能够容忍这一点。所以,请添加一些调试代码,以便了解程序究竟在哪里失败。

我已经在代码中加入了标记输出。改用 constant 没有帮助,错误依旧。而且出现了奇怪的现象:如果把打印 A 和 B 元素的循环注释掉(同时保留 unsigned long int ii; 的声明),则只显示到标记 "Here: 13",不显示 "Here: 14" 和 "Here: 15",也不显示 C 的元素!如果取消这些循环的注释,则所有标记一直显示到 "Here: 15",C 的元素也会显示,但仍然会出现段错误。你能在你的机器上运行这段代码并分享输出吗?请帮帮忙。

你没有为 ndim 分配内存,我猜这破坏了你的栈。你能在程序开始时打印 sizec 的值,并在显示 C 的元素之前再打印一次吗?还可以把 C 作为指针值打印出来。这肯定是内存损坏。

天哪!是的,你说得对,造成问题的正是指针 *ndim,我忘了给它分配内存。改好之后一切正常!非常感谢 @Thomas,靠我自己根本不可能调试出来。再次感谢。
/*
 * Placeholder kernel for smoke-testing the host pipeline: each work-item that
 * maps to an element of C stores the constant 3 there. A and B are not read.
 *
 *   Mdim, Ndim, Pdim - matrix dimensions; C is treated as Mdim x Pdim,
 *                      stored row by row
 *   A, B             - input buffers (unused by this placeholder)
 *   C                - output buffer with at least Mdim*Pdim elements
 *
 * Global id 0 is used as the row index and global id 1 as the column index.
 */
__kernel void matmul(const int Mdim, const int Ndim,const int Pdim,__global float* A,__global float* B,__global float* C)
{
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    /* Guard and stride both use C's own shape (Mdim x Pdim). Using Ndim here
     * is only correct when all three dimensions happen to be equal. */
    if (i < Mdim && j < Pdim) {
        C[i * Pdim + j] = 3;
    }
}
   135 > Sun Mar 17 : 07:49 PM : samkit@samkit:~/AMD/AMD-APP-SDK-v2.8-RC-lnx32/samples/opencl/bin/x86$ ./matmul
 Here: 1 
 Here: 2 
 Here: 3 
 Here: 4 
 Here: 5 
 Here: 6 
 Here: 7 
 Here: 8 
 Here: 9 
 Here: 10 
 Here: 11 
 Here: 12 
Following code was complied on GPU ! 



 Here: 13 


Result is: 
Segmentation fault (core dumped)