使用CUDA C的二维矩阵积分图像或求和面积表_C_Image Processing_Cuda_Gpgpu_Gpu

使用CUDA C的二维矩阵积分图像或求和面积表

c image-processing cuda

使用CUDA C的二维矩阵积分图像或求和面积表,c,image-processing,cuda,gpgpu,gpu,C,Image Processing,Cuda,Gpgpu,Gpu,我试图计算一个二维矩阵的求和面积表，其中行数和列数不相等。我遇到了一个小问题：当行数和列数相等时，我的代码似乎运行正常；但当两者不相等时，它无法正确计算输出的最后一行。我不明白为什么会这样。积分图像/求和面积表的基本算法：在积分图像中，每个像素（即每个索引位置的元素）的值等于其上方及左侧所有矩阵元素（包括该元素自身）之和。例如，对于具有以下元素的 3x2（3 行 2 列）输入数组： $\begin{bmatrix} 5 & 2 \\ 5 & 2 \\ 5 & 2 \end{bmatrix}$ 输出数组中的积分图像如下所示： $\begin{bmatrix} 5 & 7 \\ 10 & 14 \\ 15 & 21 \end{bmatrix}$

我试图计算一个二维矩阵的求和面积表（summed area table），其中行数和列数不相等。我遇到了一个小问题：当行数和列数相等时，我的代码似乎运行正常；但当两者不相等时，它无法正确计算输出的最后一行。我不明白为什么会这样。
积分图像/求和面积表的基本算法：在积分图像中，每个像素（即每个索引位置的元素）的值等于其上方及左侧所有矩阵元素（包括该元素自身）之和。例如，对于具有以下元素的 3x2（3 行 2 列）输入数组：

$$\begin{bmatrix} 5 & 2 \\ 5 & 2 \\ 5 & 2 \end{bmatrix}$$
输出数组中的积分图像如下所示：

$$\begin{bmatrix} 5 & 7 \\ 10 & 14 \\ 15 & 21 \end{bmatrix}$$
基本上，以下是我在CUDA C中尝试做的事情：

for(int matrixElement_y_index=0; matrixElement_y_index<=total_rows-1; matrixElement_y_index++) { //matrixElement_x_index and matrixElement_y_index represent (x,y) indices of each matrix element for(int matrixElement_x_index=0; matrixElement_x_index<=total_columns-1; matrixElement_x_index++) { int temp=0; for(int r=0;r<=(matrixElement_y_index);r++) { for(int c=0; c<=matrixElement_x_index;c++) { temp=temp+input[c][r]; } } output[matrixElement_y_index][matrixElement_x_index]=temp; } }

您的主要问题是内存的使用和存储方式有误，而且您的代码还破坏了堆！我按行主序（row-major order）重写了代码，这是 C/C++ 中通常采用的方式。
第一个错误出现在您把输入写入主机内存矩阵 a[r*M+c] 的时候。因为 r 的取值范围是 0..M-1（M=3），c 的取值范围是 0..N-1（N=2），所以最大索引是 2*3+1=7。但是您的矩阵只有 6 个元素——最大索引应为 5！因此我修改了所有的矩阵访问。
做了这些修改之后，我还必须调整您的网格设置，现在它是 dim3 grid(N,M);。
如果您不确定某个变量表示什么或该如何使用，请使用含义明确的名称，就像您在 C 参考代码中所做的那样。
经过这些修改，您的代码在我这里可以正常运行了。请注意，矩阵的输入方式也有所改变。
包含上述修改的完整代码：核函数：
__global__ void image_integral(int *a, int*b, int rowsTotal,int colsTotal) { // Thread Ids equal to block Ids because the each blocks contains one thread only. int col = blockIdx.x; int row = blockIdx.y; int temp=0; if(col < colsTotal && row < rowsTotal) { // The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array. for(int r=0;r<=row;r++) { // The second loop iterates from zero to the X index of the thread which represents the corresponding element of the output/input array for(int c=0; c<=col; c++) { temp = temp+a[r*colsTotal+c]; } } } //Transfer the final result to the output array b[row*colsTotal+col]=temp; }
谢谢！都是些基本错误，但偏偏就是这些错误让我头疼。
将此代码用于较大的矩阵（525 × 394）时，比串行实现更慢（0.80 对 0.002）；用于更大的矩阵（7680 × 4320）时则耗时极长，始终无法结束。以上情况均出现在 Titan X 上。
__global__ void image_integral(int *a, int*b, int rowsTotal,int colsTotal) { // Thread Ids equal to block Ids because the each blocks contains one thread only. int col = blockIdx.x; int row = blockIdx.y; int temp=0; if(col < colsTotal && row < rowsTotal) { // The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array. 
// Continuation of the image_integral kernel body (still inside its bounds check).
        // Outer loop: rows 0..row of the element handled by this thread.
        for (int r = 0; r <= row; r++)
        {
            // Inner loop: columns 0..col of the element handled by this thread.
            for (int c = 0; c <= col; c++)
            {
                temp = temp + a[r * colsTotal + c];
            }
        }
        // Store the result while still inside the bounds check, so a thread
        // whose (row, col) lies outside the matrix never writes past the end of b.
        b[row * colsTotal + col] = temp;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Reads an M x N integer matrix from standard input, computes its summed-area
// table on the GPU with image_integral (one single-thread block per element)
// and prints the result.
// Returns 0 on success and 1 when a host allocation or any CUDA call fails.
int main()
{
    // M is number of rows
    // N is number of columns
    int M = 3, N = 2, m_e = 0;
    int total_e = M * N;
    int widthstep = total_e * sizeof(int);

    int *matrix_a = (int *)malloc(widthstep);
    int *matrix_b = (int *)malloc(widthstep);
    if (matrix_a == 0 || matrix_b == 0)
    {
        cerr << "Host memory allocation failed" << endl;
        free(matrix_a); // free(0) is a no-op
        free(matrix_b);
        return 1;
    }

    // Fill the input in row-major order (index = r * N + c) and clear the output.
    cout << "Enter elements for " << M << "x" << N << " matrix";
    for (int r = 0; r < M; r++)
    {
        for (int c = 0; c < N; c++)
        {
            cout << "Enter Matrix element [ " << r << "," << c << "]";
            cin >> m_e;
            matrix_a[r * N + c] = m_e;
            matrix_b[r * N + c] = 0;
        }
    }

    cout << "Input:" << endl;
    for (int r = 0; r < M; r++)
    {
        for (int c = 0; c < N; c++)
        {
            cout << matrix_a[r * N + c] << " ";
        }
        cout << endl;
    }
    cout << endl;

    // Start from null so the cudaFree calls below are safe even if an
    // allocation fails part-way through.
    int *d_matrix_a = 0;
    int *d_matrix_b = 0;

    bool ok = cudaCallSucceeded(cudaMalloc(&d_matrix_a, widthstep), "cudaMalloc(d_matrix_a)")
           && cudaCallSucceeded(cudaMalloc(&d_matrix_b, widthstep), "cudaMalloc(d_matrix_b)")
           && cudaCallSucceeded(cudaMemcpy(d_matrix_a, matrix_a, widthstep, cudaMemcpyHostToDevice),
                                "cudaMemcpy(host -> d_matrix_a)")
           && cudaCallSucceeded(cudaMemcpy(d_matrix_b, matrix_b, widthstep, cudaMemcpyHostToDevice),
                                "cudaMemcpy(host -> d_matrix_b)");

    if (ok)
    {
        // Creating a grid where the number of blocks are equal to the number of
        // pixels or input matrix elements: grid.x spans the N columns, grid.y
        // spans the M rows. Each block contains only one thread.
        dim3 grid(N, M);
        image_integral<<<grid, 1>>>(d_matrix_a, d_matrix_b, M, N);

        // A launch returns no status itself: cudaGetLastError() reports bad
        // launch configurations, the synchronize reports faults raised while
        // the kernel ran. cudaDeviceSynchronize replaces the deprecated
        // cudaThreadSynchronize.
        ok = cudaCallSucceeded(cudaGetLastError(), "image_integral launch")
          && cudaCallSucceeded(cudaDeviceSynchronize(), "image_integral execution")
          && cudaCallSucceeded(cudaMemcpy(matrix_b, d_matrix_b, widthstep, cudaMemcpyDeviceToHost),
                               "cudaMemcpy(d_matrix_b -> host)");
    }

    if (ok)
    {
        cout << "The Summed Area table is: " << endl;
        for (int r = 0; r < M; r++)
        {
            for (int c = 0; c < N; c++)
            {
                cout << matrix_b[r * N + c] << " ";
            }
            cout << endl;
        }
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);

    return ok ? 0 : 1;
}