2D卷积错误结果Cuda常量内存_Cuda_Gpu_Convolution

2D卷积错误结果Cuda常量内存

cuda

2D卷积错误结果Cuda常量内存,cuda,gpu,convolution,Cuda,Gpu,Convolution,我在内核代码中苦苦挣扎。我已经更新了这个，包括支持文件，但这些文件已经提供，应该是正确的这是我的第一个GPU程序之一，我花了几个小时尝试新事物，但我似乎无法正确地实现这一点。它正在编译和运行，但结果不正确我基本上很难理解我到底需要做些什么，因为这个内核给出了错误的结果。我正在尝试将输入图像的磁贴加载到共享内存（Ns[][]，我认为我做得很正确），并在输入图像磁贴上应用过滤器（我正在努力解决这个问题）如果有更有经验的人能帮我找出我到底哪里出了错，并给我一个解决问题的方法，我将不胜感激。如果我

我在内核代码中苦苦挣扎。我已经更新了这个，包括支持文件，但这些文件已经提供，应该是正确的

这是我的第一个GPU程序之一，我花了几个小时尝试新事物，但我似乎无法正确地实现这一点。它正在编译和运行，但结果不正确

我基本上很难理解我到底需要做些什么，因为这个内核给出了错误的结果。我正在尝试将输入图像的磁贴加载到共享内存（Ns[][]，我认为我做得很正确），并在输入图像磁贴上应用过滤器（我正在努力解决这个问题）

如果有更有经验的人能帮我找出我到底哪里出了错，并给我一个解决问题的方法，我将不胜感激。如果我问错了这个问题，我感谢你抽出时间，并表示歉意

main.cu：

#include <stdio.h>
#include "support.h"
#include "kernel.cu"
#include <time.h>

int main(int argc, char* argv[]){
Timer timer;
time_t t;


// Initialize host variables ----------------------------------------------

printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);

Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image
Matrix N_d, P_d;
unsigned imageHeight, imageWidth;
cudaError_t cuda_ret;
dim3 dim_grid, dim_block;

/* Read image dimensions */
if (argc == 1) {
    imageHeight = 600;
    imageWidth = 1000;
} else if (argc == 2) {
    imageHeight = atoi(argv[1]);
    imageWidth = atoi(argv[1]);
} else if (argc == 3) {
    imageHeight = atoi(argv[1]);
    imageWidth = atoi(argv[2]);
} else {
    printf("\n    Invalid input parameters!"
       "\n    Usage: ./convolution          # Image is 600 x 1000"
       "\n    Usage: ./convolution <m>      # Image is m x m"
       "\n    Usage: ./convolution <m> <n>  # Image is m x n"
       "\n");
    exit(0);
}

/* Allocate host memory */
M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE);
N_h = allocateMatrix(imageHeight, imageWidth);
P_h = allocateMatrix(imageHeight, imageWidth);

/* Initialize filter and images */
initMatrix(M_h);
initMatrix(N_h);

stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf("    Image: %u x %u\n", imageHeight, imageWidth);
printf("    Mask: %u x %u\n", FILTER_SIZE, FILTER_SIZE);

// Allocate device variables ----------------------------------------------

printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);

N_d = allocateDeviceMatrix(imageHeight, imageWidth);
P_d = allocateDeviceMatrix(imageHeight, imageWidth);

cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Copy host variables to device ------------------------------------------

printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);

/* Copy image to device global memory */
copyToDeviceMatrix(N_d, N_h);
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));

 dim_grid = dim3(((N_h.width / BLOCK_SIZE) + 1), ((N_h.height / BLOCK_SIZE) + 1));
 dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);


cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);


convolution<<<dim_grid, dim_block>>>(N_d, P_d);

cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");

cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Copy device variables from host ----------------------------------------

printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);

copyFromDeviceMatrix(P_h, P_d);

cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Verify correctness -----------------------------------------------------

printf("Verifying results..."); fflush(stdout);

verify(M_h, N_h, P_h);

// Free memory ------------------------------------------------------------

freeMatrix(M_h);
freeMatrix(N_h);
freeMatrix(P_h);
freeDeviceMatrix(N_d);
freeDeviceMatrix(P_d);

return 0;
}

#包括
#包括“support.h”
#包括“kernel.cu”
#包括
int main（int argc，char*argv[]）{
定时器；
时间；
//初始化主机变量----------------------------------------------
printf（“\n设置问题…”）；fflush（stdout）；
开始时间（和计时器）；
矩阵M_h，N_h，P_h；//M:filter，N:input image，P:output image
矩阵N_d，P_d；
无符号图像高度、图像宽度；
(小标题)；
dim3尺寸网格、尺寸块；
/*读取图像尺寸*/
如果（argc==1）{
图像高度=600；
图像宽度=1000；
}else if（argc==2）{
imageHeight=atoi（argv[1]）；
imageWidth=atoi（argv[1]）；
}else if（argc==3）{
imageHeight=atoi（argv[1]）；
imageWidth=atoi（argv[2]）；
}否则{
printf（“\n无效的输入参数！”
“\n用法：./卷积#图像为600 x 1000”
“\n用法：./卷积#图像为m x m”
“\n用法：./卷积#图像为m x n”
“\n”）；
出口（0）；
}
/*分配主机内存*/
M_h=分配矩阵（过滤器大小，过滤器大小）；
N_h=分配矩阵（图像高度、图像宽度）；
P_h=分配矩阵（图像高度、图像宽度）；
/*初始化过滤器和图像*/
初始矩阵（M_h）；
初始矩阵（N_h）；
stopTime（&timer）；printf（“%f s\n”，elapsedTime（timer））；
printf（“图像：%u x%u\n”，图像高度，图像宽度）；
printf（“掩码：%u x%u\n”，筛选器大小，筛选器大小）；
//分配设备变量----------------------------------------------
printf（“分配设备变量…”）；fflush（标准输出）；
开始时间（和计时器）；
N_d=allocateDeviceMatrix（图像高度、图像宽度）；
P_d=分配的设备矩阵（图像高度、图像宽度）；
cudaDeviceSynchronize（）；
stopTime（&timer）；printf（“%f s\n”，elapsedTime（timer））；
//将主机变量复制到设备------------------------------------------
printf（“将数据从主机复制到设备…”）；fflush（stdout）；
开始时间（和计时器）；
/*将映像复制到设备全局内存*/
copyToDeviceMatrix（N_d，N_h）；
CUDAMEMCPITOSYMBOL（M_h，M_c，过滤器大小*大小（浮点））；
尺寸网格=尺寸3（（N_高度/块大小）+1），（N_高度/块大小）+1）；
尺寸块=尺寸3（块尺寸，块尺寸）；
cudaDeviceSynchronize（）；
stopTime（&timer）；printf（“%f s\n”，elapsedTime（timer））；
//启动内核----------------------------------------------------------
printf（“启动内核…”）；fflush（标准输出）；
开始时间（和计时器）；
卷积（N_d，P_d）；
cuda_ret=cudaDeviceSynchronize（）；
if（cuda_ret！=cudaSuccess）致命（“无法启动/执行内核”）；
cudaDeviceSynchronize（）；
stopTime（&timer）；printf（“%f s\n”，elapsedTime（timer））；
//从主机复制设备变量----------------------------------------
printf（“将数据从设备复制到主机…”）；fflush（stdout）；
开始时间（和计时器）；
复制设备矩阵（P_h，P_d）；
cudaDeviceSynchronize（）；
stopTime（&timer）；printf（“%f s\n”，elapsedTime（timer））；
//验证正确性-----------------------------------------------------
printf（“验证结果…”）；fflush（标准输出）；
验证（M_h，N_h，P_h）；
//空闲内存------------------------------------------------------------
自由矩阵（M_h）；
自由矩阵（N_h）；
自由矩阵（P_h）；
freeDeviceMatrix（N_d）；
freeDeviceMatrix（P_d）；
返回0；
}

kernel.cu：

__constant__ float M_c[FILTER_SIZE][FILTER_SIZE];
__global__ void convolution(Matrix N, Matrix P){

__shared__ float Ns[TILE_SIZE + 5 - 1][TILE_SIZE + 5 -1];
int i, j;
float output = 0.0f;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row_o = blockIdx.y * TILE_SIZE + ty;
int col_o = blockIdx.x * TILE_SIZE + tx;
int row_i = row_o - 2;
int col_i = col_o - 2;                  
if((row_i >= 0) && (row_i < N.height) && (col_i >= 0) && (col_i < N.width)){
        Ns[ty][tx] = N.elements[row_i * N.width + col_i];           
}
else{
        Ns[ty][tx] = 0.0f;
}
__syncthreads();
if(ty < TILE_SIZE && tx < TILE_SIZE){
        for(i = 0; i < 5; i++){
                for(j = 0; j < 5; j++){
                output += M_c[i][j] * Ns[i + ty][j + tx];
                }
        }
}
if(row_o < P.height && col_o < P.width){
        P.elements[row_o * P.width + col_o] = output;
}
}

\uuuu常量\uuuuu浮点M_c[过滤器大小][过滤器大小]；
__全局无效卷积（矩阵N，矩阵P）{
__共享浮点数Ns[分片大小+5-1][分片大小+5-1]；
int i，j；
浮动输出=0.0f；
int tx=线程idx.x；
int ty=threadIdx.y；
int row\u o=块IDX.y*平铺大小+ty；
int col_o=块IDX.x*分片大小+tx；
int row_i=row_o-2；
int col_i=col_o-2；
如果（（行i>=0）和（行i=0）和（列i


支持.h：
#ifndef __FILEH__
#define __FILEH__

#include <sys/time.h>

typedef struct {
    struct timeval startTime;
    struct timeval endTime;
} Timer;

// Matrix Structure declaration
typedef struct {
    unsigned int width;
    unsigned int height;
    unsigned int pitch;
    float* elements;
} Matrix;

#define FILTER_SIZE 5
#define TILE_SIZE 12
#define BLOCK_SIZE (TILE_SIZE + FILTER_SIZE - 1)

Matrix allocateMatrix(unsigned height, unsigned width);
void initMatrix(Matrix mat);
Matrix allocateDeviceMatrix(unsigned height, unsigned width);
void copyToDeviceMatrix(Matrix dst, Matrix src);
void copyFromDeviceMatrix(Matrix dst, Matrix src);
void verify(Matrix M, Matrix  N, Matrix P);
void freeMatrix(Matrix mat);
void freeDeviceMatrix(Matrix mat);
void startTime(Timer* timer);
void stopTime(Timer* timer);
float elapsedTime(Timer timer);

#define FATAL(msg, ...) \
do {\
    fprintf(stderr, "[%s:%d] "msg"\n", __FILE__, __LINE__, ##__VA_ARGS__);\
    exit(-1);\
} while(0)

#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
#endif

\ifndef\uu文件__
#定义文件__
#包括
类型定义结构{
struct timeval startTime；
结构timeval endTime；
}定时器；
//矩阵结构声明
类型定义结构{
无符号整数宽度；
无符号整数高度；
无符号整数音高；
浮动*元素；
}基质；
#定义过滤器大小5
#定义瓷砖尺寸12
#定义块大小（平铺大小+过滤器大小-1）
矩阵分配器矩阵（无符号高度、无符号宽度）；
空隙率矩阵（矩阵mat）；
矩阵allocateDeviceMatrix（无符号高度、无符号宽度）；
无效复制设备矩阵（矩阵dst、矩阵src）；
来自设备矩阵的无效副本（矩阵dst、矩阵src）；
无效验证（矩阵M、矩阵N、矩阵P）；
无孔隙基质（基质垫）；
void-freeDeviceMatrix（矩阵矩阵矩阵）；
无效开始时间（计时器*计时器）；
无效停止时间（计时器*计时器）；
浮动时间（计时器）；
#定义致命（消息…）\
做{\
fprintf（标准字符，[%s:%d]“msg”\n“，\uuuuuu文件，\uuuuu行”##
#include <stdlib.h>
#include <stdio.h>

#include "support.h"

Matrix allocateMatrix(unsigned height, unsigned width)
{
    Matrix mat;
    mat.height = height;
    mat.width = mat.pitch = width;
    mat.elements = (float*)malloc(height*width*sizeof(float));
    if(mat.elements == NULL) FATAL("Unable to allocate host");

    return mat;
}

void initMatrix(Matrix mat)
{
    for (unsigned int i=0; i < mat.height*mat.width; i++) {
        mat.elements[i] = (rand()%100)/100.00;
    }
}

Matrix allocateDeviceMatrix(unsigned height, unsigned width)
{
    Matrix mat;
    cudaError_t cuda_ret;

    mat.height = height;
    mat.width = mat.pitch = width;
    cuda_ret = cudaMalloc((void**)&(mat.elements), height*width*sizeof(float));
    if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");

    return mat;
}

void copyToDeviceMatrix(Matrix dst, Matrix src)
{
    cudaError_t cuda_ret;
    cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyHostToDevice);
    if(cuda_ret != cudaSuccess) FATAL("Unable to copy to device");
}

void copyFromDeviceMatrix(Matrix dst, Matrix src)
{
    cudaError_t cuda_ret;
    cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyDeviceToHost);
    if(cuda_ret != cudaSuccess) FATAL("Unable to copy from device");
}

void verify(Matrix M, Matrix  N, Matrix P) {

  const float relativeTolerance = 1e-6;

  for(int row = 0; row < N.height; ++row) {
    for(int col = 0; col < N.width; ++col) {
      float sum = 0.0f;
      for(int i = 0; i < M.height; ++i) {
        for(int j = 0; j < M.width; ++j) {
            int iN = row - M.height/2 + i;
            int jN = col - M.width/2 + j;
            if(iN >= 0 && iN < N.height && jN >= 0 && jN < N.width) {
                sum += M.elements[i*M.width + j]*N.elements[iN*N.width + jN];
            }
        }
      }
      float relativeError = (sum - P.elements[row*P.width + col])/sum;
      if (relativeError > relativeTolerance
        || relativeError < -relativeTolerance) {
        printf("TEST FAILED\n\n");
        exit(0);
      }
    }
  }
  printf("TEST PASSED\n\n");

}

void freeMatrix(Matrix mat)
{
    free(mat.elements);
    mat.elements = NULL;
}

void freeDeviceMatrix(Matrix mat)
{
    cudaFree(mat.elements);
    mat.elements = NULL;
}

void startTime(Timer* timer) {
    gettimeofday(&(timer->startTime), NULL);
}

void stopTime(Timer* timer) {
    gettimeofday(&(timer->endTime), NULL);
}

float elapsedTime(Timer timer) {
    return ((float) ((timer.endTime.tv_sec - timer.startTime.tv_sec) \
                + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
}

cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));

cudaMemcpyToSymbol(M_c, M_h.elements,FILTER_SIZE*FILTER_SIZE*sizeof(float));