Cuda （使用相同的网格/块设置）：-当输入的大小足够小时，gpu/cpu时间差异很小（因为H2D/D2H和内核启动开销）；-与大尺寸输入有相当大的差异（gpu在计算速度上的增益应隐藏开销）。现在我正在根据你的问题编辑我的答案谢谢@Maria Chiara Cec_Cuda_Nvidia_Gpu

Cuda （使用相同的网格/块设置）：- 当输入尺寸足够小时，GPU/CPU 的耗时差异很小（因为 H2D/D2H 传输和内核启动开销）；- 当输入尺寸较大时，两者差异相当大（GPU 在计算速度上的增益应能掩盖这些开销）。现在我正在根据你的问题编辑我的答案。谢谢 @Maria Chiara Cec

cuda

Cuda （使用相同的网格/块设置）：-当输入的大小足够小时，gpu/cpu时间差异很小（因为H2D/D2H和内核启动开销）；-与大尺寸输入有相当大的差异（gpu在计算速度上的增益应隐藏开销）。现在我正在根据你的问题编辑我的答案谢谢@Maria Chiara Cec,cuda,nvidia,gpu,Cuda,Nvidia,Gpu,（使用相同的网格/块设置）：-当输入的大小足够小时，gpu/cpu时间差异很小（因为H2D/D2H和内核启动开销）；-与大尺寸输入有相当大的差异（gpu在计算速度上的增益应隐藏开销）。现在我正在根据你的问题编辑我的答案谢谢@Maria Chiara Cecconi，因为你我计算了更精细的时间流逝。@AmilaSenadheera不客气。从我在回答中提到的Robert Crovella的同一个答案中，还有一句有趣的话（这可能对你有用）：许多代码要么是计算绑定的，要么是内存绑定的，根据粗略的经验，G

（使用相同的网格/块设置）：- 当输入尺寸足够小时，GPU/CPU 的耗时差异很小（因为 H2D/D2H 传输和内核启动开销）；- 当输入尺寸较大时，两者差异相当大（GPU 在计算速度上的增益应能掩盖这些开销）。现在我正在根据你的问题编辑我的答案。谢谢 @Maria Chiara Cecconi，多亏了你，我测得了更精确的耗时。@AmilaSenadheera 不客气。在我回答中提到的 Robert Crovella 的同一个答案里，还有一句有趣的话（可能对你有用）：许多代码要么是计算受限的，要么是内存受限的；根据粗略的经验，GPU 在这两方面都提供大约 5 倍的能力。因此，显著超过 5 倍的加速通常会受到怀疑，或者被认为是一厢情愿。我试图在两种图像尺寸下平衡各个块的工作量。每个块都有

32x32

个线程。我并没有因为把块大小设为 32 而限制可独立执行的部分。GPU 的表现优于 CPU。您是否注意到，对于相同的图像大小，

5x5

比

3x3

做得更好？

        // Launch geometry: 32x32 threads per block. The block count is rounded up in
        // each dimension so that images whose width/height are not multiples of 32
        // are still fully covered; the kernel's bounds check discards surplus threads.
        int block_size = 32;
        int grid_size = (width + block_size - 1)/block_size;    // blocks along x (image columns)
        int grid_size_y = (height + block_size - 1)/block_size; // blocks along y (image rows)
        dim3 dimBlock(block_size, block_size, 1);
        dim3 dimGrid(grid_size, grid_size_y, 1);

        // Time the GPU filter. Only the kernel launch and its execution fall inside
        // this interval; no host<->device copy is issued between start_d and end_d.
        clock_t start_d=clock();
        meanFilter_d <<< dimGrid, dimBlock >>> (image_data_d, result_image_data_d, width, height, half_window);
        // A launch returns immediately: first pick up launch-configuration errors,
        // then block until the kernel has finished so end_d reflects the real run
        // time (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize).
        cudaError_t status_d = cudaGetLastError();
        if(status_d == cudaSuccess){status_d = cudaDeviceSynchronize();}
        clock_t end_d=clock();
        // If status_d != cudaSuccess the kernel did not run to completion and the
        // interval end_d - start_d must not be used as a measurement.

        // Time the CPU reference filter on the same image.
        clock_t start_h = clock();
        meanFilter_h(data, result_image_data_h1, width, height, window_size);
        clock_t end_h = clock();

/**
 * CPU mean (box) filter over an interleaved 3-bytes-per-pixel image.
 *
 * Every output pixel is the per-channel average of the input pixels inside a
 * square window centred on it. Near the borders the window is clipped to the
 * image, and the average is taken over the pixels that remain.
 *
 * The image is addressed row by row: pixel (row i, column j) starts at byte
 * 3*(i*image_width + j).
 *
 * @param raw_image_matrix    input image, 3*image_width*image_height bytes
 * @param filtered_image_data output image, same size; must not alias the input
 *                            because neighbouring input pixels are re-read
 * @param image_width         pixels per row
 * @param image_height        number of rows
 * @param window_size         window edge length; the radius used is
 *                            window_size/2 rounded down (3 -> 3x3, 5 -> 5x5,
 *                            4 -> 5x5). Values below 1 copy the image.
 */
void meanFilter_h(unsigned char* raw_image_matrix,unsigned char* filtered_image_data,int image_width, int image_height, int window_size)
{
    int half_window = (window_size-window_size % 2)/2;
    // A negative radius would produce an empty window and a division by zero.
    if(half_window < 0){half_window = 0;}

    for(int i = 0; i < image_height; i += 1){
        // Row limits of the window depend only on i, so compute them once per row.
        const int top = (i-half_window >= 0) ? i-half_window : 0;
        const int bottom = (i+half_window <= image_height-1) ? i+half_window : image_height-1;

        for(int j = 0; j < image_width; j += 1){
            // Column limits of the window, clipped to the image.
            const int left = (j-half_window >= 0) ? j-half_window : 0;
            const int right = (j+half_window <= image_width-1) ? j+half_window : image_width-1;

            double first_byte = 0;
            double second_byte = 0;
            double third_byte = 0;
            // Accumulate the three channels over the clipped window.
            for(int x = top; x <= bottom; x++){
                for(int y = left; y <= right; y++){
                    // The row stride is the image WIDTH (pixels per row), not the height.
                    int pos = 3*(x*image_width + y); // three bytes per pixel
                    first_byte += raw_image_matrix[pos];
                    second_byte += raw_image_matrix[pos+1];
                    third_byte += raw_image_matrix[pos+2];
                }
            }

            // Number of pixels actually summed; always >= 1 because (i, j) is inside the window.
            int effective_window_size = (bottom-top+1)*(right-left+1);
            int k = 3*(i*image_width+j); // byte offset of the output pixel
            filtered_image_data[k] = first_byte/effective_window_size;
            filtered_image_data[k+1] = second_byte/effective_window_size;
            filtered_image_data[k+2] = third_byte/effective_window_size;
        }
    }
}

/**
 * GPU mean (box) filter over an interleaved 3-bytes-per-pixel image.
 *
 * Launch layout: a 2D grid of 2D blocks in which x indexes image columns and
 * y indexes image rows; one thread produces one output pixel. The grid may be
 * larger than the image, since threads outside it return immediately. No
 * shared memory is used.
 *
 * The image is addressed row by row: pixel (row i, column j) starts at byte
 * 3*(i*image_width + j). The window is clipped at the image borders and the
 * average is taken over the pixels that remain.
 *
 * @param raw_image_matrix    input image, 3*image_width*image_height bytes
 * @param filtered_image_data output image, same size; must not alias the input
 *                            because other threads re-read neighbouring pixels
 * @param image_width         pixels per row
 * @param image_height        number of rows
 * @param half_window         window radius (1 -> 3x3, 2 -> 5x5); negative
 *                            values are treated as 0
 */
__global__ void meanFilter_d(unsigned char* raw_image_matrix, unsigned char* filtered_image_data, int image_width, int image_height, int half_window)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x; // column handled by this thread
    int i = blockIdx.y * blockDim.y + threadIdx.y; // row handled by this thread

    // Surplus threads of a grid that overhangs the image have nothing to do.
    if (i >= image_height || j >= image_width){ return; }

    // A negative radius would produce an empty window and a division by zero.
    if (half_window < 0){ half_window = 0; }

    // Window limits, clipped to the image.
    const int top = (i-half_window >= 0) ? i-half_window : 0;
    const int bottom = (i+half_window <= image_height-1) ? i+half_window : image_height-1;
    const int left = (j-half_window >= 0) ? j-half_window : 0;
    const int right = (j+half_window <= image_width-1) ? j+half_window : image_width-1;

    // Integer accumulators: sums of bytes are exact in unsigned int for any
    // window of up to about 16 million pixels, and this avoids double-precision
    // arithmetic in the kernel. Integer division truncates exactly like the
    // double -> unsigned char conversion did.
    unsigned int first_byte = 0;
    unsigned int second_byte = 0;
    unsigned int third_byte = 0;
    for(int x = top; x <= bottom; x++){
        for(int y = left; y <= right; y++){
            // The row stride is the image WIDTH (pixels per row), not the height.
            int pos = 3*(x*image_width + y); // three bytes per pixel
            first_byte += raw_image_matrix[pos];
            second_byte += raw_image_matrix[pos+1];
            third_byte += raw_image_matrix[pos+2];
        }
    }

    // Number of pixels actually summed; always >= 1 because (i, j) is inside the window.
    const unsigned int effective_window_size = (unsigned int)((bottom-top+1)*(right-left+1));
    int k = 3*(i*image_width+j); // byte offset of the output pixel
    filtered_image_data[k] = (unsigned char)(first_byte/effective_window_size);
    filtered_image_data[k+1] = (unsigned char)(second_byte/effective_window_size);
    filtered_image_data[k+2] = (unsigned char)(third_byte/effective_window_size);
}

// Current time point read from the standard library's system clock.
std::chrono::system_clock::now()

// Current time point read from std::chrono::high_resolution_clock.
std::chrono::high_resolution_clock::now();