Cuda (使用相同的网格/块设置):-当输入的大小足够小时,gpu/cpu时间差异很小(因为H2D/D2H和内核启动开销);-与大尺寸输入有相当大的差异(gpu在计算速度上的增益应隐藏开销)。现在我正在根据你的问题编辑我的答案谢谢@Maria Chiara Cec
Cuda (使用相同的网格/块设置):-当输入的大小足够小时,gpu/cpu时间差异很小(因为H2D/D2H和内核启动开销);-与大尺寸输入有相当大的差异(gpu在计算速度上的增益应隐藏开销)。现在我正在根据你的问题编辑我的答案谢谢@Maria Chiara Cec,cuda,nvidia,gpu,Cuda,Nvidia,Gpu,(使用相同的网格/块设置):-当输入的大小足够小时,gpu/cpu时间差异很小(因为H2D/D2H和内核启动开销);-与大尺寸输入有相当大的差异(gpu在计算速度上的增益应隐藏开销)。现在我正在根据你的问题编辑我的答案谢谢@Maria Chiara Cecconi,因为你我计算了更精细的时间流逝。@AmilaSenadheera不客气。从我在回答中提到的Robert Crovella的同一个答案中,还有一句有趣的话(这可能对你有用):许多代码要么是计算绑定的,要么是内存绑定的,根据粗略的经验,G
(使用相同的网格/块设置):- 当输入足够小时,GPU 与 CPU 的耗时差异很小(因为 H2D/D2H 传输和内核启动的开销占主导);- 当输入较大时,两者的差异相当明显(GPU 在计算速度上的增益足以掩盖这些开销)。现在我正在根据你的问题编辑我的答案。谢谢 @Maria Chiara Cecconi,多亏了你,我对耗时做了更精细的测量。@AmilaSenadheera 不客气。在我回答中提到的 Robert Crovella 的同一个答案里,还有一句有趣的话(可能对你有用):许多代码要么是计算受限(compute bound)的,要么是内存受限(memory bound)的;根据粗略的经验,GPU 在这两方面的能力都大约是 CPU 的 5 倍。因此,显著超过 5 倍的加速通常会受到怀疑,或者被认为是一厢情愿。
我试图让两种图像尺寸下每个块的工作量保持均衡:每个块有 32x32 个线程。我没有因为把块大小定为 32 而限制可独立执行的部分。GPU 的表现比 CPU 好。您是否注意到,对于相同的图像大小,5x5 窗口比 3x3 窗口的加速效果更好?
// Launch configuration: square thread blocks of block_size x block_size threads,
// one thread per output pixel.
int block_size = 32;
// Round the block counts up so that the partial tiles on the right and bottom
// edges are still covered when the image size is not a multiple of block_size.
// Threads that land outside the image are rejected by the kernel's bounds check.
int grid_size = (width + block_size - 1) / block_size;    // blocks along x (image width in pixels)
int grid_size_y = (height + block_size - 1) / block_size; // blocks along y (image height in pixels)
dim3 dimBlock(block_size, block_size, 1);
dim3 dimGrid(grid_size, grid_size_y, 1);

// Time the GPU filter (kernel execution only; no host<->device copies are
// inside the timed region).
clock_t start_d=clock();
meanFilter_d <<< dimGrid, dimBlock >>> (image_data_d, result_image_data_d, width, height, half_window);
// Kernel launches are asynchronous: block until the kernel has finished before
// reading the clock. cudaDeviceSynchronize() replaces the deprecated
// cudaThreadSynchronize().
cudaDeviceSynchronize();
clock_t end_d=clock();

// Time the CPU reference filter on the same image.
clock_t start_h = clock();
meanFilter_h(data, result_image_data_h1, width, height, window_size);
clock_t end_h = clock();
/**
 * CPU reference mean (box) filter for an image stored with three interleaved
 * bytes per pixel in row-major order.
 *
 * Every output channel value is the truncated average of that channel over a
 * window centred on the pixel. The window is clamped at the image borders, so
 * border pixels are averaged over fewer samples.
 *
 * @param raw_image_matrix    input image, 3 * image_width * image_height bytes
 * @param filtered_image_data output image, same size and layout as the input
 * @param image_width         image width in pixels (row length)
 * @param image_height        image height in pixels (number of rows)
 * @param window_size         side length of the averaging window in pixels
 */
void meanFilter_h(unsigned char* raw_image_matrix,unsigned char* filtered_image_data,int image_width, int image_height, int window_size)
{
    // Window radius: (window_size - 1) / 2 for odd sizes, window_size / 2 for even ones.
    const int half_window = (window_size - window_size % 2) / 2;

    for (int i = 0; i < image_height; i += 1) {
        // Vertical window limits, clamped to the image.
        const int top = (i - half_window >= 0) ? i - half_window : 0;
        const int bottom = (i + half_window <= image_height - 1) ? i + half_window : image_height - 1;

        for (int j = 0; j < image_width; j += 1) {
            // Horizontal window limits, clamped to the image.
            const int left = (j - half_window >= 0) ? j - half_window : 0;
            const int right = (j + half_window <= image_width - 1) ? j + half_window : image_width - 1;

            // Integer sums are exact for byte data; unsigned int holds the sum
            // of a window of up to ~16.8 million pixels without overflow.
            unsigned int first_byte = 0;
            unsigned int second_byte = 0;
            unsigned int third_byte = 0;

            // Accumulate every pixel inside the clamped window.
            for (int x = top; x <= bottom; x++) {
                for (int y = left; y <= right; y++) {
                    // Row-major layout: the row stride is the image WIDTH.
                    const int pos = 3 * (x * image_width + y); // three bytes per pixel
                    first_byte += raw_image_matrix[pos];
                    second_byte += raw_image_matrix[pos + 1];
                    third_byte += raw_image_matrix[pos + 2];
                }
            }

            // Number of pixels actually averaged (smaller near the borders).
            const unsigned int effective_window_size =
                static_cast<unsigned int>((bottom - top + 1) * (right - left + 1));

            // Offset of the output pixel, using the same row-major layout.
            const int k = 3 * (i * image_width + j);
            filtered_image_data[k] = static_cast<unsigned char>(first_byte / effective_window_size);
            filtered_image_data[k + 1] = static_cast<unsigned char>(second_byte / effective_window_size);
            filtered_image_data[k + 2] = static_cast<unsigned char>(third_byte / effective_window_size);
        }
    }
}
/**
 * GPU mean (box) filter for an image stored with three interleaved bytes per
 * pixel in row-major order. One thread computes one output pixel.
 *
 * Launch layout: 2D grid of 2D blocks; the x dimension maps to image columns
 * and the y dimension to image rows. The grid must cover at least
 * image_width x image_height threads; surplus threads exit via the bounds
 * check. No shared memory is used.
 *
 * @param raw_image_matrix    input image, 3 * image_width * image_height bytes
 * @param filtered_image_data output image, same size and layout as the input
 * @param image_width         image width in pixels (row length)
 * @param image_height        image height in pixels (number of rows)
 * @param half_window         window radius in pixels; the window spans
 *                            (2 * half_window + 1) pixels per side before
 *                            clamping at the image borders
 */
__global__ void meanFilter_d(unsigned char* raw_image_matrix, unsigned char* filtered_image_data, int image_width, int image_height, int half_window)
{
    const int j = blockIdx.x * blockDim.x + threadIdx.x; // column handled by this thread
    const int i = blockIdx.y * blockDim.y + threadIdx.y; // row handled by this thread

    // Threads that fall outside the image have nothing to do.
    if (i >= image_height || j >= image_width) {
        return;
    }

    // Window limits, clamped to the image.
    const int top = max(i - half_window, 0);
    const int bottom = min(i + half_window, image_height - 1);
    const int left = max(j - half_window, 0);
    const int right = min(j + half_window, image_width - 1);

    // Integer sums are exact for byte data and avoid double-precision
    // arithmetic; unsigned int holds the sum of a window of up to
    // ~16.8 million pixels without overflow.
    unsigned int first_byte = 0;
    unsigned int second_byte = 0;
    unsigned int third_byte = 0;

    // Accumulate every pixel inside the clamped window.
    for (int x = top; x <= bottom; x++) {
        for (int y = left; y <= right; y++) {
            // Row-major layout: the row stride is the image WIDTH.
            const int pos = 3 * (x * image_width + y); // three bytes per pixel
            first_byte += raw_image_matrix[pos];
            second_byte += raw_image_matrix[pos + 1];
            third_byte += raw_image_matrix[pos + 2];
        }
    }

    // Number of pixels actually averaged (smaller near the borders).
    const unsigned int effective_window_size =
        static_cast<unsigned int>((bottom - top + 1) * (right - left + 1));

    // Offset of the output pixel, using the same row-major layout.
    const int k = 3 * (i * image_width + j);
    filtered_image_data[k] = static_cast<unsigned char>(first_byte / effective_window_size);
    filtered_image_data[k + 1] = static_cast<unsigned char>(second_byte / effective_window_size);
    filtered_image_data[k + 2] = static_cast<unsigned char>(third_byte / effective_window_size);
}
// Standard-library (<chrono>) time sources. Presumably listed here as
// alternatives to clock() for the timing measurements; confirm against the
// original discussion.
// system_clock: the system-wide wall clock.
std::chrono::system_clock::now()
// high_resolution_clock: the clock with the shortest tick period the
// implementation provides.
std::chrono::high_resolution_clock::now();