为什么将.at()opencv与openmp并行化需要更多时间 P>我尝试用OpenMP和OpenCV实现C++中的图像形态学算子。该算法工作正常,但当我使用VTune获得评测结果时,我发现并行化方法比顺序方法花费更多的时间,这是由.at()openCv函数引起的。为什么?我怎样才能解决它 bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){ int mask_counter = 0; int ii=0; int jj=0; for ( ii = 0; ii <strel.rows ; ii++) { uchar* strel_ptr = strel.ptr<uchar>(ii); uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii)); for ( jj = 0; jj <strel.cols ; ++jj) { mask_counter += (int) image_ptr[j-(strel.cols-jj)]; } } return mask_counter == strel_counter;
这是我的密码:为什么将.at()opencv与openmp并行化需要更多时间 P>我尝试用OpenMP和OpenCV实现C++中的图像形态学算子。该算法工作正常,但当我使用VTune获得评测结果时,我发现并行化方法比顺序方法花费更多的时间,这是由.at()openCv函数引起的。为什么?我怎样才能解决它 bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){ int mask_counter = 0; int ii=0; int jj=0; for ( ii = 0; ii <strel.rows ; ii++) { uchar* strel_ptr = strel.ptr<uchar>(ii); uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii)); for ( jj = 0; jj <strel.cols ; ++jj) { mask_counter += (int) image_ptr[j-(strel.cols-jj)]; } } return mask_counter == strel_counter;,c++,opencv,parallel-processing,openmp,C++,Opencv,Parallel Processing,Openmp,这是我的密码: Mat Morph_op_manager::compute_morph_base_op(Mat image, bool parallel, int type) { //strel attribute int strel_rows = 5; int strel_cols = 5; //strel center coordinate int cr = 2; int cc = 2; //number of row and column after strel center int
/// Applies a basic morphological operation to an image using a fixed 5x5
/// structuring element whose entries are all 0.
///
/// Pixels are read and written as `uchar`. Only interior pixels (those inside
/// the loop bounds derived from the structuring-element size) are processed;
/// every other pixel keeps the value it has in the input.
///
/// @param image     Input image; it is cloned, never modified.
/// @param parallel  When true the row loop is shared among OpenMP threads,
///                  when false the same loop runs on the calling thread only.
/// @param type      0 -> erode: the output pixel becomes 255 as soon as one
///                  pixel of the window differs from the structuring element.
///                  1 -> dilate: the output pixel becomes 0 as soon as one
///                  pixel of the window equals the structuring element.
///                  Any other value returns an unmodified copy of `image`.
/// @return The processed copy of `image`.
Mat Morph_op_manager::compute_morph_base_op(Mat image, bool parallel, int type) {
    // strel attributes
    const int strel_rows = 5;
    const int strel_cols = 5;
    // strel center coordinates
    const int cr = 2;
    const int cc = 2;
    // number of rows / columns from the strel center to its far edge
    const int nrac = strel_rows - cr;
    const int ncac = strel_cols - cc;
    // strel init
    Mat strel(strel_rows, strel_cols, CV_8UC1, Scalar(0));
    Mat op_result = image.clone();

    // Only the row loop is parallelised: each thread then owns whole output
    // rows, so no two threads ever write the same op_result pixel. The `if`
    // clause keeps the serial case local to this region instead of changing
    // the global thread count for the rest of the program.
    #pragma omp parallel for schedule(static) if(parallel)
    for (int i = cr; i < image.rows - nrac; i++) {
        uchar* result_row = op_result.ptr<uchar>(i);
        for (int j = cc; j < image.cols - ncac; j++) {
            bool any_equal = false;
            bool any_different = false;
            for (int m = 0; m < strel_rows; m++) {
                // Window is centred on (i, j): rows i-cr .. i-cr+strel_rows-1,
                // which stays inside the image for the loop bounds above.
                const uchar* image_row = image.ptr<uchar>(i - cr + m);
                const uchar* strel_row = strel.ptr<uchar>(m);
                for (int n = 0; n < strel_cols; n++) {
                    if (image_row[j - cc + n] == strel_row[n]) {
                        any_equal = true;
                    } else {
                        any_different = true;
                    }
                }
            }
            // type == 0 -> erode
            if (type == 0 && any_different) {
                result_row[j] = 255;
            }
            // type == 1 -> dilate
            if (type == 1 && any_equal) {
                result_row[j] = 0;
            }
        }
    }
    return op_result;
}
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
Mat Morph_op_manager::compute_Morph_base_op(Mat-image,bool-parallel,int-type){
//strel属性
int strel_行=5;
int strel_cols=5;
//应力中心坐标
int-cr=2;
int cc=2;
//strel center之后的行数和列数
int nrac=街行-cr;
int ncac=strel_cols-cr;
//链球菌感染
Mat strel(strel_行、strel_列、CV_8UC1、标量(0));
Mat op_result=image.clone();
if(parallel==false)
omp_set_num_threads(1);//对所有连续的并行区域使用1个线程
//并行嵌套循环
#用于折叠的pragma omp并行(4)
对于(int i=cr;i侵蚀
如果(类型==0){
如果(图像在(i-(strel\u rows-m),j-(strel\u cols-n))!=strel.at(m,n)){
(i,j)处的运算结果=255;
}
}
//如果类型==0->扩展
如果(类型==1){
如果(图像在(i-(strel_rows-m),j-(strel_cols-n))==strel.at(m,n)){
(i,j)处的运算结果=0;
}
}
}
}
}
以下是分析结果:
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
加速:
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
我没有继续使用 **.at()** 方法,而是改用指针访问像素矩阵,并按照下面的代码修改了相应的指令
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
还有一个问题仍然存在:在我的性能分析日志中,Mat::release() 花费了大量时间,这是为什么?我该如何解决?
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
加速代码:
// Timed erosion pass over the interior rows of `image`, writing into
// `op_result`. Rows are handed out dynamically to up to 4 OpenMP threads when
// `parallel` is true; otherwise the loop runs on the calling thread.
omp_set_num_threads(4);
double start_time = omp_get_wtime();
// Loop indices are declared in the loops themselves, so each thread gets its
// own copy without a private() clause.
#pragma omp parallel for shared(strel,image,op_result,strel_el_count) schedule(dynamic) if(parallel == true)
for (int i = cr; i < image.rows-nrac; i++)
{
    // No addref() here: taking a row pointer does not need an extra
    // reference, and an addref() without a matching release keeps the
    // buffer's reference count above zero forever (the data is never freed).
    uchar* opresult_ptr = op_result.ptr<uchar>(i);
    for (int j = cc; j < image.cols-ncac; j++)
    {
        // type == 0 --> erode
        if (type == 0) {
            opresult_ptr[j] = is_fullfit(image,i,j,strel,strel_el_count,parallel) ? 0 : 255;
        }
    }
}
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
int mask_counter = 0;
int ii=0;
int jj=0;
for ( ii = 0; ii <strel.rows ; ii++) {
uchar* strel_ptr = strel.ptr<uchar>(ii);
uchar* image_ptr = image.ptr<uchar>(i - (strel.rows - ii));
for ( jj = 0; jj <strel.cols ; ++jj) {
mask_counter += (int) image_ptr[j-(strel.cols-jj)];
}
}
return mask_counter == strel_counter;
omp\u set\u num\u线程(4);
双启动时间=omp\U get\U wtime();
#pragma omp parallel for shared(strel,image,op_result,strel_el_count)private(i,j)schedule(dynamic)if(parallel==true)
对于(i=cr;i腐蚀
如果(类型==0){
如果(是完整的(图像,i,j,strel,strel_el_count,parallel)){
运算结果_ptr[j]=0;
}
其他的
运算结果_ptr[j]=255;
}
}
}
这是 is_fullfit 函数:
/// Checks whether the window of `image` centred on pixel (i, j) sums to an
/// expected value.
///
/// The window has the dimensions of `strel` (only `strel.rows` / `strel.cols`
/// are read, not its contents) and is anchored at its middle element
/// (rows/2, cols/2). Pixels are read as `uchar` and accumulated as `int`.
///
/// @param image          Image to inspect.
/// @param i              Row of the window centre.
/// @param j              Column of the window centre.
/// @param strel          Structuring element; supplies the window size.
/// @param strel_counter  Sum the window must reach for a full fit.
/// @param parallel       Not used by this function.
/// @return true when the window sum equals `strel_counter`.
///
/// NOTE(review): `image` and `strel` are taken by value, so each call copies
/// two Mat headers and adjusts their reference counts on entry and exit; this
/// is the likely source of the Mat::release() time seen in the profile.
/// Switching to `const Mat&` requires changing the class declaration as well,
/// which is outside this excerpt.
bool Morph_op_manager::is_fullfit(Mat image,int i,int j,Mat strel,int strel_counter,bool parallel){
    (void)parallel;
    const int row_anchor = strel.rows / 2;
    const int col_anchor = strel.cols / 2;
    int mask_counter = 0;
    for (int ii = 0; ii < strel.rows; ++ii) {
        // Centred window: the previous i - (strel.rows - ii) offset addressed
        // rows i-strel.rows .. i-1, which is negative for the first rows the
        // caller visits.
        const uchar* image_ptr = image.ptr<uchar>(i - row_anchor + ii);
        for (int jj = 0; jj < strel.cols; ++jj) {
            mask_counter += static_cast<int>(image_ptr[j - col_anchor + jj]);
        }
    }
    return mask_counter == strel_counter;
}
bool-Morph\u-op\u管理器::是否完全适合(Mat-image、int-i、int-j、Mat-strel、int-strel\u计数器、bool-parallel){
int mask_计数器=0;
int ii=0;
int jj=0;
对于(ii=0;ii),我认为问题不在于.at函数本身,而在于复杂的索引参数,这些参数阻止openmp以有效的方式“融合”循环。我想测试一下,如果你的答案解决了我的问题,我如何只使用一个来融合循环?我有四个循环:_(这看起来像是一个简单的图像过滤器,为什么要使用折叠
呢?请提供,添加有关系统的信息(CPU、内存)并使用SO编辑器的上载图像功能。按引用传递参数按值(&)读取,替换时间缩短;)