C++ 如何优化此腐蚀过滤器代码?
我已经实现了将腐蚀过滤器应用于图像的功能C++ 如何优化此腐蚀过滤器代码?,c++,qt,opengl,image-processing,optimization,C++,Qt,Opengl,Image Processing,Optimization,我已经实现了将腐蚀过滤器应用于图像的功能 void applyErosionFilter(QImage &input, int matrixSize) { int filterOffset = (matrixSize - 1) / 2; int byteOffset = 0; uchar red, green, blue = 0; uchar morphResetValue = 255; uchar *data = input.bits();
/**
 * Applies a square morphological erosion (per-channel minimum) to an image in place.
 *
 * For every pixel whose full matrixSize x matrixSize neighbourhood lies inside
 * the image, the first three bytes of the pixel are replaced by the minimum of
 * the corresponding bytes over that neighbourhood and the fourth byte is set
 * to 255. Pixels closer to the edge than the filter radius keep their source
 * value.
 *
 * The image is converted to QImage::Format_ARGB32 when it is not already in
 * that format, because the filter addresses pixels as four consecutive bytes.
 * NOTE(review): treating the fourth byte as alpha assumes a little-endian host
 * (B, G, R, A byte order) - confirm before using on big-endian targets.
 *
 * @param input      Image to filter; replaced by the filtered result.
 * @param matrixSize Edge length of the square structuring element. The radius
 *                   is (matrixSize - 1) / 2; a negative radius leaves the
 *                   image untouched.
 */
void applyErosionFilter(QImage &input, int matrixSize)
{
    const int filterOffset = (matrixSize - 1) / 2;
    if (input.isNull() || filterOffset < 0)
        return;

    // The loops below step through pixels 4 bytes at a time.
    if (input.format() != QImage::Format_ARGB32)
        input = input.convertToFormat(QImage::Format_ARGB32);

    // Deep copy owned by QImage: no manual buffer to free, and border pixels
    // that the filter never visits start out with defined (source) values.
    QImage output = input.copy();
    if (output.isNull())
        return;

    const int width = input.width();
    const int height = input.height();
    const uchar morphResetValue = 255;

    for (int y = filterOffset; y < height - filterOffset; y++)
    {
        uchar *outRow = output.scanLine(y);
        for (int x = filterOffset; x < width - filterOffset; x++)
        {
            uchar red = morphResetValue;
            uchar green = morphResetValue;
            uchar blue = morphResetValue;
            for (int filterY = -filterOffset; filterY <= filterOffset; filterY++)
            {
                // Start of the neighbourhood's leftmost pixel on this row.
                const uchar *px = input.constScanLine(y + filterY) + (x - filterOffset) * 4;
                for (int filterX = -filterOffset; filterX <= filterOffset; filterX++, px += 4)
                {
                    if (px[0] < red)
                        red = px[0];
                    if (px[1] < green)
                        green = px[1];
                    if (px[2] < blue)
                        blue = px[2];
                }
            }
            uchar *dst = outRow + x * 4;
            dst[0] = red;
            dst[1] = green;
            dst[2] = blue;
            dst[3] = 255;
        }
    }
    input = output;
}
void applyErosionFilter(QImage&input,int matrixSize)
{
int filterOffset=(矩阵大小-1)/2;
int字节偏移量=0;
乌查尔红、绿、蓝=0;
uchar值=255;
uchar*data=input.bits();
int stride=input.bytesPerLine();
uchar*newdata=new-uchar[stride*input.height()];
int i=0;
对于(int y=filterOffset;y 对于(int filterY=-filterOffset;filterY,作为一种快速改进,我建议使用多个线程并行计算几行。您也可以使用OpenCL或Cuda在GPU上实现这一点,但这需要大量的样板文件
我已经修改了你的代码以使用多线程,但我还没有对此进行测试,因为我目前没有在此设备上安装Qt。但这至少可以给你一个从何处开始的提示
(顺便说一句,`uchar red, green, blue = 0;` 只把 `blue` 初始化为 0,而 `red` 和 `green` 保持未初始化状态)
#include <thread>
#include <vector>
void applyErosionFilter(QImage&input,int matrixSize)
{
int filterOffset=(矩阵大小-1)/2;
int字节偏移量=0;
uchar值=255;
uchar*data=input.bits();
int stride=input.bytesPerLine();
uchar*newdata=new-uchar[stride*input.height()];
unsigned num_threads=std::thread::hardware_concurrency();
if(num_threads==0)
num_线程=1;
向量线程;
int i=0;
for(无符号i=0;i使用多线程:将图像水平分带剪切并单独处理。我使用OpenMP完成了这项工作,它非常先进
大小为 N×N 的正方形结构元素可以分解为大小为 1×N 和 N×1 的两个线段(水平和垂直)。因此,每个像素不需要做 N×N 次比较,而只需做 2×N 次:N=3 时是 6 次而不是 9 次,N=5 时是 10 次而不是 25 次,依此类推。速度会快得多。
使用已在以下库中实现的算法:Matthieu Faessel(基于行比较的C++自动矢量化代码,最快!!!),Marc Van Droogenbroeck(C++但如果我是对的,则限于8位编码),或Christophe Clienti(非常适合SIMD优化的Lambert算法)。正如您在这些库中所看到的,他们使用智能方法/算法/架构来获得快速结果。您开发的是课程中为便于理解而教授的基础知识,但它是最慢的
谢谢大家。
我找到了我想要的,我想和你分享
// Texture holding the input image; (re)created by initiateShader().
// Starts as nullptr so the "already created?" check there is well defined.
QOpenGLTexture *m_texImageInput = nullptr;
// Compute shader program for the erosion filter; (re)created by initiateShader().
QOpenGLShaderProgram *m_shaderComputeH = nullptr;
void initiateShader()
{
if (m_texImageInput)
{
delete m_texImageInput;
m_texImageInput = nullptr;
}
QImage img(":/image.png");
m_texImageInput = new QOpenGLTexture(img.convertToFormat(QImage::Format_RGBA8888).mirrored());
if (m_shaderComputeH)
{
delete m_shaderComputeH;
m_shaderComputeH = nullptr;
}
m_shaderComputeH = new QOpenGLShaderProgram;
m_shaderComputeH->addShaderFromSourceFile(QOpenGLShader::Compute, ":/csErosionFilter.fsh");
m_shaderComputeH->link();
}
/**
 * Computes how many work groups are needed along each axis to cover an image.
 *
 * @param workGroupSize Edge length of one (square) work group, in pixels.
 * @param imageSize     Image dimensions in pixels.
 * @return Number of groups in x (width) and y (height); each count is the
 *         extent divided by workGroupSize, plus one when there is a remainder.
 */
QSize getWorkGroups(int workGroupSize, const QSize &imageSize)
{
    // Whole groups, plus one more when the extent is not an exact multiple.
    const auto groupsAlong = [workGroupSize](int extent) {
        const int wholeGroups = extent / workGroupSize;
        const bool hasRemainder = (extent % workGroupSize) != 0;
        return hasRemainder ? wholeGroups + 1 : wholeGroups;
    };

    const int groupsX = groupsAlong(imageSize.width());
    const int groupsY = groupsAlong(imageSize.height());
    return QSize(groupsX, groupsY);
}
void executeFilter(int radius)
{
QOpenGLExtraFunctions *f = QOpenGLContext::currentContext()->extraFunctions();
// Process input image
QSize workGroups = getWorkGroups(32, QSize(m_texImageInput->width(), m_texImageInput->height()));
// Pass 2
f->glBindImageTexture(0, m_texImageInput->textureId(), 0, 0, 0, GL_READ_WRITE, GL_RGBA8);
f->glBindImageTexture(1, m_texImageProcessed->textureId(), 0, 0, 0, GL_READ_WRITE, GL_RGBA8);
m_shaderComputeH->bind();
m_shaderComputeH->setUniformValue("radius", radius);
f->glDispatchCompute(workGroups.width(), workGroups.height(), 1);
f->glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
m_shaderComputeH->release();
// Compute cleanup
f->glBindImageTexture(0, 0, 0, 0, 0, GL_READ_WRITE, GL_RGBA8);
f->glBindImageTexture(1, 0, 0, 0, 0, GL_READ_WRITE, GL_RGBA8);
}
这是计算着色器
/// csErosionFilter.fsh
// Erosion compute shader: each invocation writes one pixel of resultImage
// whose RGB is the per-channel minimum of inputImage over a square window of
// the given radius (clamped to the image bounds); alpha is written as 1.
#version 430 core
#define COMPUTEPATCHSIZE 32
layout (local_size_x = COMPUTEPATCHSIZE, local_size_y = COMPUTEPATCHSIZE) in;
layout(binding=0, rgba8) uniform readonly highp image2D inputImage;
layout(binding=1, rgba8) uniform writeonly highp image2D resultImage;
uniform int radius;

void main()
{
    ivec2 extent = imageSize(resultImage);
    ivec2 pixel = ivec2(gl_GlobalInvocationID.xy);

    // Invocations in the padding of the last work groups have nothing to do.
    if (any(greaterThanEqual(pixel, extent)))
        return;

    // Window corners, clamped so no load leaves the image.
    ivec2 lowCorner = clamp(pixel - radius, ivec2(0), extent - 1);
    ivec2 highCorner = clamp(pixel + radius, ivec2(0), extent - 1);

    vec4 eroded = vec4(1);
    for (int col = lowCorner.x; col <= highCorner.x; col++)
    {
        for (int row = lowCorner.y; row <= highCorner.y; row++)
        {
            vec3 texel = imageLoad(inputImage, ivec2(col, row)).rgb;
            eroded.rgb = min(eroded.rgb, texel);
        }
    }
    imageStore(resultImage, pixel, eroded);
}
///csErosionFilter.fsh
#430版核心
#定义COMPUTEPATCHSIZE 32
布局(本地大小x=COMPUTEPATCHSIZE,本地大小y=COMPUTEPATCHSIZE);
布局(绑定=0,rgba8)统一只读highp image2D inputImage;
布局(binding=1,rgba8)统一的写入高p image2D结果图像;
均匀整数半径;
void main()
{
ivec2 imgSize=图像大小(结果图像);
int x=int(gl_globalinovationid.x);
int y=int(gl_globalinjournalid.y);
if((x>=imgSize.x)| |(y>=imgSize.y))
返回;
vec4 newValue=vec4(1);
int left=夹具(x-半径,0,imgSize.x-1);
int right=夹具(x+半径,0,imgSize.x-1);
int top=夹具(y-半径,0,imgSize.y-1);
内底=夹具(y+半径,0,imgSize.y-1);
对于(int iX=left;iX),如果矩阵大小(我们称之为N)大于3,则可以对其进行显著优化。首先,可以将正方形NxN结构元素组合为长度为N的两行SE。接下来,每行SE可以在O(1)中计算每像素,与N无关。我能想到的最简单的方法是计算多行并行的newdata
值(即使用多个线程)。您当然可以使用OpenCL/OpenGL,但这需要大量的样板代码才能开始。
矩阵大小有多大?
@CrisLuengo:给OP一个参考才公平。
@Yves:有道理。
我同意您的看法,如果这段代码是多线程的,它的执行速度会快得多,但问题在于这个函数本身就是从一个已经使用多线程的循环中调用的。这就是为什么我想利用GPU。
@MostafaMahmoud 好的,在这种情况下,我认为OpenGL/CL将是最可移植的方式。因为您使用的是Qt,所以您应该能够为QOffscreenSurface创建一个OpenGL上下文,从您的QImage创建一个纹理,并编写一个实现腐蚀过滤器的着色器。然后使用该纹理和着色器渲染一个四边形,并将结果存储在一个QOpenGLFramebufferObject中,该对象可以转换回QImage。你甚至不会……
/// csErosionFilter.fsh
// Erosion compute shader: each invocation writes one pixel of resultImage
// whose RGB is the per-channel minimum of inputImage over a square window of
// the given radius (clamped to the image bounds); alpha is written as 1.
#version 430 core
#define COMPUTEPATCHSIZE 32
layout (local_size_x = COMPUTEPATCHSIZE, local_size_y = COMPUTEPATCHSIZE) in;
layout(binding=0, rgba8) uniform readonly highp image2D inputImage;
layout(binding=1, rgba8) uniform writeonly highp image2D resultImage;
uniform int radius;

void main()
{
    ivec2 extent = imageSize(resultImage);
    ivec2 pixel = ivec2(gl_GlobalInvocationID.xy);

    // Invocations in the padding of the last work groups have nothing to do.
    if (any(greaterThanEqual(pixel, extent)))
        return;

    // Window corners, clamped so no load leaves the image.
    ivec2 lowCorner = clamp(pixel - radius, ivec2(0), extent - 1);
    ivec2 highCorner = clamp(pixel + radius, ivec2(0), extent - 1);

    vec4 eroded = vec4(1);
    for (int col = lowCorner.x; col <= highCorner.x; col++)
    {
        for (int row = lowCorner.y; row <= highCorner.y; row++)
        {
            vec3 texel = imageLoad(inputImage, ivec2(col, row)).rgb;
            eroded.rgb = min(eroded.rgb, texel);
        }
    }
    imageStore(resultImage, pixel, eroded);
}