C 如何使下面的双线性插值代码更有效？_C_Interpolation_Performance

C 如何使下面的双线性插值代码更有效？

c performance

C 如何使下面的双线性插值代码更有效？,c,interpolation,performance,C,Interpolation,Performance,下面的代码是使用双线性插值放大图片在慢重缩放功能中，哪里可以进行修改以提高效率我希望从计算机组织原理的角度对其进行修改期待您的回答谢谢 unsigned char *slow_rescale(unsigned char *src, int src_x, int src_y, int dest_x, int dest_y) { double step_x,step_y; // Step increase as per instructions above unsign

下面的代码是使用双线性插值放大图片

在慢重缩放功能中，哪里可以进行修改以提高效率

我希望从计算机组织原理的角度对其进行修改

期待您的回答

谢谢

unsigned char *slow_rescale(unsigned char *src, int src_x, int src_y, int dest_x, int dest_y)
{
 double step_x,step_y;          // Step increase as per instructions above
 unsigned char R1,R2,R3,R4;     // Colours at the four neighbours
 unsigned char G1,G2,G3,G4;
 unsigned char B1,B2,B3,B4;
 double RT1, GT1, BT1;          // Interpolated colours at T1 and T2
 double RT2, GT2, BT2;
 unsigned char R,G,B;           // Final colour at a destination pixel
 unsigned char *dst;            // Destination image - must be allocated here! 
 int x,y;               // Coordinates on destination image
 double fx,fy;              // Corresponding coordinates on source image
 double dx,dy;              // Fractional component of source image    coordinates

 dst=(unsigned char *)calloc(dest_x*dest_y*3,sizeof(unsigned char));   // Allocate and clear   destination image
 if (!dst) return(NULL);                           // Unable to allocate image

 step_x=(double)(src_x-1)/(double)(dest_x-1);
 step_y=(double)(src_y-1)/(double)(dest_y-1);

 for (x=0;x<dest_x;x++)         // Loop over destination image
  for (y=0;y<dest_y;y++)
  {
    fx=x*step_x;
    fy=y*step_y;
    dx=fx-(int)fx;
    dy=fy-(int)fy;   
    getPixel(src,floor(fx),floor(fy),src_x,&R1,&G1,&B1);    // get N1 colours
    getPixel(src,ceil(fx),floor(fy),src_x,&R2,&G2,&B2); // get N2 colours
    getPixel(src,floor(fx),ceil(fy),src_x,&R3,&G3,&B3); // get N3 colours
    getPixel(src,ceil(fx),ceil(fy),src_x,&R4,&G4,&B4);  // get N4 colours
   // Interpolate to get T1 and T2 colours
   RT1=(dx*R2)+(1-dx)*R1;
   GT1=(dx*G2)+(1-dx)*G1;
   BT1=(dx*B2)+(1-dx)*B1;
   RT2=(dx*R4)+(1-dx)*R3;
   GT2=(dx*G4)+(1-dx)*G3;
   BT2=(dx*B4)+(1-dx)*B3;
   // Obtain final colour by interpolating between T1 and T2
   R=(unsigned char)((dy*RT2)+((1-dy)*RT1));
   G=(unsigned char)((dy*GT2)+((1-dy)*GT1));
   B=(unsigned char)((dy*BT2)+((1-dy)*BT1));
  // Store the final colour
  setPixel(dst,x,y,dest_x,R,G,B);
 }
  return(dst);
}
void getPixel(unsigned char *image, int x, int y, int sx, unsigned char *R, unsigned char *G, unsigned char *B)
{
 // Get the colour at pixel x,y in the image and return it using the provided RGB pointers
 // Requires the image size along the x direction!
 *(R)=*(image+((x+(y*sx))*3)+0);
 *(G)=*(image+((x+(y*sx))*3)+1);
 *(B)=*(image+((x+(y*sx))*3)+2);
}

void setPixel(unsigned char *image, int x, int y, int sx, unsigned char R, unsigned char G, unsigned char B)
{
 // Set the colour of the pixel at x,y in the image to the specified R,G,B
 // Requires the image size along the x direction!
 *(image+((x+(y*sx))*3)+0)=R;
 *(image+((x+(y*sx))*3)+1)=G;
 *(image+((x+(y*sx))*3)+2)=B;
}

unsigned char*slow\u重缩放（unsigned char*src，int-src\u x，int-src\u y，int-dest\u x，int-dest\u y）
{
双步骤x，步骤y；//按照上述说明增加步骤
无符号字符R1、R2、R3、R4；//四个相邻字符的颜色
无符号字符G1、G2、G3、G4；
无符号字符B1、B2、B3、B4；
双RT1、GT1、BT1；//T1和T2处的插值颜色
双RT2，GT2，BT2；
无符号字符R，G，B；//目标像素处的最终颜色
unsigned char*dst；//必须在此处分配目标映像！
int x，y；//目标图像上的坐标
double fx，fy；//源图像上的对应坐标
double dx，dy；//源图像坐标的分数分量
dst=（unsigned char*）calloc（dest_x*dest_y*3，sizeof（unsigned char））；//分配并清除目标映像
如果（！dst）返回（NULL）；//无法分配映像
步骤x=（双）（src_x-1）/（双）（dest_x-1）；
步骤y=（双）（src_y-1）/（双）（dest_y-1）；
对于（x=0；x，以下是一些想法：
使用而不是浮点。这将使像floor
和ceil
这样的计算（可能还有乘法，尽管我不确定）更快
将ceil（x）
替换为floor（x）+1
用于将fx=x*步骤x
中的乘法替换为加法
如果您知道内存中像素的布局，请使用更高效的方法替换getPixel
使用以下代码转换将两个乘法减少为一：（dx*R2）+（1-dx）*R1
==>R1+dx*（R2-R1）

（最后，但可能最有潜力）使用矢量化编译器或手动编辑代码以使用SSE或其他技术（如果您的平台上可用）
我一直担心图像处理性能。以下是一些需要记住的明显注意事项：
数值精度：
从你的代码中跳出来的第一件事是对步长、颜色值和坐标使用双精度。你真的需要这些数量的精度吗？如果不需要，你可以在使用定点或浮点时进行一些分析来检查代码的性能
请记住，这是一个依赖于硬件的问题，性能可能是一个问题，也可能不是一个问题，这取决于硬件是否实现了double、float only或两者都不实现（然后两者都在软件中实现）这方面的讨论还包括内存对齐、合并内存访问等。当然，这些主题涉及“计算机组织原理”，还有更多
循环展开：
您是否也考虑过手动？这可能有帮助，也可能没有帮助，因为您的编译器可能已经尝试利用这些优化，但至少值得考虑一下，因为您在潜在的大数组大小上有一个双循环
数字冗余：
在getPixel（）函数中，您还可以为每个RGB组件计算image+（（x+（y*sx））*3
，这似乎没有改变，为什么不在函数开始时只计算一次这个数量呢
矢量处理：
要想优化这样的代码，首先必须考虑是否可以利用向量处理。您是否可以访问向量化指令集，例如SSE
并行处理：
大多数系统都安装了OpenMP。如果是这样的话，你可以考虑重构你的代码以利用处理器的多核能力。这是令人惊讶的直接使用PrabMA的实现，它当然值得检查。
编译器标志：
此外，尽管您没有直接提及，但编译标志会影响C代码的性能。例如，如果使用gcc，您可以使用以下方法比较性能差异：
gcc -std=c99 -o main main.c

vs
在这段代码中，乘法运算可以大大减少
dx
可以在外循环中计算，在那里我们可以准备乘法表以进行进一步的操作，如RT1=（dx*R2）+（1-dx）*R1
，因为乘法（R2、R1等）的大小为1字节
下面的代码比我的机器上的原始文件运行速度快10倍（Mac OS，Mac C++编译器-O3）：
#包括
#包括
#包括
内联void fast_getPixel（无符号字符*图像、整数x、整数y、整数sx、无符号字符*R、无符号字符*G、无符号字符*B）
{
//获取图像中像素x，y处的颜色，并使用提供的RGB指针返回
//需要沿x方向的图像大小！
无符号字符*ptr=image+（（x+（y*sx））*3）；
*R=ptr[0]；
*G=ptr[1]；
*B=ptr[2]；
}
内联void fast_setPixel（无符号字符*图像、整数x、整数y、整数sx、无符号字符R、无符号字符G、无符号字符B）
{
//将图像中x、y处像素的颜色设置为指定的R、G、B
//需要沿x方向的图像大小！
无符号字符*ptr=image+（（x+（y*sx））*3）；
ptr[0]=R；
ptr[1]=G；
ptr[2]=B；
}
无效生成\u dx\u表格（双*表格，双dx）
{
无符号len=0xff；
表[0]=0；
对于（无符号i=1；iGPU有硬件为您执行双线性插值。在CPU上执行此操作就像在软件中执行浮点操作，而不使用浮点硬件（例如x87或SSE/AVX）我的最佳建议是考虑优化算法，如一般图像滤波器，它可以提供更好的视觉效果，而大多数GPU不支持。图形GEMS III，即使它是古老的，也有好的。
gcc -std=c99 -O3 -o main main.c 

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

inline void fast_getPixel(unsigned char *image, int x, int y, int sx, unsigned char *R, unsigned char *G, unsigned char *B)
{
    // Get the colour at pixel x,y in the image and return it using the provided RGB pointers
    // Requires the image size along the x direction!
    unsigned char *ptr = image+((x+(y*sx))*3);
    *R=ptr[0];
    *G=ptr[1];
    *B=ptr[2];
}

inline void fast_setPixel(unsigned char *image, int x, int y, int sx, unsigned char R, unsigned char G, unsigned char B)
{
    // Set the colour of the pixel at x,y in the image to the specified R,G,B
    // Requires the image size along the x direction!
    unsigned char *ptr = image+((x+(y*sx))*3);
    ptr[0]=R;
    ptr[1]=G;
    ptr[2]=B;
}

void build_dx_table(double* table,double dx)
{
    unsigned len = 0xff;
    table[0] = 0;
    for (unsigned i=1;i<len;i++)
    {
        table[i] = table[i-1]+dx;
    }
}

unsigned char *fast_rescale(unsigned char *src, int src_x, int src_y, int dest_x, int dest_y)
{
    double step_x,step_y;          // Step increase as per instructions above
    unsigned char R1,R2,R3,R4;     // Colours at the four neighbours
    unsigned char G1,G2,G3,G4;
    unsigned char B1,B2,B3,B4;
    double RT1, GT1, BT1;          // Interpolated colours at T1 and T2
    double RT2, GT2, BT2;
    unsigned char R,G,B;           // Final colour at a destination pixel
    unsigned char *dst;            // Destination image - must be allocated here!
    int x,y;               // Coordinates on destination image
    double fx,fy;              // Corresponding coordinates on source image
    double dx,dy;              // Fractional component of source image    coordinates
    double dxtable[0xff];

    dst=(unsigned char *)calloc(dest_x*dest_y*3,sizeof(unsigned char));   // Allocate and clear   destination image
    if (!dst) return(NULL);                           // Unable to allocate image

    step_x=(double)(src_x-1)/(double)(dest_x-1);
    step_y=(double)(src_y-1)/(double)(dest_y-1);

    for (x=0,fx=0;x<dest_x;x++,fx+=step_x)         // Loop over destination image
        dx=fx-(int)fx;
        build_dx_table(dxtable,dx);
        for (y=0,fy=0;y<dest_y;y++,fy+=step_y)
        {
            dy=fy-(int)fy;
            fast_getPixel(src,floor(fx),floor(fy),src_x,&R1,&G1,&B1);    // get N1 colours
            fast_getPixel(src,ceil(fx),floor(fy),src_x,&R2,&G2,&B2); // get N2 colours
            fast_getPixel(src,floor(fx),ceil(fy),src_x,&R3,&G3,&B3); // get N3 colours
            fast_getPixel(src,ceil(fx),ceil(fy),src_x,&R4,&G4,&B4);  // get N4 colours
            // Interpolate to get T1 and T2 colours
            RT1=dxtable[R2-R1]+R1;
            GT1=dxtable[G2-G1]+G1;
            BT1=dxtable[B2-B1]+B1;
            RT2=dxtable[R4-R3]+R3;
            GT2=dxtable[G4-G3]+G3;
            BT2=dxtable[B4-B3]+B3;
            // Obtain final colour by interpolating between T1 and T2
            R=(unsigned char)(dy*(RT2-RT1)+RT1);
            G=(unsigned char)(dy*(GT2-GT1)+GT1);
            B=(unsigned char)(dy*(BT2-BT1)+BT1);
            // Store the final colour
            fast_setPixel(dst,x,y,dest_x,R,G,B);
        }
    return(dst);
}