C++ 如何提高浮点lerp函数的速度？_C++_Optimization_Graphics_Simd_Sse_X86

C++ 如何提高浮点lerp函数的速度？

c++ optimization graphics x86

C++ 如何提高浮点lerp函数的速度？,c++,optimization,graphics,simd,sse,x86,C++,Optimization,Graphics,Simd,Sse,X86,最近我写了一个软光栅渲染器，但它的速度真的很慢。通过性能测试，我发现浮点lerp函数是瓶颈。如何提高此函数的速度？使用simd？有什么想法吗？ inline float MathUtil::Lerp(float x1, float x2, float t) { return x1 + (x2 - x1)*t; } //lerp vector ZCVector MathUtil::Lerp(const ZCVector& v1, const ZCVector& v2, fl

最近我写了一个软光栅渲染器，但它的速度真的很慢。通过性能测试，我发现浮点lerp函数是瓶颈。如何提高这个函数的速度？可以使用SIMD吗？有什么想法吗？

// Scalar linear interpolation: returns x1 offset by t times the difference (x2 - x1).
// t is used as given; it is not clamped to any range.
inline float MathUtil::Lerp(float x1, float x2, float t)
{
    return (x2 - x1) * t + x1;
}

// Interpolates two ZCVector values component by component.
// Only x, y and z are blended by t; the w component is taken from v1 unchanged.
ZCVector MathUtil::Lerp(const ZCVector& v1, const ZCVector& v2, float t)
{
    const auto blendedX = Lerp(v1.x, v2.x, t);
    const auto blendedY = Lerp(v1.y, v2.y, t);
    const auto blendedZ = Lerp(v1.z, v2.z, t);

    return ZCVector(blendedX, blendedY, blendedZ, v1.w);
}

// Interpolates two ZCFLOAT2 values: both the u and the v member are blended by t.
ZCFLOAT2 MathUtil::Lerp(const ZCFLOAT2& v1, const ZCFLOAT2& v2, float t)
{
    const auto blendedU = Lerp(v1.u, v2.u, t);
    const auto blendedV = Lerp(v1.v, v2.v, t);

    return ZCFLOAT2(blendedU, blendedV);
}

// Interpolates two ZCFLOAT3 values: the x, y and z members are each blended by t.
ZCFLOAT3 MathUtil::Lerp(const ZCFLOAT3& v1, const ZCFLOAT3& v2, float t)
{
    const auto blendedX = Lerp(v1.x, v2.x, t);
    const auto blendedY = Lerp(v1.y, v2.y, t);
    const auto blendedZ = Lerp(v1.z, v2.z, t);

    return ZCFLOAT3(blendedX, blendedY, blendedZ);
}

// Interpolates every attribute of two VertexOut values with the same factor t
// and assembles the six results into a new VertexOut.
VertexOut MathUtil::Lerp(const VertexOut& v1, const VertexOut& v2, float t)
{
    const auto posTrans = Lerp(v1.posTrans, v2.posTrans, t);
    const auto posH     = Lerp(v1.posH, v2.posH, t);
    const auto tex      = Lerp(v1.tex, v2.tex, t);
    const auto normal   = Lerp(v1.normal, v2.normal, t);
    const auto color    = Lerp(v1.color, v2.color, t);
    const auto oneDivZ  = Lerp(v1.oneDivZ, v2.oneDivZ, t);

    return VertexOut(posTrans, posH, tex, normal, color, oneDivZ);
}

VertexOut的结构：

// Bundle of per-vertex attributes that MathUtil::Lerp interpolates and that
// ScanlineFill passes to m_pShader->PS for each covered pixel.
// NOTE(review): MathUtil::Lerp constructs a VertexOut from six arguments, so a
// matching constructor presumably exists but is not shown in this excerpt — confirm.
class VertexOut
{
public:

    // Presumably the transformed position before projection — confirm.
    ZCVector posTrans;

    // Position whose x is walked across the scanline by ScanlineFill and whose
    // y is overwritten there with the scanline index.
    ZCVector posH;

    // Two-component attribute with u and v members (presumably texture coordinates — confirm).
    ZCFLOAT2 tex;

    // Presumably the surface normal — confirm.
    ZCVector normal;

    // Three-component attribute with x, y and z members (presumably an RGB color — confirm).
    ZCFLOAT3 color;

    // Value compared against, and written to, the device's Z storage in ScanlineFill
    // (presumably 1/z — confirm).
    float oneDivZ;
};

ScanlineFill 函数用于填充三角形，每个顶点都需要使用lerp函数，因此lerp会被多次调用：

// Fills one horizontal scanline between two edge vertices.
// Walks x from left.posH.x to right.posH.x, and for every sample that lands on a
// valid pixel column performs a depth comparison on the interpolated oneDivZ; when
// it passes, the depth value is stored and the pixel is shaded with the fully
// interpolated vertex.
//   left, right - vertices at the two ends of the scanline
//   yIndex      - row being filled; it is not range-checked in this function
void Tiny3DDeviceContext::ScanlineFill(const VertexOut& left, const VertexOut& right,  int yIndex)
{
    // Horizontal extent of the scanline; used as the interpolation denominator.
    float dx = right.posH.x - left.posH.x;

    // NOTE: x advances by 0.5 per iteration while xIndex is the truncation of x + 0.5,
    // so consecutive iterations frequently map to the same pixel column.
    for (float x = left.posH.x; x <= right.posH.x; x += 0.5f)
    {
        int xIndex = static_cast<int>(x + .5f);
        // Skip samples that fall outside [0, client width).
        if(xIndex >= 0 && xIndex < m_pDevice->GetClientWidth())
        {

            // Stays 0 when dx == 0, which avoids dividing by zero.
            float lerpFactor = 0;
            if (dx != 0)
            {
                lerpFactor = (x - left.posH.x) / dx;
            }


            // Interpolate only the depth term first so the comparison below can
            // reject the sample before the whole vertex is interpolated.
            float oneDivZ = MathUtil::Lerp(left.oneDivZ, right.oneDivZ, lerpFactor);
            // A value greater than or equal to the stored one passes
            // (presumably larger 1/z means closer to the viewer — confirm).
            if (oneDivZ >= m_pDevice->GetZ(xIndex,yIndex))
            {
                m_pDevice->SetZ(xIndex, yIndex, oneDivZ);
                // Interpolate all vertex attributes for this sample.
                VertexOut out = MathUtil::Lerp(left, right, lerpFactor);
                // Replace the interpolated y with the scanline row.
                out.posH.y = yIndex;

                // Shade the interpolated vertex and write the result to the pixel.
                m_pDevice->DrawPixel(xIndex, yIndex, m_pShader->PS(out));
            }           
        }   
    }
}

void Tiny3DDeviceContext::ScanlineFill(const VertexOut& left, const VertexOut& right,  int yIndex)
{
    float dx = right.posH.x - left.posH.x;
    for (float x = left.posH.x; x <= right.posH.x; x += 0.5f)
    {
        int xIndex = static_cast<int>(x + .5f);
        if(xIndex >= 0 && xIndex < m_pDevice->GetClientWidth())
        {
            float lerpFactor = 0;
            if (dx != 0)
            {
                lerpFactor = (x - left.posH.x) / dx;
            }
            float oneDivZ = MathUtil::Lerp(left.oneDivZ, right.oneDivZ, lerpFactor);
            if (oneDivZ >= m_pDevice->GetZ(xIndex,yIndex))
            {
                m_pDevice->SetZ(xIndex, yIndex, oneDivZ);
                //lerp get vertex
                VertexOut out = MathUtil::Lerp(left, right, lerpFactor);
                out.posH.y = yIndex;
                m_pDevice->DrawPixel(xIndex, yIndex, m_pShader->PS(out));
            }
        }
    }
}

此循环结构运行lerp的次数可能是所需次数的两倍：

// x advances by only 0.5 per iteration while xIndex is the truncation of x + 0.5,
// so consecutive iterations frequently produce the same xIndex.
for (float x = left.posH.x; x <= right.posH.x; x += 0.5f) {
      int xIndex = static_cast<int>(x + .5f);
      ...
}

for (float x = left.posH.x; x <= right.posH.x; x += 0.5f) {
      int xIndex = static_cast<int>(x + .5f);
      ...
}

试试位运算技巧。不要在错误的地方做过早的优化。你是针对什么平台进行优化？x86？你需要一个能在旧CPU上运行的二进制文件，还是可以使用AVX和FMA？用的是什么编译器和编译选项？更重要的是，周围的代码是什么样的？它实际上被自动向量化了吗？这个函数本身用SSE或其他任何SIMD指令集都可以很容易地向量化，关键显然在于它所嵌入的周围代码。限制因素是延迟还是吞吐量？@πάνταῥεῖ：在现代CPU上，在浮点指令之间穿插整数位运算通常不是一个好主意。在x86上，你要么使用SIMD整数指令（因为即使是标量浮点也使用XMM寄存器），要么就得在XMM寄存器和整数寄存器之间进行缓慢的往返。在浮点指令之间使用整数向量指令会产生额外的旁路延迟，因此，即使你能用一两条整数指令做一些有用的事情，延迟也会比只使用浮点指令更差。此外，向量浮点指令的吞吐量极高，例如Haswell每个时钟周期可以执行2条向量FMA。@PeterCordes 我从来没有说过这是个好主意 :P 环境：win10，vs2015。它用于对位置、纹理坐标、颜色等做lerp，如果使用SIMD，该如何重写它？