C++ 从RGB到YUV的颜色转换（YCoCg）_C++_Halide

C++ 从RGB到YUV的颜色转换（YCoCg）

c++

C++ 从RGB到YUV的颜色转换（YCoCg）,c++,halide,C++,Halide,我正在尝试实现一个颜色转换Func，输出到3个单独的缓冲区。rgb_to_ycocg功能有一个4x8位通道交错缓冲区（BGRA）和3个输出缓冲区（Y、Co和Cg），每个缓冲区都是16位值。目前，我正在使用这段代码： void rgb_to_ycocg(const uint8_t *pSrc, int32_t srcStep, int16_t *pDst[3], int32_t dstStep[3], int width, int height) { Buffer<uint8_t&g

我正在尝试实现一个颜色转换 `Func`，它输出到 3 个单独的缓冲区。`rgb_to_ycocg` 函数的输入是一个 4 通道、每通道 8 位的交错缓冲区（BGRA），输出是 3 个缓冲区（Y、Co 和 Cg），每个缓冲区中的元素都是 16 位值。目前，我正在使用这段代码：

// Converts an interleaved 4-channel, 8-bit image into three separate 16-bit
// planes (Y, Co, Cg) by building and running a Halide pipeline.
//
// pSrc    - source pixels; wrapped as a Halide buffer indexed (channel, x, y)
//           with extents 4 x width x height. The surrounding text describes
//           the channel order as BGRA, i.e. 0 = B, 1 = G, 2 = R.
// srcStep - not used by this implementation.
// pDst    - destination planes: [0] receives Y, [1] receives Co, [2] receives Cg.
// dstStep - not used by this implementation.
// width, height - image dimensions in pixels.
//
// NOTE(review): srcStep/dstStep are ignored, so rows are presumably assumed to
// be densely packed -- confirm against callers.
// NOTE(review): every Func is defined and realized inside this function, so
// the pipeline is rebuilt on each call.
void rgb_to_ycocg(const uint8_t *pSrc, int32_t srcStep, int16_t *pDst[3], int32_t dstStep[3], int width, int height)
{
    // Wrap the caller's memory in Halide buffers; no pixel data is copied here.
    Buffer<uint8_t> inRgb((uint8_t *)pSrc, 4, width, height);
    Buffer<int16_t> outY(pDst[0], width, height);
    Buffer<int16_t> outCo(pDst[1], width, height);
    Buffer<int16_t> outCg(pDst[2], width, height);

    Var x, y, c;
    Func calcY, calcCo, calcCg, inRgb16;

    // Widen to 16 bits so the channel differences below can go negative.
    inRgb16(c, x, y) = cast<int16_t>(inRgb(c, x, y));

    // With t = ch0 + ((ch2 - ch0) >> 1):
    //   Co = ch2 - ch0
    //   Cg = ch1 - t
    //   Y  = t + (Cg >> 1)
    // Each Func spells out the full expression independently.
    calcY(x, y) = (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1)) + ((inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1))) >> 1);
    calcCo(x, y) = inRgb16(2, x, y) - inRgb16(0, x, y);
    calcCg(x, y) =  inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1));

    // Bundle the three output Funcs so they can be realized together.
    Pipeline p =Pipeline({calcY, calcCo, calcCg});
    // NOTE(review): scheduling directives such as vectorize/parallel are
    // normally applied to individual Funcs -- confirm that Pipeline exposes them.
    p.vectorize(x, 16).parallel(y);
    // Write the results straight into the caller-provided planes.
    p.realize({ outY, outCo, outCg });
}

void rgb_to_ycocg(const uint8_t *pSrc, int32_t srcStep, int16_t *pDst[3], int32_t dstStep[3], int width, int height)
{
    Buffer<uint8_t> inRgb((uint8_t *)pSrc, 4, width, height);
    Buffer<int16_t> outY(pDst[0], width, height);
    Buffer<int16_t> outCo(pDst[1], width, height);
    Buffer<int16_t> outCg(pDst[2], width, height);
    Var x, y, c;
    Func calcY, calcCo, calcCg, inRgb16;
    inRgb16(c, x, y) = cast<int16_t>(inRgb(c, x, y));
    calcY(x, y) = (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1)) + ((inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1))) >> 1);
    calcCo(x, y) = inRgb16(2, x, y) - inRgb16(0, x, y);
    calcCg(x, y) = inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1));
    Pipeline p = Pipeline({calcY, calcCo, calcCg});
    p.vectorize(x, 16).parallel(y);
    p.realize({ outY, outCo, outCg });
}

问题是，与参考实现（用 C 语言写的基本 for 循环）相比，我的实现性能很差。我知道我需要尝试更好的调度（schedule），但我觉得我在输入/输出缓冲区的用法上做错了什么。我看过相关教程，并试图找到一种输出到多个缓冲区的方法，使用 `Pipeline` 是我能找到的唯一方法。创建 3 个 `Func` 并分别调用它们会更好吗？这是对 `Pipeline` 类的正确用法吗？
这里最大的问题很可能是：每次转换一张图像时，都要重新生成并编译一遍代码，这非常慢。应当使用 ImageParam 而不是 Buffer 作为输入，只定义一次管道，然后多次对它调用 realize。
其次，我认为你真正需要的是 Tuple 而不是 Pipeline。返回 Tuple 的 Func 会在同一个内层循环中计算它的所有值，从而可以复用对 inRgb 等的加载结果。暂且不考虑重新编译的问题，请试试下面的代码：
void rgb_to_ycocg(const uint8_t *pSrc, int32_t srcStep, int16_t *pDst[3], int32_t dstStep[3], int width, int height)
{
    Buffer<uint8_t> inRgb((uint8_t *)pSrc, 4, width, height);
    Buffer<int16_t> outY(pDst[0], width, height);
    Buffer<int16_t> outCo(pDst[1], width, height);
    Buffer<int16_t> outCg(pDst[2], width, height);

    Var x, y, c;
    Func calcY, calcCo, calcCg, inRgb16;

    inRgb16(c, x, y) = cast<int16_t>(inRgb(c, x, y));

    out(x, y) = {
        inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1)) + ((inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1))) >> 1),
        inRgb16(2, x, y) - inRgb16(0, x, y),
        inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1))
    };

    out.vectorize(x, 16).parallel(y);
    out.realize({ outY, outCo, outCg });
}

void rgb_to_ycocg(const uint8_t *pSrc, int32_t srcStep, int16_t *pDst[3], int32_t dstStep[3], int width, int height)
{
    Buffer<uint8_t> inRgb((uint8_t *)pSrc, 4, width, height);
    Buffer<int16_t> outY(pDst[0], width, height);
    Buffer<int16_t> outCo(pDst[1], width, height);
    Buffer<int16_t> outCg(pDst[2], width, height);
    Var x, y, c;
    Func out, inRgb16;
    inRgb16(c, x, y) = cast<int16_t>(inRgb(c, x, y));
    out(x, y) = {
        (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1)) + ((inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1))) >> 1),
        inRgb16(2, x, y) - inRgb16(0, x, y),
        inRgb16(1, x, y) - (inRgb16(0, x, y) + ((inRgb16(2, x, y) - inRgb16(0, x, y)) >> 1))
    };
    out.vectorize(x, 16).parallel(y);
    out.realize({ outY, outCo, outCg });
}
谢谢你的回答，我知道重新编译的问题，但我很高兴看到可以用元组完成。我明天试试这个。再次感谢！