C++ SIMD:为什么SSE RGB到YUV的颜色转换速度与c+差不多+;实施
我刚刚尝试优化RGB到YUV420转换器。使用查找表可以提高速度,使用定点算法也可以提高速度。然而,我期望使用SSE指令获得真正的收益。我的第一次尝试导致代码速度变慢,在链接所有操作之后,它的速度与原始代码大致相同。我的实现中是否存在错误,或者SSE指令是否不适合手头的任务 原代码的一部分如下:C++ SIMD:为什么SSE RGB到YUV的颜色转换速度与c+差不多+;实施,c++,optimization,rgb,yuv,sse2,C++,Optimization,Rgb,Yuv,Sse2,我刚刚尝试优化RGB到YUV420转换器。使用查找表可以提高速度,使用定点算法也可以提高速度。然而,我期望使用SSE指令获得真正的收益。我的第一次尝试导致代码速度变慢,在链接所有操作之后,它的速度与原始代码大致相同。我的实现中是否存在错误,或者SSE指令是否不适合手头的任务 原代码的一部分如下: #define RRGB24YUVCI2_00 0.299 #define RRGB24YUVCI2_01 0.587 #define RRGB24YUVCI2_02 0.114 #defi
#define RRGB24YUVCI2_00 0.299
#define RRGB24YUVCI2_01 0.587
#define RRGB24YUVCI2_02 0.114
#define RRGB24YUVCI2_10 -0.147
#define RRGB24YUVCI2_11 -0.289
#define RRGB24YUVCI2_12 0.436
#define RRGB24YUVCI2_20 0.615
#define RRGB24YUVCI2_21 -0.515
#define RRGB24YUVCI2_22 -0.100
void RealRGB24toYUV420Converter::Convert(void* pRgb, void* pY, void* pU, void* pV)
{
yuvType* py = (yuvType *)pY;
yuvType* pu = (yuvType *)pU;
yuvType* pv = (yuvType *)pV;
unsigned char* src = (unsigned char *)pRgb;
/// Y have range 0..255, U & V have range -128..127.
double u,v;
double r,g,b;
/// Step in 2x2 pel blocks. (4 pels per block).
int xBlks = _width >> 1;
int yBlks = _height >> 1;
for(int yb = 0; yb < yBlks; yb++)
for(int xb = 0; xb < xBlks; xb++)
{
int chrOff = yb*xBlks + xb;
int lumOff = (yb*_width + xb) << 1;
unsigned char* t = src + lumOff*3;
/// Top left pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u = RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v = RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Top right pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
lumOff += _width;
t = t + _width*3 - 6;
/// Bottom left pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Bottom right pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Average the 4 chr values.
int iu = (int)u;
int iv = (int)v;
if(iu < 0) ///< Rounding.
iu -= 2;
else
iu += 2;
if(iv < 0) ///< Rounding.
iv -= 2;
else
iv += 2;
pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu/4) );
pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv/4) );
}//end for xb & yb...
}//end Convert.
#定义RRGB24YUVCI2_00 0.299
#定义RRGB24YUVCI2_01 0.587
#定义RRGB24YUVCI2_02 0.114
#定义RRGB24YUVCI2_10-0.147
#定义RRGB24YUVCI2_11-0.289
#定义RRGB24YUVCI2_12 0.436
#定义RRGB24YUVCI2_20 0.615
#定义RRGB24YUVCI2_21-0.515
#定义RRGB24YUVCI2_22-0.100
void RealRGB24toYUV420Converter::Convert(void*pRgb、void*pY、void*pU、void*pV)
{
yuvType*py=(yuvType*)py;
yuvType*pu=(yuvType*)pu;
yuvType*pv=(yuvType*)pv;
无符号字符*src=(无符号字符*)pRgb;
///Y的范围为0..255,U&V的范围为-128..127。
双u,v;
双r,g,b;
///步进2x2像素块(每个块4像素)。
int xBlks=_width>>1;
int yBlks=_高度>>1;
for(int-yb=0;yb1;
int yBlks=_高度>>1;
for(int-yb=0;yb2);
pv[chrOff]=(yuvType)(_chrOff+RRGB24YUVCI2_RANGECHECK_N128TO127(iv>>2));
}//结束xb和yb。。。
}
这是我第一次尝试SSE2,所以可能我遗漏了什么?仅供参考,我正在使用Visual Studio 2008在Windows平台上工作。有几个问题:
- 您正在使用未对齐的负载-这些负载非常昂贵(除了Nehalem aka Core i5/Core i7上的负载)-至少是对齐加载成本的2倍-如果加载后有大量计算,则可以摊销成本,但在这种情况下,您的计算量相对较少。您可以通过对齐这些16字节并使用对齐加载来修复来自bgr1、bgr2等的加载。[更好的是,根本不要使用这些中间数组-直接将数据从内存加载到SSE寄存器,并使用SIMD执行所有洗牌等操作-见下文]
- 您在标量和SIMD代码之间来回奔波-就性能而言,标量代码可能是占主导地位的部分,因此SIMD的任何收益都会被这一点所淹没-您确实需要使用SIMD指令在循环中做所有事情(即,摆脱标量代码)
如果您可以更改编译器,您可以尝试“英特尔编译器Windows版”。我怀疑它会更好,尤其是对于内联汇编代码,但它确实值得一看。我发现您的方法存在一些问题: < > > C++版本从指针T加载到“双R,G,B”,并且编译器很可能直接将这些加载到FP寄存器中,即“双R,G,B”在运行时在寄存器中生存。但是在您的版本中,加载到“浮标BGR0/1/2/3”中,然后调用“MyLyLuxUpPS”。如果“浮动BGR0/ 1/2/3”,我不会感到惊讶。在内存中,这意味着您有额外的读写内存
你最好的选择是找到一个现有的、优化的RGB/YUV转换库并使用它。你好,保罗,谢谢你的回答。我已经将所有数组修改为16字节对齐,现在我使用的是_mm_load_ps而不是_mm_loadu_ps。但到目前为止,我看不到任何明显的区别。关于你的第二个建议,请原谅我的无知:呵呵w我可以避免在标量和SIMD代码之间切换吗?我不明白如何摆脱标量代码。@拉尔夫:这是一个棘手的部分,即思考SIMD的方法,以了解如何使用标量代码。理想情况下,您应该从内存直接将数据加载到SSE寄存器,然后将元素重新组织到所需的arr中angement,进行计算,重新组织到所需的输出排列,然后直接从SSE寄存器存储到内存中。如果您有SSSE3(又名SSE3.5)或更好,则元素的洗牌更容易(PSHUFB)-对于SSE3和更早版本,这仍然是可能的,但有点棘手,因为可用的洗牌指令有限。嗨,约翰,我尝试了对齐数据,不幸的是没有用。谢谢你的回答。嗯……你是否只尝试对齐?因为你试图通过_mm_loadu_ps(float*)将数据加载到xmm寄存器(它映射到MOVUPS指令),您告诉处理器加载未对齐的数据。仅对齐数据是不够的,您必须使用适当的指令。对于您的情况,它是_mm_load_ps(float*)(它映射到MOVAPS指令)。如果此功能失败,则表示您的对齐有问题。感谢您的回复John,现在才看到它…是的,我确实更改了所有说明以使用_mm_load_ps,但似乎没有任何区别。感谢您的反馈
const float fRRGB24YUVCI2_00 = 0.299;
const float fRRGB24YUVCI2_01 = 0.587;
const float fRRGB24YUVCI2_02 = 0.114;
const float fRRGB24YUVCI2_10 = -0.147;
const float fRRGB24YUVCI2_11 = -0.289;
const float fRRGB24YUVCI2_12 = 0.436;
const float fRRGB24YUVCI2_20 = 0.615;
const float fRRGB24YUVCI2_21 = -0.515;
const float fRRGB24YUVCI2_22 = -0.100;
void RealRGB24toYUV420Converter::Convert(void* pRgb, void* pY, void* pU, void* pV)
{
__m128 xmm_y = _mm_loadu_ps(fCOEFF_0);
__m128 xmm_u = _mm_loadu_ps(fCOEFF_1);
__m128 xmm_v = _mm_loadu_ps(fCOEFF_2);
yuvType* py = (yuvType *)pY;
yuvType* pu = (yuvType *)pU;
yuvType* pv = (yuvType *)pV;
unsigned char* src = (unsigned char *)pRgb;
/// Y have range 0..255, U & V have range -128..127.
float bgr1[4];
bgr1[3] = 0.0;
float bgr2[4];
bgr2[3] = 0.0;
float bgr3[4];
bgr3[3] = 0.0;
float bgr4[4];
bgr4[3] = 0.0;
/// Step in 2x2 pel blocks. (4 pels per block).
int xBlks = _width >> 1;
int yBlks = _height >> 1;
for(int yb = 0; yb < yBlks; yb++)
for(int xb = 0; xb < xBlks; xb++)
{
int chrOff = yb*xBlks + xb;
int lumOff = (yb*_width + xb) << 1;
unsigned char* t = src + lumOff*3;
bgr1[2] = (float)*t++;
bgr1[1] = (float)*t++;
bgr1[0] = (float)*t++;
bgr2[2] = (float)*t++;
bgr2[1] = (float)*t++;
bgr2[0] = (float)*t++;
t = t + _width*3 - 6;
bgr3[2] = (float)*t++;
bgr3[1] = (float)*t++;
bgr3[0] = (float)*t++;
bgr4[2] = (float)*t++;
bgr4[1] = (float)*t++;
bgr4[0] = (float)*t++;
__m128 xmm1 = _mm_loadu_ps(bgr1);
__m128 xmm2 = _mm_loadu_ps(bgr2);
__m128 xmm3 = _mm_loadu_ps(bgr3);
__m128 xmm4 = _mm_loadu_ps(bgr4);
// Y
__m128 xmm_res_y = _mm_mul_ps(xmm1, xmm_y);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] ));
// Y
xmm_res_y = _mm_mul_ps(xmm2, xmm_y);
py[lumOff + 1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] ));
lumOff += _width;
// Y
xmm_res_y = _mm_mul_ps(xmm3, xmm_y);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] ));
// Y
xmm_res_y = _mm_mul_ps(xmm4, xmm_y);
py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] ));
// U
__m128 xmm_res = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(xmm1, xmm_u), _mm_mul_ps(xmm2, xmm_u)),
_mm_add_ps(_mm_mul_ps(xmm3, xmm_u), _mm_mul_ps(xmm4, xmm_u))
);
float fU = xmm_res.m128_f32[0] + xmm_res.m128_f32[1] + xmm_res.m128_f32[2];
// V
xmm_res = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(xmm1, xmm_v), _mm_mul_ps(xmm2, xmm_v)),
_mm_add_ps(_mm_mul_ps(xmm3, xmm_v), _mm_mul_ps(xmm4, xmm_v))
);
float fV = xmm_res.m128_f32[0] + xmm_res.m128_f32[1] + xmm_res.m128_f32[2];
/// Average the 4 chr values.
int iu = (int)fU;
int iv = (int)fV;
if(iu < 0) ///< Rounding.
iu -= 2;
else
iu += 2;
if(iv < 0) ///< Rounding.
iv -= 2;
else
iv += 2;
pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu >> 2) );
pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv >> 2) );
}//end for xb & yb...
}