Optimization: SSE alpha blending of premultiplied ARGB

I am trying to write an SSE-enabled alpha compositor; this is what I came up with. First, the code that blends two vectors of 4 pixels each:
// alpha blend two 128-bit (16 byte) SSE vectors containing 4 pre-multiplied ARGB values each
//
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i here with _mm_set_si128 was _very_ slow, compiler doesn't seem
// to know it can store this as a const, so it had guard variables and did real static initialization,
// stick with arrays.
//
static const uint64_t allo[2] __attribute__((aligned(16))) = { 0x03ff03ff03ff03ff, 0x07ff07ff07ff07ff };
static const uint64_t alhi[2] __attribute__((aligned(16))) = { 0x0bff0bff0bff0bff, 0x0fff0fff0fff0fff };
static const uint64_t m255[2] __attribute__((aligned(16))) = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };
// replicate top two pixels from under
__m128i underhi = (__m128i)_mm_movehl_ps(
(__m128)under,
(__m128)under
);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_cvtepu8_epi16(underhi);
__m128i al8_0 = _mm_shuffle_epi8 (over, *(__m128i*)&allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, *(__m128i*)&alhi);
__m128i mal_0 = _mm_sub_epi8 (*(__m128i*)&m255, al8_0); // compute 255-alpha
__m128i mal_1 = _mm_sub_epi8 (*(__m128i*)&m255, al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
(The code above, restated: it alpha-blends two 128-bit (16-byte) SSE vectors holding 4 premultiplied ARGB pixels each. Shuffle masks extract the alpha, and a 0xff00-per-lane vector is used to form 255-alpha. Storing a static __m128i built with _mm_set_si128 was very slow — the compiler did not treat it as a constant and emitted guard variables plus real static initialization — so plain aligned arrays are used instead.)

Answer: Not a direct answer to your question, but this was too long for a comment and may be useful to someone.

The trick of shuffling the alpha into the upper half of each 16-bit lane, so that _mm_mulhi_epu16 lands the product in the low bits with a single instruction, is very neat. My problem is slightly different: I do not have premultiplied alpha, and I need to be able to specify an opacity for the whole texture. I extended the code to the following:
__m128i blend4(__m128i under, __m128i over, float opacity) {
const __m128i alpha16 = _mm_set1_epi16(alpha * 255);
const __m128i allo = (__m128i) _mm_setr_epi32(0xff03ff03, 0xff03ff03, 0xff07ff07, 0x0ff7ff07);
const __m128i alhi = (__m128i) _mm_setr_epi32(0xff0bff0b, 0xff0bff0b, 0xff0fff0f, 0x0fffff0f);
const __m128i zero = (__m128i) _mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128i i255 = (__m128i) _mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i under0 = _mm_cvtepu8_epi16(under);
__m128i under1 = _mm_unpackhi_epi8(under, zero);
__m128i over0 = _mm_cvtepu8_epi16(over);
__m128i over1 = _mm_unpackhi_epi8(over, zero);
__m128i alpha0 = _mm_mullo_epi16(_mm_shuffle_epi8(over, allo), alpha16);
__m128i alpha1 = _mm_mullo_epi16(_mm_shuffle_epi8(over, alhi), alpha16);
__m128i invAlpha0 = _mm_xor_si128(i255, alpha0);
__m128i invAlpha1 = _mm_xor_si128(i255, alpha1);
__m128i underMul0 = _mm_mulhi_epu16(under0, invAlpha0);
__m128i underMul1 = _mm_mulhi_epu16(under1, invAlpha1);
__m128i overMul0 = _mm_mulhi_epu16(over0, alpha0);
__m128i overMul1 = _mm_mulhi_epu16(over1, alpha1);
__m128i underFinal = _mm_packus_epi16(underMul0, underMul1);
__m128i overFinal = _mm_packus_epi16(overMul0, overMul1);
return _mm_adds_epu8(overFinal, underFinal);
}
I first shuffle the alpha into the lower half of each lane, so that after the multiply by alpha16 the high bits of the result end up in the upper half of each lane; from there the _mm_mulhi_epu16 trick works as usual. The rest is just straightforward alpha multiplication.

Comments: That looks like a lot of shuffling. Later Intel CPUs (Haswell and up) have only one vector shuffle unit; your Sandybridge has two 128-bit integer shuffle units. Either way, use _mm_unpackhi_epi8 with zero instead of movhlps + pmovzxbw. (With AVX the compiler can reuse the same zero vector, and with SSE creating a zero with xorps is as cheap as a NOP on Sandybridge-family.) — Using unpackhi/unpacklo instead of movhlps+pmovzxbw increased throughput by about 5%, and it is simpler; sweet. — For the low half you can still use pmovzxbw. That gives the compiler a copy-and-shuffle opportunity: it can pmovzxbw the low half, then punpckhbw in place, destroying the original vector. (I have used this before; it can reuse the same zero vector, but without AVX it has to destroy (a copy of) `under`.) Since Sandybridge has no mov-elimination, avoiding a movdqa actually saves vector-ALU throughput and front-end bandwidth. Use _mm_setzero_si128() (or _mm_set1_epi8(0), or whatever; the compiler knows how to optimize it) — your compiler will hoist it out of the loop after inlining, just like a non-special constant. Have you tried it (presumably pmaddubsw)? I think it is basically designed for things like this, but it treats one of its inputs as signed and does signed 16-bit saturation on the sum of the i8×u8 => i16 products. You could possibly range-shift the alpha from unsigned to signed (xor(v, set1_epi8(0x80))) and correct the bias at the end, and you can delay the unpacking.
// blend 32/16/8/4 pixels at a time
//
// NOTE(review): blendN<K> (blends K __m128i vectors = 4*K pixels per call), blend,
// vdst/vunder/vover (__m128i*) and pdst/punder/pover (pixel pointers over the same
// buffers) are defined elsewhere — presumably vdst aliases pdst; confirm in caller.
// Each tier halves the granularity, so the index unit halves too: `ii *= 2`
// rescales the position reached by the previous tier into the new unit.
ssize_t ii=0;
for (ii *= 2; ii < len/32; ii++) { blendN<8>(vdst+8*ii, vunder+8*ii, vover+8*ii); }
for (ii *= 2; ii < len/16; ii++) { blendN<4>(vdst+4*ii, vunder+4*ii, vover+4*ii); }
for (ii *= 2; ii < len/8; ii++) { blendN<2>(vdst+2*ii, vunder+2*ii, vover+2*ii); }
for (ii *= 2; ii < len/4; ii++) { blendN<1>(vdst+1*ii, vunder+1*ii, vover+1*ii); }
// handle remainder
// convert from single-vector index (4 pixels each) to a pixel index
ii *= 4;
// scalar tail loop for the last len % 4 pixels
for (; ii < len; ii++) {
*(pdst+ii) = blend(*(punder+ii), *(pover+ii));
}
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i is _very_ slow, compiler doesn't seem to know it can store
// this as a const, so it had guard variables and did real static initialization. Stick with
// just const
//
const __m128i allo = (__m128i)_mm_setr_epi32(0x03ff03ff, 0x03ff03ff, 0x07ff07ff, 0x07ff07ff);
const __m128i alhi = (__m128i)_mm_setr_epi32(0x0bff0bff, 0x0bff0bff, 0x0fff0fff, 0x0fff0fff);
const __m128i zero = (__m128i)_mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128 m255 = (__m128 )_mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_unpackhi_epi8(under, zero);
__m128i al8_0 = _mm_shuffle_epi8 (over, allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, alhi);
__m128i mal_0 = (__m128i)_mm_xor_ps(m255, (__m128)al8_0); // compute 255-alpha
__m128i mal_1 = (__m128i)_mm_xor_ps(m255, (__m128)al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
__m128i blend4(__m128i under, __m128i over, float opacity) {
const __m128i alpha16 = _mm_set1_epi16(alpha * 255);
const __m128i allo = (__m128i) _mm_setr_epi32(0xff03ff03, 0xff03ff03, 0xff07ff07, 0x0ff7ff07);
const __m128i alhi = (__m128i) _mm_setr_epi32(0xff0bff0b, 0xff0bff0b, 0xff0fff0f, 0x0fffff0f);
const __m128i zero = (__m128i) _mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128i i255 = (__m128i) _mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i under0 = _mm_cvtepu8_epi16(under);
__m128i under1 = _mm_unpackhi_epi8(under, zero);
__m128i over0 = _mm_cvtepu8_epi16(over);
__m128i over1 = _mm_unpackhi_epi8(over, zero);
__m128i alpha0 = _mm_mullo_epi16(_mm_shuffle_epi8(over, allo), alpha16);
__m128i alpha1 = _mm_mullo_epi16(_mm_shuffle_epi8(over, alhi), alpha16);
__m128i invAlpha0 = _mm_xor_si128(i255, alpha0);
__m128i invAlpha1 = _mm_xor_si128(i255, alpha1);
__m128i underMul0 = _mm_mulhi_epu16(under0, invAlpha0);
__m128i underMul1 = _mm_mulhi_epu16(under1, invAlpha1);
__m128i overMul0 = _mm_mulhi_epu16(over0, alpha0);
__m128i overMul1 = _mm_mulhi_epu16(over1, alpha1);
__m128i underFinal = _mm_packus_epi16(underMul0, underMul1);
__m128i overFinal = _mm_packus_epi16(overMul0, overMul1);
return _mm_adds_epu8(overFinal, underFinal);
}