Optimization: SSE alpha blending of pre-multiplied ARGB

Tags: optimization, graphics, x86, sse, simd

I'm trying to write an SSE-enabled alpha compositor, and this is what I've come up with. First, the code that blends two vectors of 4 pre-multiplied ARGB pixels each:

#include <immintrin.h>   // SSSE3 / SSE4.1 intrinsics (pshufb, pmovzxbw, ...)
#include <stdint.h>

// alpha blend two 128-bit (16 byte) SSE vectors containing 4 pre-multiplied ARGB values each
//
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
    // shuffle masks for alpha and 255 vector for 255-alpha
    //
    // NOTE: storing static __m128i here with _mm_set_si128 was _very_ slow, compiler doesn't seem
    // to know it can store this as a const, so it had guard variables and did real static initialization,
    // stick with arrays.
    //
    static const uint64_t allo[2] __attribute__((aligned(16))) = { 0x03ff03ff03ff03ff, 0x07ff07ff07ff07ff };
    static const uint64_t alhi[2] __attribute__((aligned(16))) = { 0x0bff0bff0bff0bff, 0x0fff0fff0fff0fff };
    static const uint64_t m255[2] __attribute__((aligned(16))) = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };

    // replicate top two pixels from under
    __m128i underhi = (__m128i)_mm_movehl_ps(
        (__m128)under,
        (__m128)under
    );

    __m128i u16_0 = _mm_cvtepu8_epi16(under);                   // convert 8-bit fields to 16-bit with zero extension
    __m128i u16_1 = _mm_cvtepu8_epi16(underhi);  
    __m128i al8_0 = _mm_shuffle_epi8 (over, *(__m128i*)&allo);  // replicate (alpha << 8) to each field
    __m128i al8_1 = _mm_shuffle_epi8 (over, *(__m128i*)&alhi);
    __m128i mal_0 = _mm_sub_epi8     (*(__m128i*)&m255, al8_0); // compute 255-alpha
    __m128i mal_1 = _mm_sub_epi8     (*(__m128i*)&m255, al8_1);
    __m128i mul_0 = _mm_mulhi_epu16  (u16_0, mal_0);            // under*(255-over.alpha)
    __m128i mul_1 = _mm_mulhi_epu16  (u16_1, mal_1);
    __m128i pixel = _mm_packus_epi16 (mul_0, mul_1);

    // add to background pixel with saturation
    return _mm_adds_epu8(over, pixel);   // unsigned saturation (epi8 would clamp at 127)
}
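For reference, the per-pixel operation being vectorized here is the premultiplied-alpha "over" operator, dst = over + under * (255 - over.alpha) / 255. A scalar sketch (blend1 is a hypothetical name, not the question's code; it uses the same shift-by-8 approximation of dividing by 255 that the mulhi trick implements):

// Scalar model of what blend4 does to each byte lane (hypothetical helper).
// The vector code places (255 - a) << 8 in each 16-bit lane and uses pmulhuw,
// which is the same as multiplying by (255 - a) and shifting right by 8.
static inline uint32_t blend1(uint32_t under, uint32_t over)
{
    uint32_t a = over >> 24;                    // alpha of the (premultiplied) over pixel
    uint32_t result = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t u = (under >> shift) & 0xff;
        uint32_t o = (over  >> shift) & 0xff;
        uint32_t c = o + ((u * (255 - a)) >> 8); // >>8 approximates /255, like the SSE path
        if (c > 255) c = 255;                    // saturating add, like _mm_adds_epu8
        result |= c << shift;
    }
    return result;
}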
Not a direct answer to your question, but this was too long for a comment and might be useful to someone.

The trick of shuffling the alpha into the upper half of each 16-bit lane so that _mm_mulhi_epu16 drops the product into the low bits with a single instruction is very neat. My problem is slightly different: I don't have pre-multiplied alpha, and I need to be able to specify an opacity for the whole texture. I extended your code to the following:

__m128i blend4(__m128i under, __m128i over, float opacity) {
    const __m128i alpha16 = _mm_set1_epi16(opacity * 255);
    const __m128i allo = (__m128i) _mm_setr_epi32(0xff03ff03, 0xff03ff03, 0xff07ff07, 0xff07ff07);
    const __m128i alhi = (__m128i) _mm_setr_epi32(0xff0bff0b, 0xff0bff0b, 0xff0fff0f, 0xff0fff0f);
    const __m128i zero = (__m128i) _mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
    const __m128i i255 = (__m128i) _mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);

    __m128i under0 = _mm_cvtepu8_epi16(under);
    __m128i under1 = _mm_unpackhi_epi8(under, zero);
    __m128i over0 = _mm_cvtepu8_epi16(over);
    __m128i over1 = _mm_unpackhi_epi8(over, zero);
    __m128i alpha0 = _mm_mullo_epi16(_mm_shuffle_epi8(over, allo), alpha16);
    __m128i alpha1 = _mm_mullo_epi16(_mm_shuffle_epi8(over, alhi), alpha16);
    __m128i invAlpha0 = _mm_xor_si128(i255, alpha0);
    __m128i invAlpha1 = _mm_xor_si128(i255, alpha1);
    __m128i underMul0 = _mm_mulhi_epu16(under0, invAlpha0);
    __m128i underMul1 = _mm_mulhi_epu16(under1, invAlpha1);
    __m128i overMul0 = _mm_mulhi_epu16(over0, alpha0);
    __m128i overMul1 = _mm_mulhi_epu16(over1, alpha1);
    __m128i underFinal = _mm_packus_epi16(underMul0, underMul1);
    __m128i overFinal = _mm_packus_epi16(overMul0, overMul1);
    return _mm_adds_epu8(overFinal, underFinal);
}

I first shuffle the alpha into the lower half of each lane, so that after the multiply by alpha16 the high bits of the result end up in the upper half of each lane; from there the _mm_mulhi_epu16 trick works as usual. The rest is just plain alpha multiplication.
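Written out for one channel, the fixed-point math each 16-bit lane performs is roughly the following (a scalar sketch with a hypothetical blend_channel helper; the XOR with 0xff00 and the >>16 shifts are the same approximations the vector code uses):

// 'a' is the over pixel's alpha (0..255), 'opacity' is the global opacity (0..1).
static inline uint8_t blend_channel(uint8_t under, uint8_t over, uint8_t a, float opacity)
{
    uint16_t alpha16   = (uint16_t)(opacity * 255);   // _mm_set1_epi16(opacity * 255)
    uint16_t alpha     = (uint16_t)(a * alpha16);     // pshufb puts a in the low byte, pmullw scales it
    uint16_t inv_alpha = alpha ^ 0xff00;              // cheap approximation of 65535 - alpha
    uint32_t u = ((uint32_t)under * inv_alpha) >> 16; // pmulhuw: roughly under * (1 - a*opacity/255)
    uint32_t o = ((uint32_t)over  * alpha)     >> 16; // pmulhuw: roughly over * a*opacity/255
    uint32_t sum = u + o;                             // the vector code uses a saturating add here
    return (uint8_t)(sum > 255 ? 255 : sum);
}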

That looks like a lot of shuffling. Newer Intel CPUs (Haswell and later) have only one vector shuffle unit; your Sandybridge has two 128-bit integer shuffle units. Either way, use _mm_unpackhi_epi8 with a zero vector instead of movhlps + pmovzxbw. (With AVX the compiler can keep reusing the same zeroed register, and even without it, creating one with xorps is as cheap as a NOP on Sandybridge-family.)

Using unpckhi/unpcklo instead of movhlps + pmovzxbw raised throughput by about 5%, and it's simpler. Sweet.

For the low half you can still use pmovzxbw: that gives the compiler the option of a copy-and-shuffle, so it can produce the low half that way and then let punpckhbw destroy the original vector in place. (Whichever way round, the same zero vector can be reused, but without AVX something has to destroy (a copy of) under.) Since Sandybridge has no mov-elimination, avoiding a movdqa actually saves vector ALU throughput and front-end bandwidth. Create the zero with _mm_setzero_si128() (or _mm_set1_epi8(0) or whatever; compilers know how to optimize it): after inlining, your compiler will hoist it out of the loop just like any non-special constant.

Have you tried pmaddubsw (_mm_maddubs_epi16)? I think it's basically designed for things like this, but it treats one of its inputs as signed and does signed 16-bit saturation on the sums of the i8 x u8 => i16 products. You could perhaps shift the alpha range from unsigned to signed (xor(v, set1_epi8(0x80))) and correct for the bias at the end, which would let you delay the unpacking.
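In intrinsics, the suggested widening amounts to something like this sketch (which is what the revised code below ends up doing):

__m128i zero  = _mm_setzero_si128();             // a single pxor; reused / hoisted by the compiler
__m128i u16_0 = _mm_cvtepu8_epi16(under);        // low 4 pixels: can be done as a copy-and-shuffle
__m128i u16_1 = _mm_unpackhi_epi8(under, zero);  // high 4 pixels: replaces movhlps + pmovzxbw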
The main loop dispatches to progressively narrower kernels and finishes with a scalar remainder:

    // blend 32/16/8/4 pixels at a time
    ssize_t ii = 0;
    // after each loop, ii counts blocks of the previous width; "ii *= 2" rescales it
    // to blocks of half that width so the next loop picks up where the last one stopped
    for (ii *= 2; ii < len/32; ii++) { blendN<8>(vdst+8*ii, vunder+8*ii, vover+8*ii); }
    for (ii *= 2; ii < len/16; ii++) { blendN<4>(vdst+4*ii, vunder+4*ii, vover+4*ii); }
    for (ii *= 2; ii < len/8;  ii++) { blendN<2>(vdst+2*ii, vunder+2*ii, vover+2*ii); }
    for (ii *= 2; ii < len/4;  ii++) { blendN<1>(vdst+1*ii, vunder+1*ii, vover+1*ii); }

    // handle remainder: convert the 4-pixel block count into a pixel index
    ii *= 4;
    for (; ii < len; ii++) {
        *(pdst+ii) = blend(*(punder+ii), *(pover+ii));
    }
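blendN itself isn't shown in this excerpt; as a rough idea of what it presumably does, here is a hypothetical sketch that simply applies blend4 to N consecutive 4-pixel vectors (the real implementation may be unrolled differently or use wider vectors):

// Hypothetical sketch of blendN: apply blend4 to N consecutive __m128i vectors,
// so blendN<8> handles 32 pixels per call. Not the question's actual code.
template <int N>
static inline void blendN(__m128i *dst, const __m128i *under, const __m128i *over) {
    for (int i = 0; i < N; i++)
        dst[i] = blend4(under[i], over[i]);
}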
With the suggestions applied (plain const vectors instead of static arrays, and unpackhi with a zero vector instead of movhlps + pmovzxbw), blend4 becomes:

__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
    // shuffle masks for alpha and 255 vector for 255-alpha
    //
    // NOTE: storing static __m128i is _very_ slow, compiler doesn't seem to know it can store
    // this as a const, so it had guard variables and did real static initialization. Stick with 
    // just const
    //
    const __m128i allo = (__m128i)_mm_setr_epi32(0x03ff03ff, 0x03ff03ff, 0x07ff07ff, 0x07ff07ff);
    const __m128i alhi = (__m128i)_mm_setr_epi32(0x0bff0bff, 0x0bff0bff, 0x0fff0fff, 0x0fff0fff);
    const __m128i zero = (__m128i)_mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
    const __m128  m255 = (__m128 )_mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);

    __m128i u16_0 =   _mm_cvtepu8_epi16(under);               // convert 8-bit fields to 16-bit with zero extension
    __m128i u16_1 =   _mm_unpackhi_epi8(under, zero);
    __m128i al8_0 =   _mm_shuffle_epi8 (over,  allo);         // replicate (alpha << 8) to each field
    __m128i al8_1 =   _mm_shuffle_epi8 (over,  alhi);
    __m128i mal_0 = (__m128i)_mm_xor_ps(m255, (__m128)al8_0); // compute 255-alpha
    __m128i mal_1 = (__m128i)_mm_xor_ps(m255, (__m128)al8_1);
    __m128i mul_0 =   _mm_mulhi_epu16  (u16_0, mal_0);        // under*(255-over.alpha)
    __m128i mul_1 =   _mm_mulhi_epu16  (u16_1, mal_1);
    __m128i pixel =   _mm_packus_epi16 (mul_0, mul_1);

    // add to background pixel with saturation
    return _mm_adds_epu8(over, pixel);   // unsigned saturation (epi8 would clamp at 127)
}
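For completeness, a hypothetical test driver for blend4 (assumes the includes above and an SSE4.1-capable build, e.g. gcc -O2 -msse4.1):

// Hypothetical usage: composite 4 premultiplied ARGB pixels from over_px onto under_px.
#include <stdio.h>

int main(void) {
    uint32_t under_px[4] = { 0xff0000ff, 0xff00ff00, 0xffff0000, 0xffffffff };
    uint32_t over_px[4]  = { 0x80400000, 0x80004000, 0x80000040, 0x00000000 };

    __m128i u = _mm_loadu_si128((const __m128i *)under_px);
    __m128i o = _mm_loadu_si128((const __m128i *)over_px);
    _mm_storeu_si128((__m128i *)under_px, blend4(u, o));

    for (int i = 0; i < 4; i++)
        printf("%08x\n", under_px[i]);
    return 0;
}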