Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/swift/16.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C 带SSE4.1内部函数的双线性滤波器_C_Optimization_Filtering_Sse_Intrinsics - Fatal编程技术网

C 带SSE4.1内部函数的双线性滤波器

C 带SSE4.1内部函数的双线性滤波器,c,optimization,filtering,sse,intrinsics,C,Optimization,Filtering,Sse,Intrinsics,我现在正试图找出一个相当快速的双线性过滤函数,一次只对一个过滤样本进行过滤,作为习惯使用内部函数的练习-直到SSE41都可以 到目前为止,我有以下几点: inline __m128i DivideBy255_8xUint16(const __m128i value) { // Blinn 16bit divide by 255 trick but across 8 packed 16bit values const __m128i plus128 = _mm_add_epi16

我正在尝试编写一个速度较快的双线性滤波函数,每次只计算一个滤波样本,以此作为熟悉 intrinsics(编译器内部函数)的练习——SSE4.1 及以下的指令集都可以使用。

到目前为止,我有以下几点:

//  Divides each of the 8 packed unsigned 16-bit lanes of `value` by 255,
//  rounding to nearest, using Blinn's trick:
//
//      t = x + 128;   result = (t + (t >> 8)) >> 8;
//
//  The result is exact for every x in [0, 255*255], i.e. for any product of
//  two 8-bit values. The largest intermediate is 65025 + 128 + 254 = 65407,
//  which still fits in an unsigned 16-bit lane.
//
//  Declared `static inline` because a plain `inline` definition in C does not
//  emit an external definition, so any call the compiler chooses not to inline
//  (e.g. at -O0) would fail to link.
static inline __m128i DivideBy255_8xUint16(const __m128i value)
{
    const __m128i plus128 = _mm_add_epi16(value, _mm_set1_epi16(128));

    //  Both shifts must be logical (srli), not arithmetic (srai): the lanes hold
    //  unsigned values that can exceed 0x7FFF, and an arithmetic shift would
    //  smear the top bit into the result.
    const __m128i plus128ThenDivideBy256 = _mm_srli_epi16(plus128, 8);
    const __m128i partial = _mm_add_epi16(plus128, plus128ThenDivideBy256);

    return _mm_srli_epi16(partial, 8);
}


//  Loads the texel at `texel` and, when it exists, its right-hand neighbour
//  into the low 8 bytes of an SSE register (left texel in bytes 0-3, right
//  texel in bytes 4-7).
//
//  When `has_right_neighbour` is zero only 4 bytes are read and the right
//  texel is left as zero, so the load never runs past the end of a row.
static inline __m128i BilinearSSE41_LoadTexelPair(const uint8_t* texel, int has_right_neighbour)
{
    if (has_right_neighbour)
    {
        return _mm_loadl_epi64((const __m128i*)texel);
    }

    //  Assemble the 32-bit value byte by byte (lowest address in the low byte)
    //  so no unaligned or type-punned 32-bit read is needed.
    const uint32_t packed = (uint32_t)texel[0]
                          | ((uint32_t)texel[1] << 8)
                          | ((uint32_t)texel[2] << 16)
                          | ((uint32_t)texel[3] << 24);
    return _mm_cvtsi32_si128((int)packed);
}


//  Returns one bilinearly filtered sample from an image of 4-byte texels
//  (four 8-bit channels per texel).
//
//      data    first byte of the top-left texel
//      pitch   distance in bytes between the starts of consecutive rows
//      width   image width in texels
//      height  image height in texels
//      u, v    sample position in texel units; floor(u), floor(v) selects the
//              top-left texel of the 2x2 footprint
//
//  The coordinates are clamped to [0, width - 1] x [0, height - 1] (NaN is
//  treated as 0), so no read ever leaves the image. The float clamp is exact
//  as long as width - 1 and height - 1 are exactly representable as float.
//
//  The result packs the four filtered channels in the same byte order as the
//  source texels. Returns 0 for an image with no texels.
//
//  `static inline` for the same reason as the helpers it calls: a plain
//  `inline` definition in C provides no external definition to link against.
static inline uint32_t BilinearSSE41(const uint8_t* data, uint32_t pitch, uint32_t width, uint32_t height, float u, float v)
{
    if (width == 0 || height == 0)
    {
        return 0;
    }

    //  A shuffle index with the high bit set zeroes that destination byte
    const char zero_lane = (char)0x80;

    //  fraction255_uv_i holds fraction_u in its 32-bit lane 0 (byte 0) and
    //  fraction_v in lane 1 (byte 4); splat the relevant byte into the low byte
    //  of every 16-bit word
    const __m128i unpack_fraction_u_mask = _mm_set_epi8(zero_lane, 0, zero_lane, 0, zero_lane, 0, zero_lane, 0, zero_lane, 0, zero_lane, 0, zero_lane, 0, zero_lane, 0);
    const __m128i unpack_fraction_v_mask = _mm_set_epi8(zero_lane, 4, zero_lane, 4, zero_lane, 4, zero_lane, 4, zero_lane, 4, zero_lane, 4, zero_lane, 4, zero_lane, 4);

    //  Zero-extends the 8 channel bytes of two texels to 8 16-bit words
    const __m128i unpack_two_texels_mask = _mm_set_epi8(zero_lane, 7, zero_lane, 6, zero_lane, 5, zero_lane, 4, zero_lane, 3, zero_lane, 2, zero_lane, 1, zero_lane, 0);


    //  Clamp in the float domain before taking floor/fraction. The operand order
    //  of _mm_max_ps matters: when the first operand is NaN the second (zero) is
    //  returned. Only the two low lanes carry data.
    const __m128 max_uv = _mm_set_ps(0.0f, 0.0f, (float)(height - 1), (float)(width - 1));
    const __m128 raw_uv = _mm_set_ps(0.0f, 0.0f, v, u);
    const __m128 uv = _mm_min_ps(_mm_max_ps(raw_uv, _mm_setzero_ps()), max_uv);

    const __m128 floor_uv_f = _mm_floor_ps(uv);
    const __m128 fraction_uv_f = _mm_sub_ps(uv, floor_uv_f);

    //  Scale the fraction to 0..255 and round to nearest: add 0.5, then truncate
    const __m128 fraction255_uv_f = _mm_add_ps(_mm_mul_ps(fraction_uv_f, _mm_set_ps1(255.0f)), _mm_set_ps1(0.5f));
    const __m128i fraction255_uv_i = _mm_cvttps_epi32(fraction255_uv_f);

    const __m128i fraction255_u_i = _mm_shuffle_epi8(fraction255_uv_i, unpack_fraction_u_mask); //  Splat fraction_u*255 across all 16 bit words
    const __m128i fraction255_v_i = _mm_shuffle_epi8(fraction255_uv_i, unpack_fraction_v_mask); //  Splat fraction_v*255 across all 16 bit words

    const __m128i inverse_fraction255_u_i = _mm_sub_epi16(_mm_set1_epi16(255), fraction255_u_i);
    const __m128i inverse_fraction255_v_i = _mm_sub_epi16(_mm_set1_epi16(255), fraction255_v_i);


    //  Integer texel coordinates; already inside the image thanks to the clamp
    const __m128i floor_uv_i = _mm_cvttps_epi32(floor_uv_f);
    const uint32_t u0 = (uint32_t)_mm_extract_epi32(floor_uv_i, 0);
    const uint32_t v0 = (uint32_t)_mm_extract_epi32(floor_uv_i, 1);

    //  On the last column/row the neighbour does not exist. The fraction is 0
    //  there, so the neighbour's weight is 0 and its value is irrelevant: skip
    //  the right texel and reuse row v0 as the second row.
    const int has_right_neighbour = (u0 + 1 < width);
    const uint32_t v1 = (v0 + 1 < height) ? v0 + 1 : v0;

    //  64-bit offsets so pitch * row cannot wrap in 32-bit arithmetic
    const uint64_t column_offset = (uint64_t)u0 << 2;
    const uint8_t* texel_row0 = data + (uint64_t)pitch * v0 + column_offset;
    const uint8_t* texel_row1 = data + (uint64_t)pitch * v1 + column_offset;

    const __m128i row0 = _mm_shuffle_epi8(BilinearSSE41_LoadTexelPair(texel_row0, has_right_neighbour), unpack_two_texels_mask);
    const __m128i row1 = _mm_shuffle_epi8(BilinearSSE41_LoadTexelPair(texel_row1, has_right_neighbour), unpack_two_texels_mask);


    //  Vertical lerp of both columns at once:
    //      row0*(255 - fraction_v)/255 + row1*fraction_v/255
    //  Each term is rounded separately, so the sum can be off by one step.
    const __m128i vlerp0 = DivideBy255_8xUint16(_mm_mullo_epi16(row0, inverse_fraction255_v_i));
    const __m128i vlerp1 = DivideBy255_8xUint16(_mm_mullo_epi16(row1, fraction255_v_i));
    const __m128i vlerp = _mm_adds_epi16(vlerp0, vlerp1);

    //  Horizontal lerp: the left texel (low 4 words) is weighted by
    //  255 - fraction_u; the right texel (high 4 words) is weighted by
    //  fraction_u and then shifted down 8 bytes to line up with the left one.
    //  Only the low 4 words of the sum are meaningful.
    const __m128i hlerp0 = DivideBy255_8xUint16(_mm_mullo_epi16(vlerp, inverse_fraction255_u_i));
    const __m128i hlerp1 = DivideBy255_8xUint16(_mm_srli_si128(_mm_mullo_epi16(vlerp, fraction255_u_i), 8));
    const __m128i hlerp = _mm_adds_epi16(hlerp0, hlerp1);


    //  Pack down to 8bit from 16bit components and return the 32bit texel
    return (uint32_t)_mm_extract_epi32(_mm_packus_epi16(hlerp, hlerp), 0);
}
inline __m128i DivideBy255_8xUint16(const __m128i value)
{
    //  Blinn 的 16 位除以 255 技巧,同时作用于 8 个打包的 16 位值
    const __m128i plus128 = _mm_add_epi16(value, _mm_set1_epi16(128));
    const __m128i plus128ThenDivideBy256 = _mm_srli_epi16(plus128, 8);          //  TODO:这里应该用算术移位还是逻辑移位,或者两者并无区别?
    const __m128i partial = _mm_add_epi16(plus128, plus128ThenDivideBy256);
    const __m128i result = _mm_srli_epi16(partial, 8);                          //  TODO:这里应该用算术移位还是逻辑移位,或者两者并无区别?
    return result;
}
内联uint32双线性SE41(常数uint8*数据、uint32节距、uint32宽度、uint32高度、浮点数u、浮点数v)
{
//TODO:可能有一些我还没有找到的内在方法可以避免使用它们?
//0x80是高位集,这意味着将该组件归零
常量m128i解包分数掩码=_mm_集epi8(0x80,0,0x80,0,0x80,0,0x80,0,0x80,0,0x80,0,0x80,0,0,0x80,0,0x80,0);
常数m128i解包分数v_掩码=_mm_集_epi8(0x80,1,0x80,1,0x80,1,0x80,1,0x80,1,0x80,1,0x80,1,0x80,1,0x80,1);
常量m128i解包两个纹理掩码=\u mm\u集\u epi8(0x80,7,0x80,6,0x80,5,0x80,4,0x80,3,0x80,2,0x80,1,0x80,0);
//TODO:目前可能会浪费两个操作通道
常数m128i大小=_毫米_集_epi32(0,0,高度-1,宽度-1);
常数m128 uv=_mm_set_ps(0.0f,0.0f,v,u);
施工图m128地板uv f=_mm_地板ps(uv);
常数m128分数uv f=_mm_sub_ps(uv,地板uv f);
常数m128分位数255分位数f=(分位数f,分位数集ps1(255.0f));
const uu m128i fraction255_uv_i=_mm_cvttps_epi32(fraction255_uv_f);//TODO:这四舍五入是否正确?
const uuum128i fraction255 uuui=mm shuffle epi8(fraction255 uv i,解包分数u u掩码);//在所有16位字上显示分数u*255
const uuu m128i fraction255_v_i=_mm_shuffle_epi8(fraction255_uv_i,unpack_fraction_v_mask);//在所有16位字上显示分数_v*255
常数m128i反分位数255 i=mm_usub_epi16(mm_uset1_epi16(255),分位数255 i);
常数m128i逆分位数255\u v\u i=\u mm\u sub\u epi16(\u mm\u set1\u epi16(255),分位数255\u v\u i);
常数m128i地板uv i=\u mm\u cvttps\u epi32(地板uv\u f);
const uuu m128i剪裁的u地板u uv i=mm min epu32(地板uv i,大小);//TODO:如果uv小于零,我可能还没有夹紧这个。。。
//TODO:计算SSE寄存器集中的地址可能会更好
int u0=_mm_extract_epi32(地板uv_i,0);
int v0=_mm_extract_epi32(地板uv_i,1);

const uint8_t* row = data + (u0 << 2) + pitch*v0;

关于这段代码本身,我没有什么特别要说明的。不过我曾经用 SSE2 编写过自己的双线性缩放代码,更多细节请参阅相关的 StackOverflow 问题。

在我的代码中,我会预先计算好水平和垂直方向的小数部分(插值权重)以及索引,而不是逐像素计算。我认为这样更快。

在 Core 2 CPU 上,我的代码似乎受限于内存带宽而不是 CPU 运算能力,因此不做预计算可能反而更快。

我注意到您代码中的注释:“TODO:这里应该用算术移位还是逻辑移位,或者两者并无区别?”

算术移位用于有符号整数。逻辑移位用于无符号整数

    0x80000000 >> 4 is 0xf8000000 // Arithmetic shift
    0x80000000 >> 4 is 0x08000000 // Logical shift