Visual studio 2010 SSE绝对差和等效代码
我在H264AVC编码器/解码器中有这个函数,它被反复调用:Visual studio 2010 SSE绝对差和等效代码,visual-studio-2010,h.264,sse,encoder,decoder,Visual Studio 2010,H.264,Sse,Encoder,Decoder,我在H264AVC编码器/解码器中有这个函数,它被反复调用: UInt XDistortion::xGetSAD16x( XDistSearchStruct* pcDSS ) { XPel* pucCur = pcDSS->pYSearch; XPel* pucOrg = pcDSS->pYOrg; Int iStride = pcDSS->iYStride; Int iRows = pcDSS->iRows; UInt uiSum
UInt XDistortion::xGetSAD16x( XDistSearchStruct* pcDSS )
{
XPel* pucCur = pcDSS->pYSearch;
XPel* pucOrg = pcDSS->pYOrg;
Int iStride = pcDSS->iYStride;
Int iRows = pcDSS->iRows;
UInt uiSum = 0;
for( ; iRows != 0; iRows-- )
{
uiSum += Abs( pucOrg[0x0] - pucCur[0x0] );
uiSum += Abs( pucOrg[0x1] - pucCur[0x1] );
uiSum += Abs( pucOrg[0x2] - pucCur[0x2] );
uiSum += Abs( pucOrg[0x3] - pucCur[0x3] );
uiSum += Abs( pucOrg[0x4] - pucCur[0x4] );
uiSum += Abs( pucOrg[0x5] - pucCur[0x5] );
uiSum += Abs( pucOrg[0x6] - pucCur[0x6] );
uiSum += Abs( pucOrg[0x7] - pucCur[0x7] );
uiSum += Abs( pucOrg[0x8] - pucCur[0x8] );
uiSum += Abs( pucOrg[0x9] - pucCur[0x9] );
uiSum += Abs( pucOrg[0xa] - pucCur[0xa] );
uiSum += Abs( pucOrg[0xb] - pucCur[0xb] );
uiSum += Abs( pucOrg[0xc] - pucCur[0xc] );
uiSum += Abs( pucOrg[0xd] - pucCur[0xd] );
uiSum += Abs( pucOrg[0xe] - pucCur[0xe] );
uiSum += Abs( pucOrg[0xf] - pucCur[0xf] );
pucOrg += MB_BUFFER_WIDTH;
pucCur += iStride;
}
return uiSum;
}
我已将其替换为以下SSE代码,但它不起作用:
UInt XDistortion::xGetSAD16x( XDistSearchStruct* pcDSS )
{
XPel* pucCur = pcDSS->pYSearch;
XPel* pucOrg = pcDSS->pYOrg;
Int iStride = pcDSS->iYStride;
Int iRows = pcDSS->iRows;
__m128i uiSum = _mm_set1_epi32(0);
__m128i x1 = _mm_set1_epi8(MB_BUFFER_WIDTH);
__m128i x2 = _mm_set1_epi8(iStride);
for( ; iRows > 0; iRows -= 4)
{
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x0]), _mm_load_si128((__m128i*)&pucCur[0x0])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x1]), _mm_load_si128((__m128i*)&pucCur[0x1])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x2]), _mm_load_si128((__m128i*)&pucCur[0x2])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x3]), _mm_load_si128((__m128i*)&pucCur[0x3])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x4]), _mm_load_si128((__m128i*)&pucCur[0x4])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x5]), _mm_load_si128((__m128i*)&pucCur[0x5])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x6]), _mm_load_si128((__m128i*)&pucCur[0x6])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x7]), _mm_load_si128((__m128i*)&pucCur[0x7])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x8]), _mm_load_si128((__m128i*)&pucCur[0x8])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x9]), _mm_load_si128((__m128i*)&pucCur[0x9])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xa]), _mm_load_si128((__m128i*)&pucCur[0xa])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xb]), _mm_load_si128((__m128i*)&pucCur[0xb])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xc]), _mm_load_si128((__m128i*)&pucCur[0xc])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xd]), _mm_load_si128((__m128i*)&pucCur[0xd])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xe]), _mm_load_si128((__m128i*)&pucCur[0xe])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xf]), _mm_load_si128((__m128i*)&pucCur[0xf])))));
_mm_store_si128((__m128i*)pucOrg, _mm_add_epi32(_mm_load_si128((__m128i*)pucOrg), _mm_load_si128(&x1)));
_mm_store_si128((__m128i*)pucCur, _mm_add_epi32(_mm_load_si128((__m128i*)pucCur), _mm_load_si128(&x2)));
}
return _mm_extract_epi32(uiSum, 0);
}
我不知道我在转换过程中是否有任何错误,因为我不是SSE方面的专家。
有什么帮助吗?如果你能使用SSE,那就容易多了。我用过SSE,但它不起作用,我不知道为什么@Paul RYou需要更具体一些。哪一部分坏了?(看起来您试图将pucOrg视为128位的值,但结果不太理想。)