C++ 整数除法优化(MSVC C++)
标签:c++, visual-c++, optimization, integer, division。我正在尝试降低一段代码的CPU占用,这段代码消耗了约40%的CPU时间。这部分代码如下:
/**
 * Applies a black/white reference calibration to a 16-bit image.
 *
 * For every element (i, j) of *pMatSrc, with i < size[0] and j < size[1]:
 *
 *     dif   = max(0, src(i, j) - black(i, j))
 *     value = min(65535, dif * nRadioFactor / max(1, white(i, j)))
 *
 * The division is an integer division (truncates toward zero) and a zero
 * white reference is treated as 1 so that it can never divide by zero.
 *
 * The result is written into *pMatDst at the three-dimensional index
 * (0, j, i), i.e. at byte offset j * step[1] + i * step[2].
 * NOTE(review): the source row index ends up in the third destination
 * dimension - confirm this matches the layout the caller allocates.
 *
 * @param pMatSrc       Source image, read as rows of UINT16.
 * @param pMatDst       Destination matrix, written as UINT16.
 * @param pBlack        Black reference, read as rows of UINT16.
 * @param pWhite        White reference, read as rows of UINT16.
 * @param nRadioFactor  Scale factor applied before the division. A negative
 *                      factor yields a negative quotient whose low 16 bits
 *                      are stored, exactly as before.
 *
 * The function returns without touching the destination when any matrix
 * pointer, or the data pointer of any matrix, is null.
 */
void CalibrationFunction(cv::Mat* pMatSrc, cv::Mat* pMatDst,
                         cv::Mat* pBlack, cv::Mat* pWhite, INT32 nRadioFactor)
{
    // All four matrices are dereferenced below, so all four must be valid
    // (the reference matrices used to be dereferenced unchecked).
    if (!pMatSrc || !pMatDst || !pBlack || !pWhite)
        return;
    if (!pMatSrc->data || !pMatDst->data || !pBlack->data || !pWhite->data)
        return;

    // Loop-invariant extents, read once instead of on every iteration.
    const int nRows = pMatSrc->size[0];
    const int nCols = pMatSrc->size[1];

    for (int i = 0; i < nRows; i++)
    {
        // step[0] is applied to the byte pointer before the cast to UINT16*.
        const UINT16* pBlackVal = (const UINT16*)(pBlack->data + i * pBlack->step[0]);
        const UINT16* pWhiteVal = (const UINT16*)(pWhite->data + i * pWhite->step[0]);
        const UINT16* pData = (const UINT16*)(pMatSrc->data + i * pMatSrc->step[0]);

        for (int j = 0; j < nCols; j++)
        {
            const INT32 nDif = (INT32)pData[j] - (INT32)pBlackVal[j];

            // A non-positive difference always calibrates to 0.
            long long nScaled = 0;
            if (nDif > 0)
            {
                const INT32 nWhite = (pWhiteVal[j] > 0) ? (INT32)pWhiteVal[j] : 1;

                // 64-bit intermediate: nDif * nRadioFactor overflows a 32-bit
                // signed integer as soon as the factor exceeds 32767.
                nScaled = (long long)nDif * nRadioFactor / nWhite;
                if (nScaled > 65535)
                    nScaled = 65535;
            }

            *(UINT16*)(pMatDst->data + j * pMatDst->step[1] + i * pMatDst->step[2]) = (UINT16)nScaled;
        }
    }
}
无效校准功能(cv::Mat*pMatSrc、cv::Mat*pMatDst、,
cv::Mat*pBlack,cv::Mat*pWhite,INT32 nRadioFactor)
{
如果(pMatSrc&&pMatDst&&pMatSrc->data&&pMatDst->data)
{
对于(int i=0;ivoid CalibrationRadioSIMD(UINT16* pBlackVal, UINT16* pWhiteVal, UINT16* pData, UINT16* pResult, int size, int nRadioFactor = 2)
{
for (int hop = 0; hop < 100; hop++) {
UINT16* pResultTmp = pResult;
UINT16* pDataTmp = pData;
UINT16* pBlackValTmp = pBlackVal;
UINT16* pWhiteValTmp = pWhiteVal;
__m128i radio;
radio.m128i_i32[0] = nRadioFactor;
radio.m128i_i32[1] = nRadioFactor;
radio.m128i_i32[2] = nRadioFactor;
radio.m128i_i32[3] = nRadioFactor;
for (int j = 0; j < size / 8; j++)
{
//
// nDif = max(0, (*pData) - (*pBlackVal));
//
// 1/ Loads 128-bit value
// Address p must be 16-byte aligned.
// For the unaligned version, see _mm_loadu_si128.
__m128i reg_a = _mm_load_si128((__m128i*)pDataTmp);
__m128i reg_B = _mm_load_si128((__m128i*)pBlackValTmp);
__m128i reg_white = _mm_load_si128((__m128i*)pWhiteValTmp);
pDataTmp += 8;
pBlackValTmp += 8;
pWhiteValTmp += 8;
// 2/ Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit integers of a and saturates.
__m128i reg_diff = _mm_subs_epu16(reg_a, reg_B);
/////////////////////////////////////////////////////////////////////////////
// unpack your vector of 8 x 16 bit unsigned shorts into two vectors of 32 bit unsigned ints, :
__m128i xlo = _mm_unpacklo_epi16(reg_diff, _mm_set1_epi16(0));
__m128i xhi = _mm_unpackhi_epi16(reg_diff, _mm_set1_epi16(0));
// This instruction multiplies two sets of 32-bit signed integers.
__m128i mullo = _mm_mullo_epi32(xlo, radio);
__m128i mulhi = _mm_mullo_epi32(xhi, radio);
// convert each of these vectors to float
__m128 ylo = _mm_cvtepi32_ps(mullo);
__m128 yhi = _mm_cvtepi32_ps(mulhi);
// Meme question que pour xlo et xhi
__m128i i32_whitelo = _mm_unpacklo_epi16(reg_white, _mm_set1_epi16(0));
__m128i i32_whitehi = _mm_unpackhi_epi16(reg_white, _mm_set1_epi16(0));
__m128 f32_white_lo = _mm_cvtepi32_ps(i32_whitelo);
__m128 f32_white_hi = _mm_cvtepi32_ps(i32_whitehi);
__m128 f32_res_lo = _mm_div_ps(ylo, f32_white_lo);
__m128 f32_res_hi = _mm_div_ps(yhi, f32_white_hi);
// Reconvertir en entier 16 bits
__m128i n32_res_lo = _mm_cvtps_epi32(f32_res_lo);
__m128i n32_res_hi = _mm_cvtps_epi32(f32_res_hi);
// Put result into result vector
UINT16* f32_res_lo_i16 = (UINT16*)&n32_res_lo;
UINT16* f32_res_hi_i16 = (UINT16*)&n32_res_hi;
int l = 0;
for (int k = 0; k < 4; k++) {
*(pResultTmp + k + 0) = *(f32_res_lo_i16 + l);
*(pResultTmp + k + 4) = *(f32_res_hi_i16 + l);
l += 2;
}
pResultTmp += 8;
}
}
}
void CalibrationRadioSIMD(UINT16*pBlackVal、UINT16*pWhiteVal、UINT16*pData、UINT16*pResult、int size、int nRadioFactor=2)
{
对于(int-hop=0;hop<100;hop++){
UINT16*PRESLTTMP=PRESLT;
UINT16*pDataTmp=pData;
UINT16*pBlackValTmp=pBlackVal;
UINT16*pWhiteValTmp=pWhiteVal;
__m128i收音机;
radio.m128i_i32[0]=nRadioFactor;
radio.m128i_i32[1]=nRadioFactor;
radio.m128i_i32[2]=nRadioFactor;
radio.m128i_i32[3]=nRadioFactor;
对于(int j=0;j
您可以为此创建一个倒数(即乘法逆元)查找表。如果允许一定的精度损失,这个表的大小约为128 KB。它放不进一级缓存,因此可能需要对其进行预取才能最大限度地提高性能。对于SIMD,这是一个很好的例子
UINT16* pBlackVal = (UINT16*)(pBlack->data + i*pBlack->step[0]);
// Before first loop:
UINT16* blackData = (UINT16*) pBlack->data;
yourType blackStep = (yourType) pBlack->step[0];
...
UINT16* pBlackVal = (UINT16*)(blackData + i*blackStep);
a b c
d e f
g h i
abcdefghi
for r in [0, rowLength-1]
for c in [0, columnLength-1]
// do thing with data[rowLength * r + c] here