Warning: file_get_contents(/data/phpspider/zhask/data//catemap/6/xamarin/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ 整数除法优化MSVC C++;_C++_Visual C++_Optimization_Integer_Division - Fatal编程技术网

C++ 整数除法优化MSVC C++;

C++ 整数除法优化MSVC C++;,c++,visual-c++,optimization,integer,division,C++,Visual C++,Optimization,Integer,Division,我正试图减少我的一部分代码的CPU使用,占用40%的CPU功耗。这一部分是: void CalibrationFunction(cv::Mat* pMatSrc, cv::Mat* pMatDst, cv::Mat* pBlack, cv::Mat* pWhite, INT32 nRadioFactor) { if (pMatSrc && pMatDst && pMatSrc->data &&a

我正在尝试降低一段代码的 CPU 占用,这段代码目前消耗了约 40% 的 CPU 时间。代码如下:

/**
 * Black/white reference calibration of a 16-bit image.
 *
 * For every source element (row i, column j) the function computes
 *
 *     max(0, src - black) * nRadioFactor / max(1, white)
 *
 * limits the result to at most 65535 and stores it as UINT16 in pMatDst.
 * The destination is addressed at byte offset j * step[1] + i * step[2]
 * (index 0 along dimension 0), i.e. the source row index runs along
 * destination dimension 2 and the source column index along dimension 1.
 * NOTE(review): this assumes pMatDst is a 3-D matrix large enough for that
 * addressing, and that pBlack / pWhite have at least the source's size --
 * neither is checked here; confirm against the callers.
 *
 * @param pMatSrc       Source matrix, read as UINT16 rows of size[1] elements.
 * @param pMatDst       Destination matrix, written as UINT16.
 * @param pBlack        Black reference, read as UINT16, same row layout as the source.
 * @param pWhite        White reference, read as UINT16; zero is treated as 1.
 * @param nRadioFactor  Scale factor applied before the division.
 *
 * The function does nothing if any matrix pointer or its data pointer is null.
 */
void CalibrationFunction(cv::Mat* pMatSrc, cv::Mat* pMatDst, 
                         cv::Mat* pBlack, cv::Mat* pWhite, INT32 nRadioFactor)
{
  // All four matrices are dereferenced below, so all four must be valid.
  if (!pMatSrc || !pMatDst || !pBlack || !pWhite)
    return;
  if (!pMatSrc->data || !pMatDst->data || !pBlack->data || !pWhite->data)
    return;

  const int nRows = pMatSrc->size[0];
  const int nCols = pMatSrc->size[1];

  for (int i = 0; i < nRows; i++)
  {
    const UINT16* pBlackRow = (const UINT16*)(pBlack->data + i * pBlack->step[0]);
    const UINT16* pWhiteRow = (const UINT16*)(pWhite->data + i * pWhite->step[0]);
    const UINT16* pSrcRow = (const UINT16*)(pMatSrc->data + i * pMatSrc->step[0]);

    // The source row index selects the position along destination dimension 2;
    // this part of the address is constant for the whole row.
    unsigned char* pDstBase = pMatDst->data + i * pMatDst->step[2];

    for (int j = 0; j < nCols; j++)
    {
      const INT32 nDif = (INT32)pSrcRow[j] - (INT32)pBlackRow[j];
      const INT32 nWhite = pWhiteRow[j] > 0 ? (INT32)pWhiteRow[j] : 1;

      // 64-bit intermediate: up to 65535 * nRadioFactor does not fit in 32 bits
      // once nRadioFactor exceeds ~32768.
      long long nScaled = 0;
      if (nDif > 0)
        nScaled = (long long)nDif * nRadioFactor / nWhite;

      // Only the upper bound is limited, exactly as in the original expression.
      if (nScaled > 65535)
        nScaled = 65535;

      *(UINT16*)(pDstBase + j * pMatDst->step[1]) = (UINT16)nScaled;
    }
  }
}
多线程已经实现:目前使用了 8 个线程。

我研究过使用整数运算的 SIMD,但由于这里存在整数除法,这条路似乎走不通。我也研究过经过优化的整数除法库,但它们似乎只在所有整数都除以同一个分母时才有效,而这里的情况并非如此。

有人有什么思路吗?我对非常规的解决方案持非常开放的态度(例如用另一种语言编译这段代码,再从当前的解决方案中调用它,等等),唯一的限制是:这必须是一个使用 MSVC 编译器的 MSVC 项目。

我注意到 Intel 用 SIMD 实现了整数除法,也许我可以用 Intel 编译器编译这段代码,再从当前解决方案中调用生成的二进制文件(我有 Intel 编译器)。但这种方案只能在我自己的电脑上工作,在我看来“过于定制化”。

编辑

不过,我最终还是成功用上了 SIMD。对于除法,技巧是把一个包含 8 个 16 位整数的向量转换成 2 个各含 4 个 32 位浮点数的向量,这样就可以做除法了,因为 SIMD 提供了浮点除法指令。除法完成后,再转换回 16 位整数得到结果。
使用 SIMD 的新函数快了 6 倍,目前已经足够。我会记住大家的所有评论,因为如果这部分代码再次成为瓶颈,它们可能会派上用场。

编辑2

根据要求,以下是新代码:

/**
 * SSE version of the black/white calibration for one contiguous run of
 * 16-bit values:
 *
 *     pResult[k] = clamp(max(0, pData[k] - pBlackVal[k]) * nRadioFactor
 *                        / max(1, pWhiteVal[k]), 0, 65535)
 *
 * Eight values are processed per iteration; the remaining size % 8 values are
 * handled by a scalar loop that uses the same single-precision arithmetic.
 *
 * The multiplication and division are done in single-precision float (SSE has
 * no integer division), so for large products the result can differ from an
 * exact integer division by one unit. The quotient is truncated toward zero,
 * like integer division.
 *
 * Requires SSE4.1 (_mm_max_epu16, _mm_packus_epi32). The buffers do not need
 * to be 16-byte aligned.
 *
 * @param pBlackVal     Black reference values (size elements).
 * @param pWhiteVal     White reference values (size elements); 0 is treated as 1.
 * @param pData         Input values (size elements).
 * @param pResult       Output buffer (size elements).
 * @param size          Number of elements in each buffer.
 * @param nRadioFactor  Scale factor applied before the division.
 *
 * Does nothing if any pointer is null or size is not positive.
 */
void CalibrationRadioSIMD(UINT16* pBlackVal, UINT16* pWhiteVal, UINT16* pData, UINT16* pResult, int size, int nRadioFactor = 2)
{
    if (!pBlackVal || !pWhiteVal || !pData || !pResult || size <= 0)
        return;

    const __m128i zero = _mm_setzero_si128();
    const __m128i one16 = _mm_set1_epi16(1);
    const __m128 factor = _mm_set1_ps((float)nRadioFactor);
    const __m128 lowest = _mm_setzero_ps();
    const __m128 highest = _mm_set1_ps(65535.0f);

    const int vectorEnd = size - (size % 8);
    int j = 0;

    for (; j < vectorEnd; j += 8)
    {
        // Unaligned loads: nothing guarantees the caller's buffers are 16-byte aligned.
        const __m128i data = _mm_loadu_si128((const __m128i*)(pData + j));
        const __m128i black = _mm_loadu_si128((const __m128i*)(pBlackVal + j));
        __m128i white = _mm_loadu_si128((const __m128i*)(pWhiteVal + j));

        // max(1, white): avoids a division by zero.
        white = _mm_max_epu16(white, one16);

        // max(0, data - black) via unsigned saturating subtraction.
        const __m128i diff = _mm_subs_epu16(data, black);

        // Zero-extend the 8 x 16-bit lanes to 2 x (4 x 32-bit) and convert to float.
        const __m128 diffLo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(diff, zero));
        const __m128 diffHi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(diff, zero));
        const __m128 whiteLo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(white, zero));
        const __m128 whiteHi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(white, zero));

        // Multiply in float so that diff * factor cannot wrap a 32-bit integer.
        __m128 resLo = _mm_div_ps(_mm_mul_ps(diffLo, factor), whiteLo);
        __m128 resHi = _mm_div_ps(_mm_mul_ps(diffHi, factor), whiteHi);

        // Clamp to the UINT16 range before converting back.
        resLo = _mm_min_ps(_mm_max_ps(resLo, lowest), highest);
        resHi = _mm_min_ps(_mm_max_ps(resHi, lowest), highest);

        // Truncate (like integer division), then narrow 2 x 4 x 32-bit to 8 x 16-bit.
        const __m128i outLo = _mm_cvttps_epi32(resLo);
        const __m128i outHi = _mm_cvttps_epi32(resHi);
        _mm_storeu_si128((__m128i*)(pResult + j), _mm_packus_epi32(outLo, outHi));
    }

    // Scalar tail for the last size % 8 elements.
    for (; j < size; j++)
    {
        const UINT16 diff = pData[j] > pBlackVal[j] ? (UINT16)(pData[j] - pBlackVal[j]) : (UINT16)0;
        const UINT16 white = pWhiteVal[j] != 0 ? pWhiteVal[j] : (UINT16)1;

        float quotient = (float)diff * (float)nRadioFactor / (float)white;
        if (quotient < 0.0f)
            quotient = 0.0f;
        if (quotient > 65535.0f)
            quotient = 65535.0f;

        pResult[j] = (UINT16)quotient;
    }
}
void CalibrationRadioSIMD(UINT16*pBlackVal、UINT16*pWhiteVal、UINT16*pData、UINT16*pResult、int size、int nRadioFactor=2)
{
对于(int-hop=0;hop<100;hop++){
UINT16*PRESLTTMP=PRESLT;
UINT16*pDataTmp=pData;
UINT16*pBlackValTmp=pBlackVal;
UINT16*pWhiteValTmp=pWhiteVal;
__m128i收音机;
radio.m128i_i32[0]=nRadioFactor;
radio.m128i_i32[1]=nRadioFactor;
radio.m128i_i32[2]=nRadioFactor;
radio.m128i_i32[3]=nRadioFactor;
对于(int j=0;j
您可以为此建立一张倒数(即乘法逆元)查找表。如果允许一定的精度损失,这张表大约只需 128 KB。它放不进一级缓存,因此可能需要对其进行预取,才能最大限度地提高性能。对于 SIMD 来说,这是一个很好的用例。
UINT16* pBlackVal = (UINT16*)(pBlack->data + i*pBlack->step[0]);
// Before first loop:
UINT16* blackData = (UINT16*) pBlack->data;
yourType blackStep = (yourType) pBlack->step[0];

...

UINT16* pBlackVal = (UINT16*)(blackData + i*blackStep);
a b c
d e f
g h i
abcdefghi
// Row-major walk over a flattened 2-D array.
// rowLength    = number of elements in one row    (bound for the column index c)
// columnLength = number of elements in one column (bound for the row index r)
for r in [0, columnLength-1]
    for c in [0, rowLength-1]
        // do thing with data[rowLength * r + c] here