Assembly 何时以及如何在simd例程中执行浮点转换？_Assembly_X86_Sse_Simd_Webassembly

Assembly 何时以及如何在simd例程中执行浮点转换？

assembly x86

Assembly 何时以及如何在simd例程中执行浮点转换？,assembly,x86,sse,simd,webassembly,Assembly,X86,Sse,Simd,Webassembly,我在计算两幅图像的双向（水平和垂直）前缀和（扫描），得到像素和、平方和以及两幅图像的叉积。所有的计算都是在32位整数中完成的，在我进入最后一个过程之前，需要将32位整数转换为双倍，以计算窗口函数中两个图像的平均值、方差和协方差 export function weberSsim( pixels1: ImageMatrix, pixels2: ImageMatrix, options: Options ): MSSIMMatrix { // console.time("w

我正在计算两幅图像的双向（水平和垂直）前缀和（扫描），得到像素值之和、平方和以及两幅图像逐像素乘积之和。所有这些计算都以32位整数完成；在进入最后一步之前，需要将32位整数转换为双精度浮点数（double），以便在滑动窗口内计算两幅图像的均值、方差和协方差。

/**
 * Computes a per-window SSIM map, and its mean (MSSIM), for two images from
 * the summed-area tables returned by `weberSumMatrix`.
 *
 * The sum matrix is read as five consecutive planes of `width * height`
 * entries. They are used here, in order, as the window sums of x, y, x*x,
 * y*y and x*y.
 *
 * Windows on the first row and in the first column are handled outside the
 * main loop, because they have no "top" and/or "left" neighbour to subtract.
 *
 * NOTE(review): `data` holds (height - windowSize) * (width - windowSize)
 * entries, while the returned `width` / `height` are those of `pixels1`.
 *
 * @param pixels1 First image; its `width` and `height` are used for both images.
 * @param pixels2 Second image; only its `data` is read.
 * @param options Supplies `bitDepth`, `k1`, `k2` and `windowSize`.
 * @returns The SSIM values in row-major window order plus their mean.
 */
export function weberSsim(
  pixels1: ImageMatrix,
  pixels2: ImageMatrix,
  options: Options
): MSSIMMatrix {
  const { bitDepth, k1, k2, windowSize } = options;
  const dynamicRange = (1 << bitDepth) - 1;
  const c1 = k1 * dynamicRange * (k1 * dynamicRange);
  const c2 = k2 * dynamicRange * (k2 * dynamicRange);

  const data1 = pixels1.data;
  const data2 = pixels2.data;
  const width = pixels1.width;
  const height = pixels1.height;
  const sums = weberSumMatrix(data1, data2, width, height);

  const rows = height - windowSize;
  const cols = width - windowSize;
  const planeSize = width * height;
  // Start offsets of the y, x*x, y*y and x*y planes (the x plane starts at 0).
  const offY = planeSize;
  const offXX = 2 * planeSize;
  const offYY = 3 * planeSize;
  const offXY = 4 * planeSize;

  const ssims = new Array(rows * cols);
  const invArea = 1 / (windowSize * windowSize);
  const reach = windowSize - 1;

  // Turns the five window sums into one SSIM value.
  const score = (
    sumX: number,
    sumY: number,
    sumXX: number,
    sumYY: number,
    sumXY: number
  ): number => {
    const meanX = sumX * invArea;
    const meanY = sumY * invArea;
    const varX = sumXX * invArea - meanX * meanX;
    const varY = sumYY * invArea - meanY * meanY;
    const cov = sumXY * invArea - meanX * meanY;
    const na = 2 * meanX * meanY + c1;
    const nb = 2 * cov + c2;
    const da = meanX * meanX + meanY * meanY + c1;
    const db = varX + varY + c2;
    return (na * nb) / (da * db);
  };

  // Window (0, 0): the table entry at its bottom-right corner is the sum itself.
  const firstBottom = reach * width;
  let total = score(
    sums[firstBottom + reach],
    sums[offY + firstBottom + reach],
    sums[offXX + firstBottom + reach],
    sums[offYY + firstBottom + reach],
    sums[offXY + firstBottom + reach]
  );
  ssims[0] = total;

  // Remaining windows of the first row: only a left column has to be removed.
  for (let w = 1; w < cols; ++w) {
    const inner = w - 1;
    const outer = reach + w;
    const ssim = score(
      sums[firstBottom + outer] - sums[firstBottom + inner],
      sums[offY + firstBottom + outer] - sums[offY + firstBottom + inner],
      sums[offXX + firstBottom + outer] - sums[offXX + firstBottom + inner],
      sums[offYY + firstBottom + outer] - sums[offYY + firstBottom + inner],
      sums[offXY + firstBottom + outer] - sums[offXY + firstBottom + inner]
    );
    ssims[w] = ssim;
    total += ssim;
  }

  for (let h = 1; h < rows; ++h) {
    const bottom = (h + windowSize - 1) * width;
    const top = (h - 1) * width;

    // First window of the row: only the rows above have to be removed.
    const edge = score(
      sums[bottom + reach] - sums[top + reach],
      sums[offY + bottom + reach] - sums[offY + top + reach],
      sums[offXX + bottom + reach] - sums[offXX + top + reach],
      sums[offYY + bottom + reach] - sums[offYY + top + reach],
      sums[offXY + bottom + reach] - sums[offXY + top + reach]
    );
    ssims[h * cols] = edge;
    total += edge;

    // General case: top-left - top-right - bottom-left + bottom-right.
    for (let w = 1; w < cols; ++w) {
      const right = w + windowSize - 1;
      const left = w - 1;
      const ssim = score(
        sums[top + left] - sums[top + right] - sums[bottom + left] + sums[bottom + right],
        sums[offY + top + left] - sums[offY + top + right] - sums[offY + bottom + left] + sums[offY + bottom + right],
        sums[offXX + top + left] - sums[offXX + top + right] - sums[offXX + bottom + left] + sums[offXX + bottom + right],
        sums[offYY + top + left] - sums[offYY + top + right] - sums[offYY + bottom + left] + sums[offYY + bottom + right],
        sums[offXY + top + left] - sums[offXY + top + right] - sums[offXY + bottom + left] + sums[offXY + bottom + right]
      );
      ssims[h * cols + w] = ssim;
      total += ssim;
    }
  }

  const mssim = total / (rows * cols);
  return { data: ssims, width, height, mssim };
}

首先，这是最好的方法吗？还是说我可以直接用双精度构建整个前缀和数组，从而省去转换步骤？

第二，如果这是正确的方法，那么使用打包双精度（packed double）SIMD 操作会带来很多好处吗？我只能有把握地假设一次能处理2个元素。

第三，我应该将数据元素打包在一起，还是保留当前的平面格式？[平面格式是指像素按“分量”分别存放的格式。如果输入是32位RGBA，即8位R、8位G、8位B和8位A，那么打包（packed）格式为RGBARGBA…，而平面格式为RRRRRRRRRRR…GGGGGGGGGGGGGG…BBBBB…AAAA…等等。]

以下是我迄今为止完成的与本主题相关的三个函数。前两个是标量版本，因此更容易阅读和理解其中发生了什么。第三个是函数1的当前SIMD实现。第四个函数（缺失且尚未完成）是本问题的主题，可能会是第二个函数的SIMD实现。

// Builds five 2-D inclusive prefix-sum tables for a pair of 8-bit images and
// returns them in one 64-byte-aligned buffer of 5 * width * height uint32_t.
//
// Plane layout (each plane is width * height entries, row-major):
//   0: x    1: y    2: x*x    3: y*y    4: x*y
// where x is a pixel of pImgData1 and y the matching pixel of pImgData2.
//
// Pass 1 writes per-row running sums; pass 2 accumulates each plane down its
// columns. All arithmetic is unsigned 32-bit and therefore wraps on overflow.
//
// Both image pointers are declared to the compiler as 64-byte aligned.
std::unique_ptr<uint32_t[],boost::alignment::aligned_delete> computeSumMatrixForwardScalar2PassAll(uint8_t const* pImgData1, uint8_t const* pImgData2,
                                                                                                unsigned width, unsigned height)
{
  using namespace simdpp;
  std::unique_ptr<uint32_t[], boost::alignment::aligned_delete> sumArray((uint32_t*)boost::alignment::aligned_alloc(64, 5*width*height*sizeof(uint32_t)));
  auto pSumArray = sumArray.get();
  BOOST_ALIGN_ASSUME_ALIGNED(pImgData1, 64);
  BOOST_ALIGN_ASSUME_ALIGNED(pImgData2, 64);
  BOOST_ALIGN_ASSUME_ALIGNED(pSumArray, 64);

  const unsigned planeSize = width*height;

  // Pass 1: horizontal running sums, one independent scan per row.
#pragma omp for simd
  for (unsigned row = 0; row < height; ++row)
  {
    const unsigned rowStart = row*width;
    uint32_t runX = 0;
    uint32_t runY = 0;
    uint32_t runXX = 0;
    uint32_t runYY = 0;
    uint32_t runXY = 0;
    for (unsigned col = 0; col < width; ++col)
    {
      const unsigned pos = rowStart + col;
      const uint32_t x = pImgData1[pos];
      const uint32_t y = pImgData2[pos];
      runX += x;
      runXX += x*x;
      runY += y;
      runYY += y*y;
      runXY += x*y;
      pSumArray[pos] = runX;
      pSumArray[planeSize + pos] = runY;
      pSumArray[2*planeSize + pos] = runXX;
      pSumArray[3*planeSize + pos] = runYY;
      pSumArray[4*planeSize + pos] = runXY;
    }
  }

  // Pass 2: add every row into the one below it, plane by plane.
  for (unsigned plane = 0; plane < 5; ++plane) {
    const unsigned planeStart = plane*planeSize;
    for (unsigned row = 1; row < height; ++row)
    {
      const unsigned above = planeStart + (row-1)*width;
      const unsigned current = planeStart + row*width;
      for (unsigned col = 0; col < width; ++col) {
        pSumArray[current + col] += pSumArray[above + col];
      }
    }
  }

  return sumArray;
}

std:：unique_ptr计算SummatrixForwardScalar2Passall（uint8_t const*pImgData1，uint8_t const*pImgData2，
无符号宽度、无符号高度）
{
使用名称空间simdpp；
std:：unique_ptr sumArray（（uint32_t*）boost:：alignment:：aligned_alloc（64，5*宽度*高度*大小（uint32_t））；
auto pSumArray=sumArray.get（）；
推进对齐假设对齐（pImgData1，64）；
推进对齐假设对齐（pImgData2，64）；
增强对齐（pSumArray，64）；
//#pramga omp并行专用（h）共享（皮姆格达塔，普苏马拉伊，w）
#用于simd的pragma omp
用于（无符号h=0；h<高度；++h）
{
uint32_t lastValX=0；
uint32_t lastValY=0；
uint32_t lastValXX=0；
uint32_t lastValYY=0；
uint32_t lastValXY=0；
用于（无符号w=0；w


第二：SSIM转换——注意不同的语言——因为我还没有完成它的C++实现。注意，它在其中调用weberSumMatrix，这与上面的函数相同
/**
 * Computes a per-window SSIM map, and its mean (MSSIM), for two images from
 * the summed-area tables returned by `weberSumMatrix`.
 *
 * The sum matrix is read as five consecutive planes of `width * height`
 * entries. They are used here, in order, as the window sums of x, y, x*x,
 * y*y and x*y.
 *
 * Windows on the first row and in the first column are handled outside the
 * main loop, because they have no "top" and/or "left" neighbour to subtract.
 *
 * NOTE(review): `data` holds (height - windowSize) * (width - windowSize)
 * entries, while the returned `width` / `height` are those of `pixels1`.
 *
 * @param pixels1 First image; its `width` and `height` are used for both images.
 * @param pixels2 Second image; only its `data` is read.
 * @param options Supplies `bitDepth`, `k1`, `k2` and `windowSize`.
 * @returns The SSIM values in row-major window order plus their mean.
 */
export function weberSsim(
  pixels1: ImageMatrix,
  pixels2: ImageMatrix,
  options: Options
): MSSIMMatrix {
  const { bitDepth, k1, k2, windowSize } = options;
  const dynamicRange = (1 << bitDepth) - 1;
  const c1 = k1 * dynamicRange * (k1 * dynamicRange);
  const c2 = k2 * dynamicRange * (k2 * dynamicRange);

  const data1 = pixels1.data;
  const data2 = pixels2.data;
  const width = pixels1.width;
  const height = pixels1.height;
  const sums = weberSumMatrix(data1, data2, width, height);

  const rows = height - windowSize;
  const cols = width - windowSize;
  const planeSize = width * height;
  // Start offsets of the y, x*x, y*y and x*y planes (the x plane starts at 0).
  const offY = planeSize;
  const offXX = 2 * planeSize;
  const offYY = 3 * planeSize;
  const offXY = 4 * planeSize;

  const ssims = new Array(rows * cols);
  const invArea = 1 / (windowSize * windowSize);
  const reach = windowSize - 1;

  // Turns the five window sums into one SSIM value.
  const score = (
    sumX: number,
    sumY: number,
    sumXX: number,
    sumYY: number,
    sumXY: number
  ): number => {
    const meanX = sumX * invArea;
    const meanY = sumY * invArea;
    const varX = sumXX * invArea - meanX * meanX;
    const varY = sumYY * invArea - meanY * meanY;
    const cov = sumXY * invArea - meanX * meanY;
    const na = 2 * meanX * meanY + c1;
    const nb = 2 * cov + c2;
    const da = meanX * meanX + meanY * meanY + c1;
    const db = varX + varY + c2;
    return (na * nb) / (da * db);
  };

  // Window (0, 0): the table entry at its bottom-right corner is the sum itself.
  const firstBottom = reach * width;
  let total = score(
    sums[firstBottom + reach],
    sums[offY + firstBottom + reach],
    sums[offXX + firstBottom + reach],
    sums[offYY + firstBottom + reach],
    sums[offXY + firstBottom + reach]
  );
  ssims[0] = total;

  // Remaining windows of the first row: only a left column has to be removed.
  for (let w = 1; w < cols; ++w) {
    const inner = w - 1;
    const outer = reach + w;
    const ssim = score(
      sums[firstBottom + outer] - sums[firstBottom + inner],
      sums[offY + firstBottom + outer] - sums[offY + firstBottom + inner],
      sums[offXX + firstBottom + outer] - sums[offXX + firstBottom + inner],
      sums[offYY + firstBottom + outer] - sums[offYY + firstBottom + inner],
      sums[offXY + firstBottom + outer] - sums[offXY + firstBottom + inner]
    );
    ssims[w] = ssim;
    total += ssim;
  }

  for (let h = 1; h < rows; ++h) {
    const bottom = (h + windowSize - 1) * width;
    const top = (h - 1) * width;

    // First window of the row: only the rows above have to be removed.
    const edge = score(
      sums[bottom + reach] - sums[top + reach],
      sums[offY + bottom + reach] - sums[offY + top + reach],
      sums[offXX + bottom + reach] - sums[offXX + top + reach],
      sums[offYY + bottom + reach] - sums[offYY + top + reach],
      sums[offXY + bottom + reach] - sums[offXY + top + reach]
    );
    ssims[h * cols] = edge;
    total += edge;

    // General case: top-left - top-right - bottom-left + bottom-right.
    for (let w = 1; w < cols; ++w) {
      const right = w + windowSize - 1;
      const left = w - 1;
      const ssim = score(
        sums[top + left] - sums[top + right] - sums[bottom + left] + sums[bottom + right],
        sums[offY + top + left] - sums[offY + top + right] - sums[offY + bottom + left] + sums[offY + bottom + right],
        sums[offXX + top + left] - sums[offXX + top + right] - sums[offXX + bottom + left] + sums[offXX + bottom + right],
        sums[offYY + top + left] - sums[offYY + top + right] - sums[offYY + bottom + left] + sums[offYY + bottom + right],
        sums[offXY + top + left] - sums[offXY + top + right] - sums[offXY + bottom + left] + sums[offXY + bottom + right]
      );
      ssims[h * cols + w] = ssim;
      total += ssim;
    }
  }

  const mssim = total / (rows * cols);
  return { data: ssims, width, height, mssim };
}

导出功能weberSsim(
像素1:ImageMatrix，
像素2:ImageMatrix，
选项：选项
)：MSSIMMatrix{
//控制台时间（“韦伯ssim”）；
常量{bitDepth，k1，k2，windowSize}=options
常数L=（首先是10个案例
用于（设w=1；w// Return acc64 += a32. The a32 is viewed as uint32_t lanes, the accumulator is uint64_t
// Adds the four uint32_t lanes of a32 into the two uint64_t lanes of acc64
// and returns the updated accumulator.
// 64-bit lane 0 receives 32-bit lanes 0 and 1; 64-bit lane 1 receives lanes 2 and 3.
inline __m128i integerAdd( __m128i a32, __m128i acc64 )
{
    // Odd 32-bit lanes (1 and 3), shifted down so they are zero-extended to 64 bits.
    const __m128i oddLanes = _mm_srli_epi64( a32, 32 );
    // Even 32-bit lanes (0 and 2), with the upper half of each 64-bit lane masked off.
    const __m128i evenLanes = _mm_and_si128( a32, _mm_set1_epi64x( UINT_MAX ) );
    return _mm_add_epi64( _mm_add_epi64( acc64, evenLanes ), oddLanes );
}

// Multiplies matching uint32_t lanes of a32 and b32 as full 64-bit products and
// adds them to the two uint64_t lanes of acc64, returning the updated accumulator.
// 64-bit lane 0 receives products 0 and 1; 64-bit lane 1 receives products 2 and 3.
inline __m128i integerFma( __m128i a32, __m128i b32, __m128i acc64 )
{
    // _mm_mul_epu32 only reads the low 32 bits of each 64-bit lane: lanes 0 and 2.
    const __m128i evenProducts = _mm_mul_epu32( a32, b32 );
    // Move lanes 1 and 3 into those low positions to multiply them as well.
    const __m128i oddProducts = _mm_mul_epu32( _mm_srli_epi64( a32, 32 ), _mm_srli_epi64( b32, 32 ) );
    return _mm_add_epi64( _mm_add_epi64( acc64, evenProducts ), oddProducts );
}

// Sums the two uint64_t lanes of the accumulator and returns the total as a double.
// The total is interpreted as unsigned before the conversion.
inline double accumulatedValue( __m128i acc64 )
{
    // Bring the upper 64-bit lane down into the lower position.
    const __m128i upper = _mm_srli_si128( acc64, 8 );
    const __m128i total = _mm_add_epi64( acc64, upper );
    return static_cast<double>( static_cast<uint64_t>( _mm_cvtsi128_si64( total ) ) );
}