C 优化元素2^x-1的乘法_C_Simd_Avx

C 优化元素2^x-1的乘法

C 优化元素2^x-1的乘法,c,simd,avx,C,Simd,Avx,是否有任何已知优化用于将已知为2^x-1（1，3，7…）的几个（3到5）字节（int8）相乘这是在字节数组多次乘以（2^x-1）/2^x的情况下。除法很简单（为右移添加指数），但分子有点麻烦此外，指数x仅在1..31中，且所有指数之和始终小于32 // In reality there are 16 of these (i.e. a[16], b[16], c[16]) // ( a + b + c ) < 32 char a = 2; char b = 16; char c =

是否有任何已知优化用于将已知为2^x-1（1，3，7…）的几个（3到5）字节（int8）相乘

这是在字节数组多次乘以（2^x-1）/2^x的情况下。除法很简单（为右移添加指数），但分子有点麻烦

此外，指数x仅在1..31中，且所有指数之和始终小于32

// Illustrative fragment from the question (not a complete program).
// In reality there are 16 of these (i.e. a[16], b[16], c[16])
// ( a + b + c ) < 32
// NOTE(review): the surrounding text states this bound for the exponents x;
// a, b and c here appear to hold the powers 2^x themselves (2, 16, 8) -- confirm.
char  a = 2;
char  b = 16;
char  c = 8;

// Ratio/scale, there are 16 of these (i.e. r[16])
// It might work storing in log2 and using int8 or int16
// with fixed point approximation
// <x?> is a placeholder: the storage type of r is the open question.
// NOTE(review): read as literal C with the char operands above, this is an
// integer division (105 / 256 == 0); a floating-point or fixed-point
// evaluation is presumably intended.
<x?>  r = ( a - 1 ) * ( b - 1 ) * ( c - 1 ) / ( a * b * c );

// Big original value, just one
int   v = 1234567890;
// This might be done by scaling down to log2, too
// it is used for a comparison only
// doesn't need full 32b precision
// This is also 16 values, of course (i.e. rv[16])
int  rv = v * r;

// 实际上有16组（即a[16]、b[16]、c[16]）
// ( a + b + c ) < 32
char  a = 2;
char  b = 16;
char  c = 8;
// 比率/缩放系数，同样有16个（即r[16]）
// 也许可以以log2形式存储，并使用int8或int16
// 做定点近似
<x?>  r = ( a - 1 ) * ( b - 1 ) * ( c - 1 ) / ( a * b * c );
// 原始的大数值，只有一个
int   v = 1234567890;
// 这也可以通过缩放到log2来完成
// 它仅用于比较
// 不需要完整的32位精度
// 当然，这也是16个值（即rv[16]）
int  rv = v * r;

这不是很简单吗

a * (2^x - 1) = (a << x) - a

a*(2^x-1) = (a << x) - a

你考虑过使用一个简单的预计算查找表吗？如果我正确理解你的问题，x0、x1 和 x2 总是在 1 到 31 之间，各自可以用 5 位存储，因此只有 2^15 = 32768 种组合。这意味着 r 可以通过几次移位和按位 OR 算出索引，再在一个相当小的表中做一次查找得到。
当然，此表格查找无法矢量化。
我所看到的是（与您上次的计算有点相反）：
请注意，展开式中的所有项都是2的幂，根据您的约束，所有指数都小于32。当然，所有32个可能的项都可以“预计算”。然后，只需将这些项中的 2^j 个相加即可（3 ≤ j ≤ 5）。

坦率地说，该函数不适合缺少整数运算的AVX指令集。SSE2或AVX2提供的直接整数左移位几乎肯定是最快的方法。但是，从您对Aleksander Z.答案的评论来看，我想您正在评估替代方法
要把这个问题硬塞给AVX单元，我们需要在数字的表示方式上发挥一些创意。通过未对齐的加载和按位屏蔽，我们可以把单个字节值放到32位浮点数的最高字节中，而决定 2^n 次幂的指数字段正位于该字节中
这几乎可以得到所需的幂函数，只是我们错过了指数字段的最低有效位（指数因此被加倍），需要用平方根来修正。同样，我们还需要通过一次乘法消除指数偏置
无论如何，请查看下面的代码以了解详细信息，因为在这里逐字重复注释没有什么意义。请注意，未对齐的加载会读取（但随后忽略）数组起始位置之前最多三个字节，因此请根据需要添加填充。还要注意，结果是交错存放的：result1 对应字节 {0,4,8,12,…}，依此类推
哦，很明显，结果将是近似值，因为使用了浮点运算
/*
 * Accumulate, per byte lane, the product of (2^x - 1) / 2^x = 1 - 2^-x over
 * `len` rows of 32 exponent bytes, using AVX float operations only.
 *
 * ptr: first of `len` consecutive 32-byte rows; each byte is an exponent x.
 *      Every row is also read starting three bytes before its first byte, so
 *      up to three bytes in front of the array must be readable (add padding
 *      as needed). Those extra bytes are masked off and never affect results.
 * len: number of rows. Zero is accepted and performs no loads at all.
 *
 * The outputs are interleaved: result1 holds the products for bytes
 * {0,4,8,...} of the rows, result2 for bytes {1,5,9,...}, and so on.
 * Results are approximate because _mm256_rsqrt_ps is an approximation.
 * NOTE(review): x == 0 masks to 0.0f, whose reciprocal square root is
 * presumably +inf, so that lane would turn into -inf -- keep x >= 1.
 */
void compute(const unsigned char (*ptr)[32], size_t len) {
    /* Keeps the low six bits of the top byte of every 32-bit lane. */
    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x3F000000));
    /* Bit pattern of 2^127: multiplying by it cancels the exponent bias. */
    const __m256 normalize = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F000000));
    const __m256 offset = _mm256_set1_ps(1);

    __m256 result1 = _mm256_set1_ps(1);
    __m256 result2 = _mm256_set1_ps(1);
    __m256 result3 = _mm256_set1_ps(1);
    __m256 result4 = _mm256_set1_ps(1);

    /* Pre-tested loop: the former do/while ran once for len == 0 and then
     * wrapped --len around to SIZE_MAX, reading far out of bounds. */
    for (; len != 0; ++ptr, --len) {
        // Mask out every fourth byte into a separate variable using unaligned
        // loads to simulate 8-to-32 bit integer unpacking
        __m256 real1 = _mm256_loadu_ps((const float *) &ptr[0][-3]);
        __m256 real2 = _mm256_loadu_ps((const float *) &ptr[0][-2]);
        __m256 real3 = _mm256_loadu_ps((const float *) &ptr[0][-1]);
        __m256 real4 = _mm256_loadu_ps((const float *) &ptr[0][-0]);
        real1 = _mm256_and_ps(real1, mask);
        real2 = _mm256_and_ps(real2, mask);
        real3 = _mm256_and_ps(real3, mask);
        real4 = _mm256_and_ps(real4, mask);
        // The masked top bytes, interpreted as IEEE-754 exponent bytes, encode
        // 2^2x * 2^-BIAS: the byte sits one bit above the start of the
        // exponent field, hence the doubled exponent. Multiply the bias away
        // first, leaving 2^2x.
        real1 = _mm256_mul_ps(real1, normalize);
        real2 = _mm256_mul_ps(real2, normalize);
        real3 = _mm256_mul_ps(real3, normalize);
        real4 = _mm256_mul_ps(real4, normalize);
        // Use a fast approximate reciprocal square root to halve the exponent,
        // yielding ~1/2^x.
        // You'd think this case of the reciprocal lookup table would be
        // precise, yet it seems not to be. Perhaps twiddling the rounding
        // mode or biasing the values may make it so.
        real1 = _mm256_rsqrt_ps(real1);
        real2 = _mm256_rsqrt_ps(real2);
        real3 = _mm256_rsqrt_ps(real3);
        real4 = _mm256_rsqrt_ps(real4);
        // Compute (2^x-1)/2^x as 1-1/2^x
        real1 = _mm256_sub_ps(offset, real1);
        real2 = _mm256_sub_ps(offset, real2);
        real3 = _mm256_sub_ps(offset, real3);
        real4 = _mm256_sub_ps(offset, real4);
        // Finally multiply the running products
        result1 = _mm256_mul_ps(result1, real1);
        result2 = _mm256_mul_ps(result2, real2);
        result3 = _mm256_mul_ps(result3, real3);
        result4 = _mm256_mul_ps(result4, real4);
    }

    /*
     * Do something useful with result1..4 here
     */
}

像往常一样，问题要求优化解决方案，而不是建议如何解决原始问题。这很烦人
仅使用了32个唯一的乘数mx：
m0 = (2^0 - 1) / 2^0 = 0/1 = 0

m1 = (2^1 - 1) / 2^1 = 1/2 = 0.5

m2 = (2^2 - 1) / 2^2 = 3/4 = 0.75

m3 = (2^3 - 1) / 2^3 = 7/8 = 0.875

m4 = (2^4 - 1) / 2^4 = 15/16 = 0.9375

m5 = (2^5 - 1) / 2^5 = 31/32 = 0.96875

m6 = (2^6 - 1) / 2^6 = 63/64 = 0.984375

m7 = (2^7 - 1) / 2^7 = 127/128 = 0.9921875

m8 = (2^8 - 1) / 2^8 = 255/256 = 0.99609375

m9 = (2^9 - 1) / 2^9 = 511/512 = 0.998046875

m10 = (2^10 - 1) / 2^10 = 1023/1024 = 0.9990234375

m11 = (2^11 - 1) / 2^11 = 2047/2048 = 0.99951171875

m12 = (2^12 - 1) / 2^12 = 4095/4096 = 0.999755859375

m13 = (2^13 - 1) / 2^13 = 8191/8192 = 0.9998779296875

m14 = (2^14 - 1) / 2^14 = 16383/16384 = 0.99993896484375

m15 = (2^15 - 1) / 2^15 = 32767/32768 = 0.999969482421875

m16 = (2^16 - 1) / 2^16 = 65535/65536 = 0.9999847412109375

m17 = (2^17 - 1) / 2^17 = 131071/131072 = 0.99999237060546875

m18 = (2^18 - 1) / 2^18 = 262143/262144 = 0.999996185302734375

m19 = (2^19 - 1) / 2^19 = 524287/524288 = 0.9999980926513671875

m20 = (2^20 - 1) / 2^20 = 1048575/1048576 = 0.99999904632568359375

m21 = (2^21 - 1) / 2^21 = 2097151/2097152 = 0.999999523162841796875

m22 = (2^22 - 1) / 2^22 = 4194303/4194304 = 0.9999997615814208984375

m23 = (2^23 - 1) / 2^23 = 8388607/8388608 = 0.99999988079071044921875

m24 = (2^24 - 1) / 2^24 = 16777215/16777216 = 0.999999940395355224609375

m25 = (2^25 - 1) / 2^25 = 33554431/33554432 = 0.9999999701976776123046875

m26 = (2^26 - 1) / 2^26 = 67108863/67108864 = 0.99999998509883880615234375

m27 = (2^27 - 1) / 2^27 = 134217727/134217728 = 0.999999992549419403076171875

m28 = (2^28 - 1) / 2^28 = 268435455/268435456 = 0.9999999962747097015380859375

m29 = (2^29 - 1) / 2^29 = 536870911/536870912 = 0.99999999813735485076904296875

m30 = (2^30 - 1) / 2^30 = 1073741823/1073741824 = 0.999999999068677425384521484375

m31 = (2^31 - 1) / 2^31 = 2147483647/2147483648 = 0.9999999995343387126922607421875
上面的十进制值都是精确的
将三到五个乘数（上表中）相乘，最后用一个“大数字”得到最终结果
一个普通的查找表需要 32 个条目。一个包含两个乘数乘积的查找表需要 32^2 = 1024 个条目。一个包含三个乘数乘积的查找表需要 32^3 = 32768 个条目。四个乘数的查找表需要 32^4 = 1048576 个条目，通常太大，无法在当前处理器上高效缓存
前25个条目（m0到m24，含两端）可以精确表示，但后7个条目（m25到m31，含两端）无法精确表示，并且其求值结果……（原文在此处被截断）
/*
 * Accumulate, per byte lane, the product of (2^x - 1) / 2^x = 1 - 2^-x over
 * `len` rows of 32 exponent bytes, using AVX float operations only.
 *
 * ptr: first of `len` consecutive 32-byte rows; each byte is an exponent x.
 *      Every row is also read starting three bytes before its first byte, so
 *      up to three bytes in front of the array must be readable (add padding
 *      as needed). Those extra bytes are masked off and never affect results.
 * len: number of rows. Zero is accepted and performs no loads at all.
 *
 * The outputs are interleaved: result1 holds the products for bytes
 * {0,4,8,...} of the rows, result2 for bytes {1,5,9,...}, and so on.
 * Results are approximate because _mm256_rsqrt_ps is an approximation.
 * NOTE(review): x == 0 masks to 0.0f, whose reciprocal square root is
 * presumably +inf, so that lane would turn into -inf -- keep x >= 1.
 */
void compute(const unsigned char (*ptr)[32], size_t len) {
    /* Keeps the low six bits of the top byte of every 32-bit lane. */
    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x3F000000));
    /* Bit pattern of 2^127: multiplying by it cancels the exponent bias. */
    const __m256 normalize = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F000000));
    const __m256 offset = _mm256_set1_ps(1);

    __m256 result1 = _mm256_set1_ps(1);
    __m256 result2 = _mm256_set1_ps(1);
    __m256 result3 = _mm256_set1_ps(1);
    __m256 result4 = _mm256_set1_ps(1);

    /* Pre-tested loop: the former do/while ran once for len == 0 and then
     * wrapped --len around to SIZE_MAX, reading far out of bounds. */
    for (; len != 0; ++ptr, --len) {
        // Mask out every fourth byte into a separate variable using unaligned
        // loads to simulate 8-to-32 bit integer unpacking
        __m256 real1 = _mm256_loadu_ps((const float *) &ptr[0][-3]);
        __m256 real2 = _mm256_loadu_ps((const float *) &ptr[0][-2]);
        __m256 real3 = _mm256_loadu_ps((const float *) &ptr[0][-1]);
        __m256 real4 = _mm256_loadu_ps((const float *) &ptr[0][-0]);
        real1 = _mm256_and_ps(real1, mask);
        real2 = _mm256_and_ps(real2, mask);
        real3 = _mm256_and_ps(real3, mask);
        real4 = _mm256_and_ps(real4, mask);
        // The masked top bytes, interpreted as IEEE-754 exponent bytes, encode
        // 2^2x * 2^-BIAS: the byte sits one bit above the start of the
        // exponent field, hence the doubled exponent. Multiply the bias away
        // first, leaving 2^2x.
        real1 = _mm256_mul_ps(real1, normalize);
        real2 = _mm256_mul_ps(real2, normalize);
        real3 = _mm256_mul_ps(real3, normalize);
        real4 = _mm256_mul_ps(real4, normalize);
        // Use a fast approximate reciprocal square root to halve the exponent,
        // yielding ~1/2^x.
        // You'd think this case of the reciprocal lookup table would be
        // precise, yet it seems not to be. Perhaps twiddling the rounding
        // mode or biasing the values may make it so.
        real1 = _mm256_rsqrt_ps(real1);
        real2 = _mm256_rsqrt_ps(real2);
        real3 = _mm256_rsqrt_ps(real3);
        real4 = _mm256_rsqrt_ps(real4);
        // Compute (2^x-1)/2^x as 1-1/2^x
        real1 = _mm256_sub_ps(offset, real1);
        real2 = _mm256_sub_ps(offset, real2);
        real3 = _mm256_sub_ps(offset, real3);
        real4 = _mm256_sub_ps(offset, real4);
        // Finally multiply the running products
        result1 = _mm256_mul_ps(result1, real1);
        result2 = _mm256_mul_ps(result2, real2);
        result3 = _mm256_mul_ps(result3, real3);
        result4 = _mm256_mul_ps(result4, real4);
    }

    /*
     * Do something useful with result1..4 here
     */
}