Assembly 可以用imul指令执行多精度有符号乘法吗？_Assembly_X86 64_Multiplication_Multiprecision

Assembly 可以用imul指令执行多精度有符号乘法吗？

assembly

Assembly 可以用imul指令执行多精度有符号乘法吗？,assembly,x86-64,multiplication,multiprecision,Assembly,X86 64,Multiplication,Multiprecision,我正在编写一个函数库，为有符号整数类型s0128，s0256，s0512，s1024和浮点类型f0128，f0256，f0512，f1024提供所有常规运算符和函数我现在正在编写s0128，s0256，s0512，s1024乘法例程，但得到的错误结果让我感到困惑。我假设我可以与64位imul rcx指令（在rdx:rax中产生128位结果）级联乘法，就像我可以与mul rcx指令的无符号操作数级联乘法一样。。。但是带有imul的答案是错误的我怀疑有什么诀窍可以让这一切顺利进行，也许是混合了i

我正在编写一个函数库，为有符号整数类型

s0128

，

s0256

，

s0512

，

s1024

和浮点类型

f0128

，

f0256

，

f0512

，

f1024

提供所有常规运算符和函数

我现在正在编写

s0128

，

s0256

，

s0512

，

s1024

乘法例程，但得到的错误结果让我感到困惑。我假设我可以与64位

imul rcx

指令（在

rdx:rax

中产生128位结果）级联乘法，就像我可以与

mul rcx

指令的无符号操作数级联乘法一样。。。但是带有

imul

的答案是错误的

我怀疑有什么诀窍可以让这一切顺利进行，也许是混合了

imul

和

mul

指令——或者别的什么。还是有什么原因不能用有符号乘法指令实现更大的乘法

为了让您了解这项技术，我将描述最小的版本，用于

s0128

操作数

           arg2.1   arg2.0  : two 64-bit parts of s0128 operand
           arg1.1   arg1.0  : two 64-bit parts of s0128 operand
           ---------------
       0  out.rdx  out.rax  : output of arg1.0 * arg2.0
 out.rdx  out.rax           : output of arg1.0 * arg2.1
 -------------------------
 out.2    out.1    out.0    : sum the above intermediate results
 out.rdx  out.rax           : output of arg1.1 * arg2.0
 -------------------------
 out.2    out.1    out.0    : sum the above intermediate results

每次代码将两个64位值相乘时，它都会在

rdx:rax

中生成一个128位的结果。每次代码生成128位结果时，它将该结果与

addq

、

adcq

、

adcq

指令（其中最终的

adcq

指令仅加零，以确保进位标志得到传播）累加到由三个64位寄存器组成的三元组中

作为测试，当我将小负数乘以小正数时，结果为负数，但在128位

s0128

结果中，高64位的最低几位里出现了一两个非零位。这在我看来意味着多精度有符号乘法中的传播有些地方不对

#include <stdint.h>
#include <stdio.h>

/*
 * Multiply two signed 32-bit integers into a signed 64-bit product using
 * only 16x16-bit partial products.
 *
 * Each operand is split as  value = hi * 65536 + lo  where `lo` is an
 * unsigned 16-bit quantity (0..65535) and `hi` is a signed 16-bit quantity
 * (-32768..32767).  The four partial products are then recombined at their
 * respective weights.  The same scheme applies when building a wider signed
 * multiply out of narrower hardware multiplies.
 *
 * Returns the exact product x * y.
 */
int64_t mul32x32( int32_t x, int32_t y )
{
    /* Low halves: plain unsigned magnitudes. */
    const uint16_t x_lo = (uint16_t)(x & 0xFFFF);
    const uint16_t y_lo = (uint16_t)(y & 0xFFFF);

    /*
     * High halves: signed.  (value - lo) is an exact multiple of 65536, so
     * the division is exact.  This avoids right-shifting a negative value
     * and narrowing an out-of-range value to int16_t, both of which are
     * implementation-defined.
     */
    const int32_t x_hi = (x - (int32_t)x_lo) / 65536;
    const int32_t y_hi = (y - (int32_t)y_lo) / 65536;

    const uint32_t lo_lo = (uint32_t)x_lo * y_lo;   /* unsigned x unsigned */
    const int32_t  lo_hi = (int32_t)x_lo * y_hi;    /* unsigned x   signed */
    const int32_t  hi_lo = x_hi * (int32_t)y_lo;    /*   signed x unsigned */
    const int32_t  hi_hi = x_hi * y_hi;             /*   signed x   signed */

    /*
     * Scale with multiplication rather than '<<': left-shifting a negative
     * signed value is undefined behaviour in C, and the mixed-sign partial
     * products can be negative.  Converting each term to int64_t performs
     * the zero extension (lo_lo) or sign extension (the others).
     */
    return (int64_t)lo_lo
         + ((int64_t)lo_hi + hi_lo) * INT64_C(65536)
         + (int64_t)hi_hi * INT64_C(4294967296);
}

/*
 * Compare mul32x32(a, b) against the native 64-bit product of a and b.
 *
 * Returns 0 when both agree.  On a mismatch, prints the operands, the
 * reference product and the tested product in hexadecimal, and returns 1.
 */
int check(int a, int b)
{
    const int64_t expected = (int64_t)a * (int64_t)b;
    const int64_t actual   = mul32x32(a, b);

    if (expected == actual)
    {
        return 0;
    }

    printf("%.8X x %.8X => %.16llX vs %.16llX\n",
           (unsigned int)a,
           (unsigned int)b,
           (unsigned long long)expected,
           (unsigned long long)actual);
    return 1;
}


/*
 * Run check() on every unordered pair (including each value with itself)
 * drawn from four fixed test operands, then print how many pairs failed.
 * Always returns 0.
 */
int main()
{
    /* Operands mixing set and clear sign bits in the high and low halves. */
    const int operands[] = {
        (int)0xABCDEF01,
        (int)0x12345678,
        (int)0x1234EF01,
        (int)0xABCD5678,
    };
    const int count = (int)(sizeof operands / sizeof operands[0]);
    int failures = 0;

    /* Upper-triangular walk: (0,0) (0,1) ... (0,3) (1,1) ... (3,3). */
    for (int i = 0; i < count; i++)
    {
        for (int j = i; j < count; j++)
        {
            failures += check(operands[i], operands[j]);
        }
    }

    printf("%d tests failed\n", failures);
    return 0;
}

当然，对于

s0256

、

s0512

、

s1024

，级联的范围要大得多

我错过了什么？我必须将两个操作数都转换为无符号，执行无符号乘法，然后如果其中一个（但不是全部）操作数为负数，则对结果求反吗？或者我可以用有符号乘法指令计算多精度结果吗？

当你用较小的乘法构建扩展精度有符号乘法时，你最终得到的是有符号和无符号算术的混合

特别是，如果将有符号值一分为二，则将上半部分视为有符号，下半部分视为无符号。事实上，扩展精度加法也是如此

考虑这个任意示例，其中

AH

和

AL

表示

A 的高、低两半，而

BH

和

BL

表示

B 的高、低两半。（注意：它们并不表示x86寄存器的一半，只是被乘数的一半。）

L 项是无符号的，

H 项是有符号的

              AH : AL
           x  BH : BL
  -------------------
              AL * BL    unsigned x unsigned => zero extend to full precision
         AH * BL           signed x unsigned => sign extend to full precision
         AL * BH         unsigned x   signed => sign extend to full precision
    AH * BH                signed x   signed

AL*BL

乘积是无符号的，因为AL和BL都是无符号的。因此，当把它提升到结果的完整精度时，应进行零扩展

AL*BH

和

AH*BL

乘积混合了有符号和无符号值。所得乘积是有符号的，当把它提升到结果的完整精度时，需要进行符号扩展

下面的C代码演示了用16×16乘法实现的32×32乘法。用64×64乘法构建128×128乘法时，同样的原则也适用

#include <stdint.h>
#include <stdio.h>

/*
 * Multiply two signed 32-bit integers into a signed 64-bit product using
 * only 16x16-bit partial products.
 *
 * Each operand is split as  value = hi * 65536 + lo  where `lo` is an
 * unsigned 16-bit quantity (0..65535) and `hi` is a signed 16-bit quantity
 * (-32768..32767).  The four partial products are then recombined at their
 * respective weights.  The same scheme applies when building a wider signed
 * multiply out of narrower hardware multiplies.
 *
 * Returns the exact product x * y.
 */
int64_t mul32x32( int32_t x, int32_t y )
{
    /* Low halves: plain unsigned magnitudes. */
    const uint16_t x_lo = (uint16_t)(x & 0xFFFF);
    const uint16_t y_lo = (uint16_t)(y & 0xFFFF);

    /*
     * High halves: signed.  (value - lo) is an exact multiple of 65536, so
     * the division is exact.  This avoids right-shifting a negative value
     * and narrowing an out-of-range value to int16_t, both of which are
     * implementation-defined.
     */
    const int32_t x_hi = (x - (int32_t)x_lo) / 65536;
    const int32_t y_hi = (y - (int32_t)y_lo) / 65536;

    const uint32_t lo_lo = (uint32_t)x_lo * y_lo;   /* unsigned x unsigned */
    const int32_t  lo_hi = (int32_t)x_lo * y_hi;    /* unsigned x   signed */
    const int32_t  hi_lo = x_hi * (int32_t)y_lo;    /*   signed x unsigned */
    const int32_t  hi_hi = x_hi * y_hi;             /*   signed x   signed */

    /*
     * Scale with multiplication rather than '<<': left-shifting a negative
     * signed value is undefined behaviour in C, and the mixed-sign partial
     * products can be negative.  Converting each term to int64_t performs
     * the zero extension (lo_lo) or sign extension (the others).
     */
    return (int64_t)lo_lo
         + ((int64_t)lo_hi + hi_lo) * INT64_C(65536)
         + (int64_t)hi_hi * INT64_C(4294967296);
}

/*
 * Compare mul32x32(a, b) against the native 64-bit product of a and b.
 *
 * Returns 0 when both agree.  On a mismatch, prints the operands, the
 * reference product and the tested product in hexadecimal, and returns 1.
 */
int check(int a, int b)
{
    const int64_t expected = (int64_t)a * (int64_t)b;
    const int64_t actual   = mul32x32(a, b);

    if (expected == actual)
    {
        return 0;
    }

    printf("%.8X x %.8X => %.16llX vs %.16llX\n",
           (unsigned int)a,
           (unsigned int)b,
           (unsigned long long)expected,
           (unsigned long long)actual);
    return 1;
}


/*
 * Run check() on every unordered pair (including each value with itself)
 * drawn from four fixed test operands, then print how many pairs failed.
 * Always returns 0.
 */
int main()
{
    /* Operands mixing set and clear sign bits in the high and low halves. */
    const int operands[] = {
        (int)0xABCDEF01,
        (int)0x12345678,
        (int)0x1234EF01,
        (int)0xABCD5678,
    };
    const int count = (int)(sizeof operands / sizeof operands[0]);
    int failures = 0;

    /* Upper-triangular walk: (0,0) (0,1) ... (0,3) (1,1) ... (3,3). */
    for (int i = 0; i < count; i++)
    {
        for (int j = i; j < count; j++)
        {
            failures += check(operands[i], operands[j]);
        }
    }

    printf("%d tests failed\n", failures);
    return 0;
}

由于混合符号性和符号扩展处理起来很麻烦，通常更容易的做法是把有符号×有符号乘法实现为无符号×无符号乘法，如果两个乘数的符号不同，则在最后有条件地对结果求反。（事实上，当你做到扩展精度浮点值时，只要像IEEE-754那样采用符号-数值（sign-magnitude）表示，就不必处理有符号乘法。）

另一个回答演示了如何高效地对扩展精度值求反。（虽然有点偏题，但您可能会觉得它有趣或有用。）

当您用较小的乘法构建扩展精度有符号乘法时，最终将得到有符号和无符号算术的混合