C 如果您将一个大int值转换为float,会发生什么

C 如果您将一个大int值转换为float,会发生什么,c,bit-manipulation,C,Bit Manipulation,这是一个一般性的问题,当我使用GCC4.4将一个非常大/小的有符号整数转换为浮点时,会发生什么 我在做演员时看到一些奇怪的行为。以下是一些例子: 必须通过以下方法获得: float f = (float)x; unsigned int r; memcpy(&r, &f, sizeof(unsigned int)); ./btest -f float_i2f -1 0x80800001 input: 1000000010000000000000000000000

这是一个一般性的问题,当我使用GCC4.4将一个非常大/小的有符号整数转换为浮点时,会发生什么

我在进行类型转换(cast)时看到一些奇怪的行为。以下是一些例子:

下文中“MUST BE”一行的值是通过以下方法获得的:

float f = (float)x;
unsigned int r;
memcpy(&r, &f, sizeof(unsigned int));

./btest -f float_i2f -1 0x80800001
input:          10000000100000000000000000000001
absolute value: 01111111011111111111111111111111

exponent:       10011101
mantissa:       00000000011111101111111111111111  (right shifted absolute value)

EXPECT:         11001110111111101111111111111111  (sign|exponent|mantissa)
MUST BE:        11001110111111110000000000000000  (sign ok, exponent ok,
                                                     mantissa???)

./btest -f float_i2f -1 0x3f7fffe0

EXPECT:    01001110011111011111111111111111
MUST BE:   01001110011111100000000000000000

./btest -f float_i2f -1 0x80004999                                                                  


EXPECT:    11001110111111111111111101101100
MUST BE:   11001110111111111111111101101101    (<- 1 added at the end)
float f = (float)x;
unsigned int r;
memcpy(&r, &f, sizeof(unsigned int));
./btest -f float_i2f -1 0x80800001
输入:10000000100000000000000000000001
绝对值:01111111011111111111111111111111
指数:10011101
尾数:00000000011111101111111111111111(右移后的绝对值)
期望值:11001110111111101111111111111111(符号|指数|尾数)
必须是:11001110111111110000000000000000(符号ok,指数ok,
尾数??)
./btest -f float_i2f -1 0x3f7fffe0
期望值:01001110011111011111111111111111
必须是:01001110011111100000000000000000
./btest -f float_i2f -1 0x80004999
期望值:11001110111111111111111101101100
必须为:110011101111111111111101101101101(>31)&0x1;
/*x的绝对值*/
INTA=符号~x+1:x;
/*计算指数*/
int e=158;
int t=a;
而(!(t>>31)和0x1){
t 8)和((0x1>8>31)和0x1){
t 8)和((0x1>8 HOmasks[8]){
/*围捕*/
m+=1;
}else如果(S==HOmasks[8]){
/*四舍五入*/
m=m+(m&1);
}
/*在尾数中也设置指数的最后一位的特殊情况
*尾数本身是0*/

if(m&(0x132位
float
使用一些位作为指数,因此无法准确表示所有32位整数值

64位
双精度
可以精确存储任何32位整数值

Wikipedia有一个关于浮点的简要条目,以及许多关于浮点数内部表示的细节——当前的标准是IEEE 754:2008。它指出,32位浮点使用1位作为符号,8位作为指数,留下23位显式位和1位隐式位作为尾数,这就是为什么绝对值不超过2^24的整数都可以被精确表示。



我认为很明显,32位整数不能准确地存储到32位浮点中。我的问题是:如果我存储一个大于2^24或小于-2^24的整数,会发生什么?我如何复制它

一旦绝对值大于2^24,整数值就不能精确地表示在32位浮点尾数的24个有效二进制位中,因此只有前导的24位是可靠的。浮点舍入也会起作用。

您可以使用类似以下代码进行演示: #include <inttypes.h> #include <stdio.h>

/*
 * Overlay of a 32-bit unsigned integer and a float sharing the same storage,
 * used to read the bit pattern of a float back as an integer.
 */
union Ufloat
{
    uint32_t    i;  /* the storage viewed as raw bits */
    float       f;  /* the same storage viewed as a float */
};
typedef union Ufloat Ufloat;

static void dump_value(uint32_t i, uint32_t v)
{
    Ufloat u = { .i = v };
    printf("0x%.8" PRIX32 ": 0x%.8" PRIX32 " = %15.7e = %15.6A\n", i, v, u.f, u.f);
}

/*
 * Convert a series of unsigned integers to float and print, for each one,
 * the integer next to the bit pattern of the float it became.
 */
int main(void)
{
    Ufloat conv;
    uint32_t value;

    /* Single-bit values 2^23, 2^24, ... 2^27 (the loop stops before 2^28). */
    const uint32_t pow_first = 1 << 23;
    const uint32_t pow_limit = 1 << 28;
    value = pow_first;
    while (value < pow_limit)
    {
        conv.f = value;     /* the integer -> float conversion being shown */
        dump_value(value, conv.i);
        value <<= 1;
    }

    /* 64 consecutive integers, starting 16 below 2^24. */
    const uint32_t run_first = (1 << 24) - 16;
    const uint32_t run_limit = run_first + 64;
    value = run_first;
    while (value < run_limit)
    {
        conv.f = value;
        dump_value(value, conv.i);
        value++;
    }

    return 0;
}
输出的第一部分表明,某些整数值仍然可以精确存储;特别是,2的幂可以精确存储。事实上,更精确(但不那么简洁)的是,任何整数,其中绝对值的二进制表示不超过24个有效位(任何尾随数字都是零)可以精确地表示。值不一定能够精确地打印,但这与精确地存储值是分开的

输出的第二个(较大)部分表明,在2^24-1之前,整数值都可以精确表示。2^24本身的值也可以精确表示,但2^24+1不能,因此它看起来与2^24相同。相比之下,2^24+2可以仅用24个二进制数字后跟1个零来表示,因此可以精确表示。对于大于2的增量,依此类推。看起来“舍入到偶数”模式在起作用;这就是结果先显示1个值、然后连续显示3个相同值的原因。


(顺便提一下,我注意到没有办法指明:传递给 printf() 的 double——它是根据默认参数提升规则(ISO/IEC 9899:2011 §6.5.2.2 函数调用,¶6)由 float 转换而来的——应当按 float 打印;逻辑上可以使用 h 修饰符,但标准并没有定义它。)

32位的
浮点
使用一些位作为指数,因此无法准确表示所有32位整数值

64位
双精度
可以精确存储任何32位整数值

Wikipedia有一个关于浮点的简要条目,以及许多关于浮点数内部表示的细节——当前的标准是IEEE 754:2008。它指出,32位浮点使用1位作为符号,8位作为指数,留下23位显式位和1位隐式位作为尾数,这就是为什么绝对值不超过2^24的整数都可以被精确表示。



我认为很明显,32位整数不能准确地存储到32位浮点中。我的问题是:如果我存储一个大于2^24或小于-2^24的整数,会发生什么?我如何复制它

一旦绝对值大于2^24,整数值就不能精确地表示在32位浮点尾数的24个有效二进制位中,因此只有前导的24位是可靠的。浮点舍入也会起作用。

您可以使用类似以下代码进行演示: #include <inttypes.h> #include <stdio.h>

/*
 * Overlay of a 32-bit unsigned integer and a float sharing the same storage,
 * used to read the bit pattern of a float back as an integer.
 */
union Ufloat
{
    uint32_t    i;  /* the storage viewed as raw bits */
    float       f;  /* the same storage viewed as a float */
};
typedef union Ufloat Ufloat;

static void dump_value(uint32_t i, uint32_t v)
{
    Ufloat u = { .i = v };
    printf("0x%.8" PRIX32 ": 0x%.8" PRIX32 " = %15.7e = %15.6A\n", i, v, u.f, u.f);
}

/*
 * Convert a series of unsigned integers to float and print, for each one,
 * the integer next to the bit pattern of the float it became.
 */
int main(void)
{
    Ufloat conv;
    uint32_t value;

    /* Single-bit values 2^23, 2^24, ... 2^27 (the loop stops before 2^28). */
    const uint32_t pow_first = 1 << 23;
    const uint32_t pow_limit = 1 << 28;
    value = pow_first;
    while (value < pow_limit)
    {
        conv.f = value;     /* the integer -> float conversion being shown */
        dump_value(value, conv.i);
        value <<= 1;
    }

    /* 64 consecutive integers, starting 16 below 2^24. */
    const uint32_t run_first = (1 << 24) - 16;
    const uint32_t run_limit = run_first + 64;
    value = run_first;
    while (value < run_limit)
    {
        conv.f = value;
        dump_value(value, conv.i);
        value++;
    }

    return 0;
}
输出的第一部分表明,某些整数值仍然可以精确存储;特别是,2的幂可以精确存储。事实上,更精确(但不那么简洁)的是,任何整数,其中绝对值的二进制表示不超过24个有效位(任何尾随数字都是零)可以精确地表示。值不一定能够精确地打印,但这与精确地存储值是分开的

输出的第二个(较大)部分表明,在2^24-1之前,整数值可以精确表示。2^24本身的值也可以精确表示,但2^24+1不能,因此它看起来与2^24相同。相反,2^24+2
/*
 * Overlay of a 32-bit unsigned integer and a float sharing the same storage,
 * used to read the bit pattern of a float back as an integer.
 */
union Ufloat
{
    uint32_t    i;  /* the storage viewed as raw bits */
    float       f;  /* the same storage viewed as a float */
};
typedef union Ufloat Ufloat;

static void dump_value(uint32_t i, uint32_t v)
{
    Ufloat u = { .i = v };
    printf("0x%.8" PRIX32 ": 0x%.8" PRIX32 " = %15.7e = %15.6A\n", i, v, u.f, u.f);
}

/*
 * Convert a series of unsigned integers to float and print, for each one,
 * the integer next to the bit pattern of the float it became.
 */
int main(void)
{
    Ufloat conv;
    uint32_t value;

    /* Single-bit values 2^23, 2^24, ... 2^27 (the loop stops before 2^28). */
    const uint32_t pow_first = 1 << 23;
    const uint32_t pow_limit = 1 << 28;
    value = pow_first;
    while (value < pow_limit)
    {
        conv.f = value;     /* the integer -> float conversion being shown */
        dump_value(value, conv.i);
        value <<= 1;
    }

    /* 64 consecutive integers, starting 16 below 2^24. */
    const uint32_t run_first = (1 << 24) - 16;
    const uint32_t run_limit = run_first + 64;
    value = run_first;
    while (value < run_limit)
    {
        conv.f = value;
        dump_value(value, conv.i);
        value++;
    }

    return 0;
}
0x00800000: 0x4B000000 =   8.3886080e+06 =  0X1.000000P+23
0x01000000: 0x4B800000 =   1.6777216e+07 =  0X1.000000P+24
0x02000000: 0x4C000000 =   3.3554432e+07 =  0X1.000000P+25
0x04000000: 0x4C800000 =   6.7108864e+07 =  0X1.000000P+26
0x08000000: 0x4D000000 =   1.3421773e+08 =  0X1.000000P+27
0x00FFFFF0: 0x4B7FFFF0 =   1.6777200e+07 =  0X1.FFFFE0P+23
0x00FFFFF1: 0x4B7FFFF1 =   1.6777201e+07 =  0X1.FFFFE2P+23
0x00FFFFF2: 0x4B7FFFF2 =   1.6777202e+07 =  0X1.FFFFE4P+23
0x00FFFFF3: 0x4B7FFFF3 =   1.6777203e+07 =  0X1.FFFFE6P+23
0x00FFFFF4: 0x4B7FFFF4 =   1.6777204e+07 =  0X1.FFFFE8P+23
0x00FFFFF5: 0x4B7FFFF5 =   1.6777205e+07 =  0X1.FFFFEAP+23
0x00FFFFF6: 0x4B7FFFF6 =   1.6777206e+07 =  0X1.FFFFECP+23
0x00FFFFF7: 0x4B7FFFF7 =   1.6777207e+07 =  0X1.FFFFEEP+23
0x00FFFFF8: 0x4B7FFFF8 =   1.6777208e+07 =  0X1.FFFFF0P+23
0x00FFFFF9: 0x4B7FFFF9 =   1.6777209e+07 =  0X1.FFFFF2P+23
0x00FFFFFA: 0x4B7FFFFA =   1.6777210e+07 =  0X1.FFFFF4P+23
0x00FFFFFB: 0x4B7FFFFB =   1.6777211e+07 =  0X1.FFFFF6P+23
0x00FFFFFC: 0x4B7FFFFC =   1.6777212e+07 =  0X1.FFFFF8P+23
0x00FFFFFD: 0x4B7FFFFD =   1.6777213e+07 =  0X1.FFFFFAP+23
0x00FFFFFE: 0x4B7FFFFE =   1.6777214e+07 =  0X1.FFFFFCP+23
0x00FFFFFF: 0x4B7FFFFF =   1.6777215e+07 =  0X1.FFFFFEP+23
0x01000000: 0x4B800000 =   1.6777216e+07 =  0X1.000000P+24
0x01000001: 0x4B800000 =   1.6777216e+07 =  0X1.000000P+24
0x01000002: 0x4B800001 =   1.6777218e+07 =  0X1.000002P+24
0x01000003: 0x4B800002 =   1.6777220e+07 =  0X1.000004P+24
0x01000004: 0x4B800002 =   1.6777220e+07 =  0X1.000004P+24
0x01000005: 0x4B800002 =   1.6777220e+07 =  0X1.000004P+24
0x01000006: 0x4B800003 =   1.6777222e+07 =  0X1.000006P+24
0x01000007: 0x4B800004 =   1.6777224e+07 =  0X1.000008P+24
0x01000008: 0x4B800004 =   1.6777224e+07 =  0X1.000008P+24
0x01000009: 0x4B800004 =   1.6777224e+07 =  0X1.000008P+24
0x0100000A: 0x4B800005 =   1.6777226e+07 =  0X1.00000AP+24
0x0100000B: 0x4B800006 =   1.6777228e+07 =  0X1.00000CP+24
0x0100000C: 0x4B800006 =   1.6777228e+07 =  0X1.00000CP+24
0x0100000D: 0x4B800006 =   1.6777228e+07 =  0X1.00000CP+24
0x0100000E: 0x4B800007 =   1.6777230e+07 =  0X1.00000EP+24
0x0100000F: 0x4B800008 =   1.6777232e+07 =  0X1.000010P+24
0x01000010: 0x4B800008 =   1.6777232e+07 =  0X1.000010P+24
0x01000011: 0x4B800008 =   1.6777232e+07 =  0X1.000010P+24
0x01000012: 0x4B800009 =   1.6777234e+07 =  0X1.000012P+24
0x01000013: 0x4B80000A =   1.6777236e+07 =  0X1.000014P+24
0x01000014: 0x4B80000A =   1.6777236e+07 =  0X1.000014P+24
0x01000015: 0x4B80000A =   1.6777236e+07 =  0X1.000014P+24
0x01000016: 0x4B80000B =   1.6777238e+07 =  0X1.000016P+24
0x01000017: 0x4B80000C =   1.6777240e+07 =  0X1.000018P+24
0x01000018: 0x4B80000C =   1.6777240e+07 =  0X1.000018P+24
0x01000019: 0x4B80000C =   1.6777240e+07 =  0X1.000018P+24
0x0100001A: 0x4B80000D =   1.6777242e+07 =  0X1.00001AP+24
0x0100001B: 0x4B80000E =   1.6777244e+07 =  0X1.00001CP+24
0x0100001C: 0x4B80000E =   1.6777244e+07 =  0X1.00001CP+24
0x0100001D: 0x4B80000E =   1.6777244e+07 =  0X1.00001CP+24
0x0100001E: 0x4B80000F =   1.6777246e+07 =  0X1.00001EP+24
0x0100001F: 0x4B800010 =   1.6777248e+07 =  0X1.000020P+24
0x01000020: 0x4B800010 =   1.6777248e+07 =  0X1.000020P+24
0x01000021: 0x4B800010 =   1.6777248e+07 =  0X1.000020P+24
0x01000022: 0x4B800011 =   1.6777250e+07 =  0X1.000022P+24
0x01000023: 0x4B800012 =   1.6777252e+07 =  0X1.000024P+24
0x01000024: 0x4B800012 =   1.6777252e+07 =  0X1.000024P+24
0x01000025: 0x4B800012 =   1.6777252e+07 =  0X1.000024P+24
0x01000026: 0x4B800013 =   1.6777254e+07 =  0X1.000026P+24
0x01000027: 0x4B800014 =   1.6777256e+07 =  0X1.000028P+24
0x01000028: 0x4B800014 =   1.6777256e+07 =  0X1.000028P+24
0x01000029: 0x4B800014 =   1.6777256e+07 =  0X1.000028P+24
0x0100002A: 0x4B800015 =   1.6777258e+07 =  0X1.00002AP+24
0x0100002B: 0x4B800016 =   1.6777260e+07 =  0X1.00002CP+24
0x0100002C: 0x4B800016 =   1.6777260e+07 =  0X1.00002CP+24
0x0100002D: 0x4B800016 =   1.6777260e+07 =  0X1.00002CP+24
0x0100002E: 0x4B800017 =   1.6777262e+07 =  0X1.00002EP+24
0x0100002F: 0x4B800018 =   1.6777264e+07 =  0X1.000030P+24
weights:    128 64 32 16 8 4 2 1
binary num:   0  0  0  0 1 1 1 1
weights:      x x x 128 64 32 16 8 | 4 2 1
binary num:   0 0 0   0  0  0  0 1 | 1 1 1
/*
 * Sketch of rounding a value that is about to be shifted right by
 * bitsToShift bits: the bits that the shift discards (the guard bits) are
 * compared against the halfway value for that shift width to choose between
 * three cases. The bodies of the three cases are left as placeholders.
 */
unsigned number;          //our number
unsigned bitsToShift;     //number of bits to shift

assert(bitsToShift < 8);  //8 bits

/* guardMasks[k] keeps the k low bits that a shift by k discards (2^k - 1).
 * All eight entries are listed so that a shift by 7 uses mask 0x7f. */
unsigned guardMasks[8] = {0, 1, 3, 7, 0xf, 0x1f, 0x3f, 0x7f};
/* For k >= 1, LOvalues[k] is 2^(k-1): the guard-bit value of an exact tie. */
unsigned LOvalues[8] = {0, 1, 2, 4, 0x8, 0x10, 0x20, 0x40}; //divided by 2 for faster comparison

unsigned guardBits = number & guardMasks[bitsToShift]; //value of the guard bits
number = number >> bitsToShift;

if(guardBits > LOvalues[bitsToShift]) {
/* discarded bits are above the halfway value */
...
} else if (guardBits == LOvalues[bitsToShift]) {
/* discarded bits are exactly the halfway value */
...
} else { //guardBits < LOvalues[bitsToShift]
/* discarded bits are below the halfway value */
...
}