C 整数立方根_C_Optimization_Math_Gcc_Numerical Analysis

C 整数立方根

c optimization math gcc

C 整数立方根,c,optimization,math,gcc,numerical-analysis,C,Optimization,Math,Gcc,Numerical Analysis,我正在寻找64位（无符号）立方根的快速代码。（我正在使用C并使用gcc进行编译，但我认为所需的大部分工作都与语言和编译器无关。）我将用ulong表示一个64位未命名整数给定一个输入n，我要求（积分）返回值r为 r * r * r <= n && n < (r + 1) * (r + 1) * (r + 1) 是不正确的，因为向范围的末尾舍入。简单的代码类 return (ulong)pow(n, 1.0/3); ulong cuberoot(ulong n) {

我正在寻找64位（无符号）立方根的快速代码。（我正在使用C并使用gcc进行编译，但我认为所需的大部分工作都与语言和编译器无关。）我将用ulong表示一个64位无符号整数。

给定一个输入n，我要求（积分）返回值r为

r * r * r <= n && n < (r + 1) * (r + 1) * (r + 1)

像下面这样的简单代码是不正确的，因为在取值范围的末端附近会出现舍入误差：

return (ulong)pow(n, 1.0/3);

/*
 * Integer cube root: returns the largest r with r*r*r <= n.
 *
 * A floating-point estimate from pow() is used as the starting point.
 * For small n the estimate is returned as-is; for larger n it is
 * nudged up or down until it brackets n exactly.
 */
ulong
cuberoot(ulong n)
{
    ulong guess = pow(n + 0.5, 1.0/3);

    /* Up to 10^14 the pow() estimate is returned without correction. */
    if (n <= 100000000000000ULL)
        return guess;

    /* 18446724184312856125 == 2642245^3; answer inputs at or above it
     * directly so that the (guess + 1)^3 probe below is never needed there. */
    if (n >= 18446724184312856125ULL)
        return 2642245ULL;

    if (guess * guess * guess <= n) {
        /* Estimate is not too high: climb while the next cube still fits. */
        for (;;) {
            ulong next = guess + 1;
            if (next * next * next > n)
                return guess;
            guess = next;
        }
    }

    /* Estimate overshot: step down until its cube no longer exceeds n. */
    do {
        --guess;
    } while (guess * guess * guess > n);
    return guess;
}

《黑客的喜悦》一书中有解决这一问题和许多其他问题的算法。代码是在线的。编辑：该代码在64位整数中无法正常工作，本书中关于如何修复64位整数的说明有些混乱。正确的64位实现（包括测试用例）处于联机状态

我怀疑你的

平方根

函数是否“正确”工作-对于参数，它应该是

ulong a

，而不是

：）（但是同样的方法可以使用

cbrt

而不是

sqrt

，尽管不是所有的C数学库都有立方根函数），然后将其转化为计算机算法，以2为基数，而不是以10为基数

我们最终得到一个类似（伪代码）的算法：

找到最大的n，这样（1您可以尝试牛顿步来修正舍入误差：
/* Fragment of a function body: refine a pow()-based cube-root estimate of n
 * with a single Newton-style correction. `n` comes from the enclosing code. */
ulong r = (ulong)pow(n, 1.0/3);
if(r==0) return r; /* avoid divide by 0 later on */
ulong r3 = r*r*r;       /* cube of the current estimate */
ulong slope = 3*r*r;    /* derivative of r^3 at r, used as the divisor below */

ulong r1 = r+1;
ulong r13 = r1*r1*r1;   /* cube of the next integer above the estimate */

/* making sure to handle unsigned arithmetic correctly */
/* each subtraction is only evaluated on the side where it cannot go negative */
if(n >= r13) r+= (n - r3)/slope;   /* estimate too low: move up */
if(n < r3)   r-= (r3 - n)/slope;   /* estimate too high: move down */

ulong r = (ulong)pow(n, 1.0/3);
if (r == 0) return r; /* 避免稍后除以0 */
ulong r3 = r*r*r;
ulong slope = 3*r*r;
ulong r1 = r+1;
ulong r13 = r1*r1*r1;
/* 确保正确处理无符号算术 */
if (n >= r13) r += (n - r3)/slope;
if (n < r3)   r -= (r3 - n)/slope;

一个牛顿步骤应该足够了，但您可能会有一个（或更多？）错误。您可以使用最终检查和增量步骤检查/修复这些错误，如在OQ中：
/* Final fix-up of the estimate r: step down while r^3 exceeds n ... */
while(r*r*r > n) --r;
/* ... then step up while the next integer's cube still fits in n. */
while((r+1)*(r+1)*(r+1) <= n) ++r;

while (r*r*r > n) --r;
while ((r+1)*(r+1)*(r+1) <= n) ++r;
如果pow太贵，您可以使用计数前导零指令获得结果的近似值，然后使用查找表，然后使用一些牛顿步骤来完成它
// Fragment: derive a lookup-table index from the bit length and leading bits of n.
// NOTE(review): assumes n is a non-zero unsigned 64-bit value; the
// count-leading-zeros builtins are undefined for 0 -- confirm at the call site.
int k = __builtin_clzll(n); // 64-bit leading-zero count (__builtin_clz only handles unsigned int)
int b = 64 - k;             // # of significant bits in n
// Top 8 significant bits of n, aligned so that bit 7 is always set. Values
// narrower than 8 bits are shifted left instead of using a negative shift count.
int top8 = (b >= 8) ? (int)(n >> (b - 8)) : (int)(n << (8 - b));
int approx = table[b][top8 & 0x7f]; // leading 1 bit is implied, so only the low 7 bits index the row

// On my pc: Math.Sqrt 35 ns, cbrt64 <70ns, cbrt32 <25 ns, (cbrt12 < 10ns)

// cbrt64(ulong x) is a C# version of:
// http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt     (acbrt1)

// cbrt32(uint x) is a C# version of:
// http://www.hackersdelight.org/hdcodetxt/icbrt.c.txt     (icbrt1)

// Union in C#:
// http://www.hanselman.com/blog/UnionsOrAnEquivalentInCSairamasTipOfTheDay.aspx

using System.Runtime.InteropServices;  
/// <summary>
/// Overlays a <c>float</c> and a <c>uint</c> on the same four bytes so the bit
/// pattern of one can be read or written through the other (float &lt;==&gt; uint).
/// </summary>
[StructLayout(LayoutKind.Explicit)]
public struct fu_32
{
    /// <summary>Floating-point view of the storage.</summary>
    [FieldOffset(0)] public float f;

    /// <summary>Unsigned-integer view of the same storage.</summary>
    [FieldOffset(0)] public uint u;
}

/// <summary>
/// Integer cube root of a 64-bit unsigned value: returns the largest y with
/// y*y*y &lt;= x. A single-precision estimate is produced first and then
/// corrected with exact integer arithmetic.
/// </summary>
/// <param name="x">Value whose cube root is wanted.</param>
/// <returns>The floor of the cube root of <paramref name="x"/>.</returns>
private static uint cbrt64(ulong x)
{
    // 18446724184312856125 is 2642245 cubed (see the clamped branch below);
    // everything at or above it has root 2642245.
    if (x >= 18446724184312856125) return 2642245;
    float fx = (float)x;
    // Reinterpret the float's bits as an integer through the overlay struct.
    fu_32 fu32 = new fu_32();
    fu32.f = fx;
    // u/4 * (1 + 1/4) * (1 + 1/16) * (1 + 1/256) is roughly u/3, computed without a divide.
    uint uy = fu32.u / 4;
    uy += uy / 4;
    uy += uy / 16;
    uy += uy / 256;
    // Additive constant taken from the referenced acbrt1 routine.
    uy += 0x2a5137a0;
    fu32.u = uy;
    float fy = fu32.f;   // rough floating-point cube-root estimate
    // First Newton iteration for cube root: y <- (x / y^2 + 2y) / 3.
    fy = 0.33333333f * (fx / (fy * fy) + 2.0f * fy);
    // Second Newton iteration, truncated to an integer.
    int y0 = (int)                                      
        (0.33333333f * (fx / (fy * fy) + 2.0f * fy));    
    uint y1 = (uint)y0;                                 

    // y2 and y3 track y1 squared and y1 cubed exactly in 64-bit arithmetic.
    ulong y2, y3;
    if (y1 >= 2642245)
    {
        // Clamp the estimate and use the precomputed square and cube.
        y1 = 2642245;
        y2 = 6981458640025;
        y3 = 18446724184312856125;
    }
    else
    {
        y2 = (ulong)y1 * y1;
        y3 = y2 * y1;
    }
    if (y3 > x)
    {
        // Estimate too high: walk down one step at a time, updating the square
        // and cube incrementally. Order matters: y1 first, then y2, then y3.
        y1 -= 1;
        y2 -= 2 * y1 + 1;
        y3 -= 3 * y2 + 3 * y1 + 1;
        while (y3 > x)
        {
            y1 -= 1;
            y2 -= 2 * y1 + 1;
            y3 -= 3 * y2 + 3 * y1 + 1;
        }
        return y1;
    }
    // Estimate not too high: walk up until the cube exceeds x. Order matters
    // here as well: y3 uses the old y2 and y1, y2 uses the old y1.
    do
    {
        y3 += 3 * y2 + 3 * y1 + 1;
        y2 += 2 * y1 + 1;
        y1 += 1;
    }
    while (y3 <= x);
    // The loop overshoots by exactly one.
    return y1 - 1;
}

/// <summary>
/// Integer cube root of a 32-bit unsigned value, built one result bit at a
/// time from the most significant bit down.
/// </summary>
/// <param name="x">Value whose cube root is wanted.</param>
/// <returns>The largest y with y*y*y &lt;= x.</returns>
private static uint cbrt32(uint x)
{
    // Starting shift: the smallest multiple of 3 (capped at 30) whose
    // 3-bit group still covers the highest set bit of x.
    int shift = 0;
    while (shift < 30 && x >= 1u << (shift + 3))
    {
        shift += 3;
    }

    uint root = 0;     // result bits found so far
    uint rootSq = 0;   // kept equal to root * root
    for (; shift >= 0; shift -= 3)
    {
        root *= 2;
        rootSq *= 4;

        // Amount the remainder must cover for the next result bit to be 1.
        uint trial = 3 * root + 3 * rootSq + 1 << shift;
        if (x < trial)
        {
            continue;
        }

        x -= trial;
        rootSq += 2 * root + 1;
        root += 1;
    }
    return root;
}

/// <summary>
/// Integer cube root by walking consecutive cubes with additions only.
/// The loop runs once per unit of the result, so it is only worthwhile for
/// small inputs, but it returns the correct floor for every 32-bit value.
/// </summary>
/// <param name="x">Value whose cube root is wanted.</param>
/// <returns>The largest y with y*y*y &lt;= x.</returns>
private static uint cbrt12(uint x)
{
    uint y = 0, b = 1, c = 0;
    // Running cube y^3. Held in 64 bits because the first cube >= x can exceed
    // uint.MaxValue (1626^3 > 2^32); a 32-bit accumulator would wrap and give
    // a wrong result for x > 1625^3.
    ulong a = 0;
    while (a < x)
    {
        y++;
        b += c;   // b = y^3 - (y-1)^3, the gap between consecutive cubes
        a += b;
        c += 6;   // the gap itself grows by 6*y each step
    }
    // The loop stops at the first cube >= x; step back unless it hit x exactly.
    if (a != x) y--;
    return y;
}

给定b
和top8
，您可以使用一个查找表（在我的代码中，8K个条目）找到与cuberoot（n）
很好的近似值。使用一些牛顿步骤（参见comingstorm的答案）完成它。
//在我的电脑上：Math.Sqrt 35 ns，cbrt64=2642245）
// Fragment: derive a lookup-table index from the bit length and leading bits of n.
// NOTE(review): assumes n is a non-zero unsigned 64-bit value; the
// count-leading-zeros builtins are undefined for 0 -- confirm at the call site.
int k = __builtin_clzll(n); // 64-bit leading-zero count (__builtin_clz only handles unsigned int)
int b = 64 - k;             // # of significant bits in n
// Top 8 significant bits of n, aligned so that bit 7 is always set. Values
// narrower than 8 bits are shifted left instead of using a negative shift count.
int top8 = (b >= 8) ? (int)(n >> (b - 8)) : (int)(n << (8 - b));
int approx = table[b][top8 & 0x7f]; // leading 1 bit is implied, so only the low 7 bits index the row

// On my pc: Math.Sqrt 35 ns, cbrt64 <70ns, cbrt32 <25 ns, (cbrt12 < 10ns)

// cbrt64(ulong x) is a C# version of:
// http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt     (acbrt1)

// cbrt32(uint x) is a C# version of:
// http://www.hackersdelight.org/hdcodetxt/icbrt.c.txt     (icbrt1)

// Union in C#:
// http://www.hanselman.com/blog/UnionsOrAnEquivalentInCSairamasTipOfTheDay.aspx

using System.Runtime.InteropServices;  
/// <summary>
/// Overlays a <c>float</c> and a <c>uint</c> on the same four bytes so the bit
/// pattern of one can be read or written through the other (float &lt;==&gt; uint).
/// </summary>
[StructLayout(LayoutKind.Explicit)]
public struct fu_32
{
    /// <summary>Floating-point view of the storage.</summary>
    [FieldOffset(0)] public float f;

    /// <summary>Unsigned-integer view of the same storage.</summary>
    [FieldOffset(0)] public uint u;
}

/// <summary>
/// Integer cube root of a 64-bit unsigned value: returns the largest y with
/// y*y*y &lt;= x. A single-precision estimate is produced first and then
/// corrected with exact integer arithmetic.
/// </summary>
/// <param name="x">Value whose cube root is wanted.</param>
/// <returns>The floor of the cube root of <paramref name="x"/>.</returns>
private static uint cbrt64(ulong x)
{
    // 18446724184312856125 is 2642245 cubed (see the clamped branch below);
    // everything at or above it has root 2642245.
    if (x >= 18446724184312856125) return 2642245;
    float fx = (float)x;
    // Reinterpret the float's bits as an integer through the overlay struct.
    fu_32 fu32 = new fu_32();
    fu32.f = fx;
    // u/4 * (1 + 1/4) * (1 + 1/16) * (1 + 1/256) is roughly u/3, computed without a divide.
    uint uy = fu32.u / 4;
    uy += uy / 4;
    uy += uy / 16;
    uy += uy / 256;
    // Additive constant taken from the referenced acbrt1 routine.
    uy += 0x2a5137a0;
    fu32.u = uy;
    float fy = fu32.f;   // rough floating-point cube-root estimate
    // First Newton iteration for cube root: y <- (x / y^2 + 2y) / 3.
    fy = 0.33333333f * (fx / (fy * fy) + 2.0f * fy);
    // Second Newton iteration, truncated to an integer.
    int y0 = (int)                                      
        (0.33333333f * (fx / (fy * fy) + 2.0f * fy));    
    uint y1 = (uint)y0;                                 

    // y2 and y3 track y1 squared and y1 cubed exactly in 64-bit arithmetic.
    ulong y2, y3;
    if (y1 >= 2642245)
    {
        // Clamp the estimate and use the precomputed square and cube.
        y1 = 2642245;
        y2 = 6981458640025;
        y3 = 18446724184312856125;
    }
    else
    {
        y2 = (ulong)y1 * y1;
        y3 = y2 * y1;
    }
    if (y3 > x)
    {
        // Estimate too high: walk down one step at a time, updating the square
        // and cube incrementally. Order matters: y1 first, then y2, then y3.
        y1 -= 1;
        y2 -= 2 * y1 + 1;
        y3 -= 3 * y2 + 3 * y1 + 1;
        while (y3 > x)
        {
            y1 -= 1;
            y2 -= 2 * y1 + 1;
            y3 -= 3 * y2 + 3 * y1 + 1;
        }
        return y1;
    }
    // Estimate not too high: walk up until the cube exceeds x. Order matters
    // here as well: y3 uses the old y2 and y1, y2 uses the old y1.
    do
    {
        y3 += 3 * y2 + 3 * y1 + 1;
        y2 += 2 * y1 + 1;
        y1 += 1;
    }
    while (y3 <= x);
    // The loop overshoots by exactly one.
    return y1 - 1;
}

/// <summary>
/// Integer cube root of a 32-bit unsigned value, built one result bit at a
/// time from the most significant bit down.
/// </summary>
/// <param name="x">Value whose cube root is wanted.</param>
/// <returns>The largest y with y*y*y &lt;= x.</returns>
private static uint cbrt32(uint x)
{
    // Starting shift: the smallest multiple of 3 (capped at 30) whose
    // 3-bit group still covers the highest set bit of x.
    int shift = 0;
    while (shift < 30 && x >= 1u << (shift + 3))
    {
        shift += 3;
    }

    uint root = 0;     // result bits found so far
    uint rootSq = 0;   // kept equal to root * root
    for (; shift >= 0; shift -= 3)
    {
        root *= 2;
        rootSq *= 4;

        // Amount the remainder must cover for the next result bit to be 1.
        uint trial = 3 * root + 3 * rootSq + 1 << shift;
        if (x < trial)
        {
            continue;
        }

        x -= trial;
        rootSq += 2 * root + 1;
        root += 1;
    }
    return root;
}

/// <summary>
/// Integer cube root by walking consecutive cubes with additions only.
/// The loop runs once per unit of the result, so it is only worthwhile for
/// small inputs, but it returns the correct floor for every 32-bit value.
/// </summary>
/// <param name="x">Value whose cube root is wanted.</param>
/// <returns>The largest y with y*y*y &lt;= x.</returns>
private static uint cbrt12(uint x)
{
    uint y = 0, b = 1, c = 0;
    // Running cube y^3. Held in 64 bits because the first cube >= x can exceed
    // uint.MaxValue (1626^3 > 2^32); a 32-bit accumulator would wrap and give
    // a wrong result for x > 1625^3.
    ulong a = 0;
    while (a < x)
    {
        y++;
        b += c;   // b = y^3 - (y-1)^3, the gap between consecutive cubes
        a += b;
        c += 6;   // the gap itself grows by 6*y each step
    }
    // The loop stops at the first cube >= x; step back unless it hit x exactly.
    if (a != x) y--;
    return y;
}

{
    y1 = 2642245;
    y2 = 6981458640025;
    y3 = 18446724184312856125;
}
else
{
    y2 = (ulong)y1 * y1;
    y3 = y2 * y1;
}
if (y3 > x)
{
    y1 -= 1;
    y2 -= 2 * y1 + 1;
    y3 -= 3 * y2 + 3 * y1 + 1;
    while (y3 > x)
    {
        y1 -= 1;
        y2 -= 2 * y1 + 1;
        y3 -= 3 * y2 + 3 * y1 + 1;
    }
    return y1;
}
do
{
    y3 += 3 * y2 + 3 * y1 + 1;
    y2 += 2 * y1 + 1;
    y1 += 1;
}
虽然（y3我已经将中的1.5.2
（第k个根）中的算法改编为（k==3）
，并给出了对初始猜测的“相对”准确的高估-该算法的性能似乎超过了上面的“黑客之乐”代码
不仅如此，作为一个文本，它还提供了理论背景、正确性证明和终止标准
假设我们能够产生一个“相对”好的初始高估值，我还没有找到一个超过（7）次迭代的情况。（这与64位值有2^6位有效相关吗？）无论哪种方式，它都是对HacDel代码中（21）次迭代的改进-线性O（b）收敛，尽管有一个明显快得多的环体
我使用的初始估计是基于值（x）中有效位数的“向上取整”。给定值（x）中有（b）个有效位，我们可以说：$2^{b-1} \le x < 2^{b}$，因此 $x^{1/3} < 2^{b/3} \le 2^{\lceil b/3 \rceil}$，即 $2^{\lceil b/3 \rceil}$ 是立方根的一个高估值。

static inline uint32_t u64_cbrt (uint64_t x)
{
    uint64_t r0 = 1, r1;
    /* IEEE-754 cbrt *可能* 不精确 */
    if (x == 0) /* cbrt(0)： */
        return (0);
    int b = (64) - __builtin_clzll(x);
r0感谢您的更正。我可以尝试一下，但我不清楚x（在相应的问题中）是否永远都不会太小。不过，我将查看链接。黑客的喜悦代码当然不适用于64位整数；它不适用于8589934592、8589934593、8602523648等。不过，我可能能够调整它。平方根（）自适应（sqrt->cbrt，0xFFFFFFFF->2642245）也会失败，从3375开始。如果两侧都安装了防护装置，则会在1844672418431285125处失败。哦，对不起。如果用于64位整数，代码中会出现溢出。问题（及其修复）是书中描述的，但显然不在网站上的代码中。这里的固定版本：Argh，抱歉。原来书中的代码也有bug：）。无论如何，这里有固定版本（这次测试了，包括测试驱动程序！）。函数是单调的（实际上是对多维数据集根数字的二进制搜索）测试驱动程序会检查所有“关键”点（0、i3和（i3）-1的所有i，以便计算不会溢出，0xFFFFFFFFFFFFFF）。至少在使用VC++编译时，这一点肯定是正确的：）您的“不成熟”实现的缓慢部分是什么？是pow（）吗调用或一个/两个循环？pow调用非常昂贵（按指令计数约140个时钟）但是剩下的不是免费的，尤其是分支预测失误；考虑到这一点，可能要花费80个时钟。好主意，但我不认为pow的偏差超过两个，所以牛顿的方法是过分的。所以，也许一些更便宜的近似+牛顿的方法会更快？也许。我将不得不研究它。有趣。如文中所述，与naive版本相比没有太大的竞争力，但应该很接近（足够需要测试；指令计数表明<370个周期）。我认为你的（result | 1@Charles）替代方案还不够
/*
 * Integer cube root of a 64-bit unsigned value: returns floor(cbrt(x)).
 *
 * Starts from a power-of-two overestimate derived from the bit length of x
 * and applies the integer Newton iteration until the estimate stops
 * decreasing. No floating point is involved.
 */
static inline uint32_t u64_cbrt (uint64_t x)
{
    /* __builtin_clzll is undefined for 0, so answer that case directly. */
    if (x == 0)
        return 0;

    /* Number of significant bits in x. */
    const int bits = 64 - __builtin_clzll(x);

    /* 2^ceil(bits / 3) is strictly greater than cbrt(x). */
    uint64_t next = (uint64_t) 1 << ((bits + 2) / 3);
    uint64_t cur;

    for (;;) {
        cur = next;
        /* Newton step for r^3 = x: r <- (2r + x / r^2) / 3. */
        next = (2 * cur + x / (cur * cur)) / 3;
        if (next >= cur)
            break;  /* no further decrease: cur is the answer */
    }

    return (uint32_t) cur;
}