在x86-64 skylake上以可重启序列优化percpu 2级位向量_C_Assembly_Linux Kernel_X86 64_Micro Optimization

在x86-64 skylake上以可重启序列优化percpu 2级位向量

c assembly linux-kernel

在x86-64 skylake上以可重启序列优化percpu 2级位向量,c,assembly,linux-kernel,x86-64,micro-optimization,C,Assembly,Linux Kernel,X86 64,Micro Optimization,我很好奇如何才能最好地优化下面的程序集，特别是“跳转到这里查看程序集”（便于control-f搜索）下代码块中的部分我正在写一些代码，热路径基本上是在一个位向量中找到一个0位，然后返回所说的位位向量包括： struct 2l_bitvec { // outer vector with bits indicating with inner vectors have available slots uint64_t v1; // inner vect

我很好奇如何才能最好地优化下面的程序集，特别是“跳转到这里查看程序集”（便于control-f搜索）下代码块中的部分

我正在写一些代码，其热路径基本上是在一个位向量中找到一个0位，将其置位，然后返回该位的索引

位向量包括：

// Two-level bit vector: one outer word (v1) summarising 64 inner words (v2).
// restarting_l2_set_idx() is handed a pointer to v1 and reaches inner word i
// as v1[i + 1], so it relies on v2 being laid out directly after v1.
// NOTE(review): an identifier may not start with a digit in C, so the tag
// `2l_bitvec` will not compile as written — confirm the real name.
struct 2l_bitvec {
       // outer vector with bits indicating which inner vectors have available slots
       // (restarting_l2_set_idx() sets bit i here once v2[i] is found to be all ones)
       uint64_t v1;

       // inner vector with actual index bits
       uint64_t v2[64];
} 2l_bitvec;

每个cpu都有一个bitvec（或多个链接在一个慢得多的路径数据结构中）

为了管理这些位向量内的一致性，我正在使用可重启序列（restartable sequences，rseq；向下滚动一点以获得我所能找到的最佳手册页）

由于使用了

rseq

（这是超级热代码），所以逻辑都是在内联汇编中编写的

我试图编写的C代码如下所示：

// Branch-prediction hints. Both normalise their argument with `!!` so that any
// scalar (pointer, 64-bit value, ...) is reduced to 0/1 before being compared
// with the expected value.
#define LIKELY(X)   __builtin_expect(!!(X), 1)
#define UNLIKELY(X) __builtin_expect(!!(X), 0)

/*
 * Claim the lowest clear bit of a two-level bit vector and return its index.
 *
 * v1        - points at the outer word; inner word i is accessed as
 *             v1[i + 1], i.e. the inner words must follow the outer word
 *             contiguously in memory.
 * start_cpu - CPU id the caller believes it is running on; compared against
 *             __rseq_abi.cpu_id_start before anything is touched.
 *
 * Returns:
 *   64 * outer_idx + inner_idx  when a clear bit was found and set,
 *   4097                        when start_cpu != __rseq_abi.cpu_id_start,
 *   (uint64_t)-1                when every bit of the outer word is set.
 *
 * A set bit in the outer word means the matching inner word has been found to
 * be all ones; the function records that lazily when it runs into a full
 * inner word and then retries with the next clear outer bit.
 */
uint64_t __attribute__((noinline))
restarting_l2_set_idx(uint64_t * v1, const uint32_t start_cpu) {

// if ever preempted, migrated, or catch a signal return here
catch_something_label:

    if (start_cpu != __rseq_abi.cpu_id_start) {
        return 4097;
    }

    uint64_t temp_v1 = *v1;
    while (LIKELY(temp_v1 != (~(0UL)))) {
        // temp_v1 has at least one clear bit here, so ~temp_v1 is non-zero.
        const uint32_t idx_v1 = _tzcnt_u64(~temp_v1);

        uint64_t temp_v2 = v1[idx_v1 + 1];
        if (LIKELY(temp_v2 != (~(0UL)))) {
            const uint32_t idx = _tzcnt_u64(~temp_v2);

            temp_v2 |= ((1UL) << idx);
            // Write back to the SAME inner word that was loaded above. The
            // slot is selected by idx_v1 (outer index), not by idx (the bit
            // position inside the inner word).
            v1[idx_v1 + 1] = temp_v2;

            return 64 * idx_v1 + idx;
        }

        // Inner word is full: mark it in the outer word and try the next one.
        temp_v1 |= ((1UL) << idx_v1);
        *v1 = temp_v1;
    }

    // Every inner word is marked full.
    return UINT64_MAX;
}

在伪代码中，包含所有

rseq

内容的程序集看起来是：

/*
Typical assembly will look as follows:
foo(..., uint32_t start_cpu) 
    RSEQ_INFO_DEF(32) 
    RSEQ_CS_ARR_DEF() 
    RSEQ_PREP_CS_DEF()

    // maybe some setup stuff (or maybe abort)

    "1:\n\t"    

    RSEQ_CMP_CUR_VS_START_CPUS()
    // handle migrated somehow

    <actual critical section here>
    "2:\n\t" (this is end label of critical section)

    // if abort is in another code section
    RSEQ_START_ABORT_DEF()
    <logic for abort here>
        // if this is goto generally jmp %l[abort]
        // otherwise some actual logic (usually set return var)
    RSEQ_END_ABORT_DEF()
    : <output variables, only if NOT goto asm>
    : <input variables> +
     [ start_cpu ] "g"(start_cpu), // always
    : <clobber registers> +
      "memory", "cc" // minimum clobbers
    #ifdef IS_GOTO_ASM
    : <jump labels OUTSIDE of the asm>
    #endif
*/

我改成了这个

        // Compute the inner-word address with ALU instructions, then load it.
        "movq %[v1], %[v2]\n\t"         // v2 = v1
        "salq $3, %[idx_v1]\n\t"        // idx_v1 = 8 * idx_v1
        "addq %[idx_v1], %[v2]\n\t"     // v2 += idx_v1 (index by uint64_t)
        "movq 8(%[v2]), %[temp_v2]\n\t" // temp_v2 = *(v2 + 8 bytes)

同样，由于

idx_v1

现在保存的是位位置的8倍，这意味着以下代码也必须改变：

        // in 7: label
        "btsq %[idx_v1], %[temp_v1]\n\t" // set bit idx_v1 in temp_v1

到

及

到

然而，我不确定这是否真的是一种性能改进。我想这可能是因为我确实需要为提交存储

v2

编辑2：@PeterCordes 指出我的编辑很愚蠢：我可以去掉临时变量

v2

和使用

movq 8(%[v1],%[idx_v1],8), %[temp_v2]

获取

temp_v2

和

movq %[temp_v2], 8(%[v1],%[idx_v1],8)

来存储它。对不起，我的第一次编辑太幼稚了：（

内联经常会出问题——那是因为您的约束有误。您修改了

%[temp_v1]

寄存器，但您告诉编译器它是一个

“r”

纯输入操作数。看起来您只是希望编译器为您分配一个暂存寄存器；使用

"=r"

虚拟输出操作数（dummy output operand）来完成此操作！也许你应该发布到codereview.SE（仍然要看整个问题，它很长；这只是我注意到的第一件事。）我在你告诉读者跳转到的代码块中读到的

#ifdef FAST_ABORT

部分没有对它的评论。如果你有一个宏，为什么不在那里使用它呢？

lea

可能不值得；我认为你只做了一个纯

mov

加载和可选的一个纯

mov

存储，所以

lea

只会让您多花1个前端uop。索引寻址模式的微融合适用于Haswell/Skylake上的store。除非您需要store-address uop能够在端口7上运行，以避免端口2/3上的其他瓶颈？不太可能。当您删除

lea

时，您现在又使用了2条指令？？？这与我的意思相反：使用addressing模式，而不是ALU指令。您的代码从不将值

%[v2]

存储在任何地方，您只将其用作存储目标。只需在加载和存储中使用

8(%[v1],%[idx_v1],8)

寻址模式，如

movq 8(%[v1],%[idx_v1],8), %[temp_v2]

，与后一个存储类似。索引负载仍然是单个uop，延迟与

mov (%reg), %reg

相同。（4c 的 load-use 延迟快速路径仅在地址直接来自另一个负载时适用。）谢谢@PeterCordes。我已经花了相当多的时间在我的其他函数中优化asm，使用了您在这里和我的其他帖子中的有用评论（并修复了我对变量修饰符的使用）。现在，我的代码在关键部分发出咕噜声的情况下大约快了3倍！（我学到了很多！）

        "leaq 8(%[v1],%[idx_v1],8), %[v2]\n\t" // v2 = v1 + 8 * idx_v1 + 8 bytes
        "movq (%[v2]), %[temp_v2]\n\t"         // temp_v2 = *v2

        // Compute the inner-word address with ALU instructions, then load it.
        "movq %[v1], %[v2]\n\t"         // v2 = v1
        "salq $3, %[idx_v1]\n\t"        // idx_v1 = 8 * idx_v1
        "addq %[idx_v1], %[v2]\n\t"     // v2 += idx_v1 (index by uint64_t)
        "movq 8(%[v2]), %[temp_v2]\n\t" // temp_v2 = *(v2 + 8 bytes)

        // in 7: label
        "btsq %[idx_v1], %[temp_v1]\n\t" // set bit idx_v1 in temp_v1

        // NOTE(review): presumably undoes a preceding `salq $3` so that idx_v1
        // is a plain bit position again before the bit-set — confirm.
        "sarq $3, %[idx_v1]\n\t"         // idx_v1 >>= 3
        "btsq %[idx_v1], %[temp_v1]\n\t" // set bit idx_v1 in temp_v1

        // in 9: label
        "salq $6, %[idx_v1]\n\t" // idx_v1 *= 64

        // idx_v1 *= 8. NOTE(review): presumably stands in for `salq $6` when
        // idx_v1 already holds 8 * index (8 * 8 = 64) — confirm.
        "salq $3, %[idx_v1]\n\t"