C++ 为什么在这个反汇编的 std::string 析构函数中有一条带 lock 前缀的 xadd 指令？_C++_Gcc_Assembly_X86 64_Atomic

C++ 为什么在这个反汇编的 std::string 析构函数中有一条带 lock 前缀的 xadd 指令？

c++ gcc assembly

C++ 为什么在这个解包的std:：string dtor中有一个锁定的xadd指令？,c++,gcc,assembly,x86-64,atomic,C++,Gcc,Assembly,X86 64,Atomic,我有一个非常简单的代码： #include <string> #include <iostream> int main() { std::string s("abc"); std::cout << s; } 然后反编译，最有趣的是： 00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>: 4009a0: 48 81 ff a0 11 60 00

我有一个非常简单的代码：

#include <string>
#include <iostream>

int main() {
    std::string s("abc");
    std::cout << s;
}

然后对其进行反汇编，最有趣的部分是：

00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:
  4009a0:       48 81 ff a0 11 60 00    cmp    rdi,0x6011a0
  4009a7:       75 01                   jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9:       c3                      ret    
  4009aa:       b8 00 00 00 00          mov    eax,0x0
  4009af:       48 85 c0                test   rax,rax
  4009b2:       74 11                   je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>
  4009b4:       83 c8 ff                or     eax,0xffffffff
  4009b7:       f0 0f c1 47 10          lock xadd DWORD PTR [rdi+0x10],eax
  4009bc:       85 c0                   test   eax,eax
  4009be:       7f e9                   jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  4009c0:       e9 cb fd ff ff          jmp    400790 <_ZdlPv@plt>
  4009c5:       8b 47 10                mov    eax,DWORD PTR [rdi+0x10]
  4009c8:       8d 50 ff                lea    edx,[rax-0x1]
  4009cb:       89 57 10                mov    DWORD PTR [rdi+0x10],edx
  4009ce:       eb ec                   jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>

00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:
4009a0: 48 81 ff a0 11 60 00  cmp rdi,0x6011a0
4009a7: 75 01                 jne 4009aa
4009a9: c3                    ret
4009aa: b8 00 00 00 00        mov eax,0x0
4009af: 48 85 c0              test rax,rax
4009b2: 74 11                 je 4009c5
4009b4: 83 c8 ff              or eax,0xffffffff
4009b7: f0 0f c1 47 10        lock xadd DWORD PTR [rdi+0x10],eax
4009bc: 85 c0                 test eax,eax
4009be: 7f e9                 jg 4009a9
4009c0: e9 cb fd ff ff        jmp 400790 <_ZdlPv@plt>
4009c5: 8b 47 10              mov eax,DWORD PTR [rdi+0x10]
4009c8: 8d 50 ff              lea edx,[rax-0x1]
4009cb: 89 57 10              mov DWORD PTR [rdi+0x10],edx
4009ce: eb ec                 jmp 4009bc

为什么

_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10

（也就是

std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep::_M_dispose(std::allocator<char> const&) [clone .isra.10]

）中会有一条带 lock 前缀的 xadd 指令？

接下来的一个问题是如何避免它？

它看起来像是与写时复制（copy-on-write）字符串相关的代码。这条带 lock 前缀的指令会递减引用计数，并且仅当存放实际字符串数据、可能被共享的缓冲区的引用计数降为零（即它未被共享：没有其他字符串对象引用它）时，才调用

operator delete。

由于libstdc++是开源的，我们可以通过查看源代码来确认这一点

你反汇编的这个函数

_ZNSs4_Rep10_M_disposeERKSaIcE

经过名称还原（demangle）[1] 后是

std::basic_string<char>::_Rep::_M_dispose(std::allocator<char> const&)

。以下是 gcc-4.x 时代 [2] 的 libstdc++ 中对应的源码：

void
_M_dispose(const _Alloc& __a)
{
#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
  if (__builtin_expect(this != &_S_empty_rep(), false))
#endif
    {
      // Be race-detector-friendly.  For more info see bits/c++config.
      _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&this->_M_refcount);
      if (__gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount,
                                                 -1) <= 0)
        {
          _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&this->_M_refcount);
          _M_destroy(__a);
        }
    }
}  // XXX MT

有了这份源码，我们就可以给你提供的汇编代码加上注释，把每条指令映射回 C++ 源码：

00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:

  # the next two lines implement the check:
  # if (__builtin_expect(this != &_S_empty_rep(), false))
  # which is an empty string optimization. The S_empty_rep singleton
  # is at address 0x6011a0 and if the current buffer points to that
  # we are done (execute the ret)
  4009a0: cmp    rdi,0x6011a0
  4009a7: jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9: ret

  # now we are in the implementation of
  # __gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, -1)
  # which dispatches either to an atomic version of the add function
  # or the non-atomic version, depending on the value of `eax` which
  # is always directly set to zero, so the non-atomic version is 
  # *always called* (see details below)
  4009aa: mov    eax,0x0
  4009af: test   rax,rax
  4009b2: je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>

  # this is the atomic version of the decrement you were concerned about
  # but we never execute this code because the test above always jumps
  # to 4009c5 (the non-atomic version)
  4009b4: or     eax,0xffffffff
  4009b7: lock xadd DWORD PTR [rdi+0x10],eax
  4009bc: test   eax,eax
  # check if the result of the xadd was zero, if not skip the delete
  4009be: jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  # the delete call
  4009c0: jmp    400790 <_ZdlPv@plt> # tailcall

  # the non-atomic version starts here, this is the code that is 
  # always executed
  4009c5: mov    eax,DWORD PTR [rdi+0x10]
  4009c8: lea    edx,[rax-0x1]
  4009cb: mov    DWORD PTR [rdi+0x10],edx
  # this jumps up to the test eax,eax check which calls operator delete
  # if the refcount was zero
  4009ce: jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>

00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:
# 接下来的两行实现了如下检查：
# if (__builtin_expect(this != &_S_empty_rep(), false))
# 这是一个空字符串优化。_S_empty_rep 单例
# 位于地址 0x6011a0，如果当前缓冲区指向该地址，
# 我们就完成了（执行 ret）
4009a0: cmp rdi,0x6011a0
4009a7: jne 4009aa
4009a9: ret
# 现在我们进入了
# __gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, -1) 的实现，
# 它会分派到 add 函数的原子版本
# 或非原子版本，取决于 `eax` 的值；
# 而 `eax` 总是被直接设置为零，因此
# *始终调用* 非原子版本（请参阅下面的详细信息）
4009aa: mov eax,0x0
4009af: test rax,rax
4009b2: je 4009c5
# 这是你所关心的递减操作的原子版本，
# 但这段代码永远不会执行，因为上面的测试总是跳转
# 到 4009c5（非原子版本）
4009b4: or eax,0xffffffff
4009b7: lock xadd DWORD PTR [rdi+0x10],eax
4009bc: test eax,eax
# 检查 xadd 的结果是否为零，如果不是，则跳过 delete
4009be: jg 4009a9
# 调用 delete
4009c0: jmp 400790 <_ZdlPv@plt> # 尾调用
# 非原子版本从这里开始，这才是
# 始终被执行的代码
4009c5: mov eax,DWORD PTR [rdi+0x10]
4009c8: lea edx,[rax-0x1]
4009cb: mov DWORD PTR [rdi+0x10],edx
# 这里跳回到上面的 test eax,eax 检查；
# 如果 refcount 为零，它就会调用 operator delete
4009ce: jmp 4009bc

需要注意的是，您所关心的

lock xadd

代码从未执行过。有一个

mov eax, 0

后接一个

test rax, rax; je

——这个测试始终成立，跳转也始终发生，因为

rax

始终为零

这里发生的事情是，

__gnu_cxx::__exchange_and_add_dispatch

的实现方式是先检查进程是否确实是单线程的。如果确实是单线程的，那么像

__exchange_and_add_dispatch

这样的操作就不必使用昂贵的原子指令——它只需使用常规的非原子加法。它是通过检查一个 pthreads 函数

__pthread_key_create

的地址来做到这一点的——如果该地址为零，说明没有链接

pthread

库，因此进程肯定是单线程的。在本例中，这个 pthread 函数的地址在链接时被解析为

0（编译命令行上没有

-lpthread

），这就是

mov eax, 0x0

的来源。到了链接阶段，再根据这一信息进行优化已经太迟了，所以残留的原子递减代码仍然保留在二进制中，但永远不会被执行。有关此机制的更多细节，请参阅原回答中链接的相关解答。

实际执行的代码是函数的最后一部分，从

4009c5

开始。这段代码同样会递减引用计数，只不过采用的是非原子方式。在这两种实现之间做出选择的检查，很可能取决于进程是否是多线程的，例如

-lpthread

是否已被链接。无论出于何种原因，这个位于

__exchange_and_add_dispatch

内部的检查，其实现方式使得编译器无法真正删除分支中的原子部分，即使在构建过程的某个阶段已经知道它永远不会被执行。
00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:

  # the next two lines implement the check:
  # if (__builtin_expect(this != &_S_empty_rep(), false))
  # which is an empty string optimization. The S_empty_rep singleton
  # is at address 0x6011a0 and if the current buffer points to that
  # we are done (execute the ret)
  4009a0: cmp    rdi,0x6011a0
  4009a7: jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9: ret

  # now we are in the implementation of
  # __gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, -1)
  # which dispatches either to an atomic version of the add function
  # or the non-atomic version, depending on the value of `eax` which
  # is always directly set to zero, so the non-atomic version is 
  # *always called* (see details below)
  4009aa: mov    eax,0x0
  4009af: test   rax,rax
  4009b2: je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>

  # this is the atomic version of the decrement you were concerned about
  # but we never execute this code because the test above always jumps
  # to 4009c5 (the non-atomic version)
  4009b4: or     eax,0xffffffff
  4009b7: lock xadd DWORD PTR [rdi+0x10],eax
  4009bc: test   eax,eax
  # check if the result of the xadd was zero, if not skip the delete
  4009be: jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  # the delete call
  4009c0: jmp    400790 <_ZdlPv@plt> # tailcall

  # the non-atomic version starts here, this is the code that is 
  # always executed
  4009c5: mov    eax,DWORD PTR [rdi+0x10]
  4009c8: lea    edx,[rax-0x1]
  4009cb: mov    DWORD PTR [rdi+0x10],edx
  # this jumps up to the test eax,eax check which calls operator delete
  # if the refcount was zero
  4009ce: jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>