C++ 为什么这个反汇编出来的 std::string 析构函数中有一条 lock xadd 指令?

C++ 为什么这个反汇编出来的 std::string 析构函数中有一条 lock xadd 指令?,c++,gcc,assembly,x86-64,atomic,C++,Gcc,Assembly,X86 64,Atomic,我有一个非常简单的代码: #include <string> #include <iostream> int main() { std::string s("abc"); std::cout << s; } 然后反汇编,最有趣的部分是: 00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>: 4009a0: 48 81 ff a0 11 60 00

我有一个非常简单的代码:

#include <string>
#include <iostream>

// Minimal reproducer: constructs a std::string from a literal and streams it
// to std::cout. The disassembly discussed below comes from this program.
int main() {
    std::string s("abc");  // s goes out of scope (and is destroyed) when main returns
    std::cout << s;        // written without a trailing newline or explicit flush
}
然后反汇编,最有趣的部分是:

00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:
  4009a0:       48 81 ff a0 11 60 00    cmp    rdi,0x6011a0
  4009a7:       75 01                   jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9:       c3                      ret    
  4009aa:       b8 00 00 00 00          mov    eax,0x0
  4009af:       48 85 c0                test   rax,rax
  4009b2:       74 11                   je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>
  4009b4:       83 c8 ff                or     eax,0xffffffff
  4009b7:       f0 0f c1 47 10          lock xadd DWORD PTR [rdi+0x10],eax
  4009bc:       85 c0                   test   eax,eax
  4009be:       7f e9                   jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  4009c0:       e9 cb fd ff ff          jmp    400790 <_ZdlPv@plt>
  4009c5:       8b 47 10                mov    eax,DWORD PTR [rdi+0x10]
  4009c8:       8d 50 ff                lea    edx,[rax-0x1]
  4009cb:       89 57 10                mov    DWORD PTR [rdi+0x10],edx
  4009ce:       eb ec                   jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>
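00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:
  4009a0:       48 81 ff a0 11 60 00    cmp    rdi,0x6011a0
  4009a7:       75 01                   jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9:       c3                      ret
  4009aa:       b8 00 00 00 00          mov    eax,0x0
  4009af:       48 85 c0                test   rax,rax
  4009b2:       74 11                   je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>
  4009b4:       83 c8 ff                or     eax,0xffffffff
  4009b7:       f0 0f c1 47 10          lock xadd DWORD PTR [rdi+0x10],eax
  4009bc:       85 c0                   test   eax,eax
  4009be:       7f e9                   jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  4009c0:       e9 cb fd ff ff          jmp    400790 <_ZdlPv@plt>
  4009c5:       8b 47 10                mov    eax,DWORD PTR [rdi+0x10]
  4009c8:       8d 50 ff                lea    edx,[rax-0x1]
  4009cb:       89 57 10                mov    DWORD PTR [rdi+0x10],edx
  4009ce:       eb ec                   jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>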
为什么
_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10
(也就是
std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep::_M_dispose(std::allocator<char> const&) [clone .isra.10]
)中会有一条带 lock 前缀的 xadd 指令?


接下来的一个问题是如何避免它?

这看起来是与字符串(引用计数、写时复制的实现)相关的代码。带 lock 前缀的指令递减引用计数;只有当保存实际字符串数据、可能被共享的缓冲区的引用计数降为零(即它不再被共享:没有其他字符串对象引用它)时,才会调用
operator delete

由于libstdc++是开源的,我们可以通过查看源代码来确认这一点

被反汇编的函数
_ZNSs4_Rep10_M_disposeERKSaIcE
经过 demangle(符号还原)后是
std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep::_M_dispose(std::allocator<char> const&)
。以下是 gcc-4.x 时代 libstdc++ 中对应的源代码:

void
_M_dispose(const _Alloc& __a)
{
#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
  if (__builtin_expect(this != &_S_empty_rep(), false))
#endif
    {
      // Be race-detector-friendly.  For more info see bits/c++config.
      _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&this->_M_refcount);
      if (__gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount,
                                                 -1) <= 0)
        {
          _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&this->_M_refcount);
          _M_destroy(__a);
        }
    }
}  // XXX MT

有了这段源代码,我们就可以为你提供的汇编代码加上注释,把每条指令映射回 C++ 源代码:

00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:

  # the next two lines implement the check:
  # if (__builtin_expect(this != &_S_empty_rep(), false))
  # which is an empty string optimization. The S_empty_rep singleton
  # is at address 0x6011a0 and if the current buffer points to that
  # we are done (execute the ret)
  4009a0: cmp    rdi,0x6011a0
  4009a7: jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9: ret

  # now we are in the implementation of
  # __gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, -1)
  # which dispatches either to an atomic version of the add function
  # or the non-atomic version, depending on the value of `eax` which
  # is always directly set to zero, so the non-atomic version is 
  # *always called* (see details below)
  4009aa: mov    eax,0x0
  4009af: test   rax,rax
  4009b2: je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>

  # this is the atomic version of the decrement you were concerned about
  # but we never execute this code because the test above always jumps
  # to 4009c5 (the non-atomic version)
  4009b4: or     eax,0xffffffff
  4009b7: lock xadd DWORD PTR [rdi+0x10],eax
  4009bc: test   eax,eax
  # check if the result of the xadd was zero, if not skip the delete
  4009be: jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  # the delete call
  4009c0: jmp    400790 <_ZdlPv@plt> # tailcall

  # the non-atomic version starts here, this is the code that is 
  # always executed
  4009c5: mov    eax,DWORD PTR [rdi+0x10]
  4009c8: lea    edx,[rax-0x1]
  4009cb: mov    DWORD PTR [rdi+0x10],edx
  # this jumps up to the test eax,eax check which calls operator delete
  # if the refcount was zero
  4009ce: jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>
00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:

  # 接下来的两条指令实现了这个检查:
  # if (__builtin_expect(this != &_S_empty_rep(), false))
  # 这是针对空字符串的优化。_S_empty_rep 单例
  # 位于地址 0x6011a0,如果当前缓冲区指向该地址,
  # 就直接结束(执行 ret)
  4009a0: cmp    rdi,0x6011a0
  4009a7: jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9: ret

  # 现在进入的是
  # __gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, -1)
  # 的实现:它根据 `eax` 的值分派到加法函数的原子版本
  # 或非原子版本;而 `eax` 总是被直接置为零,
  # 所以*总是调用*非原子版本(详见下文)
  4009aa: mov    eax,0x0
  4009af: test   rax,rax
  4009b2: je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>

  # 这就是你所担心的原子版本的递减,
  # 但这段代码永远不会执行,因为上面的测试总是跳转
  # 到 4009c5(非原子版本)
  4009b4: or     eax,0xffffffff
  4009b7: lock xadd DWORD PTR [rdi+0x10],eax
  4009bc: test   eax,eax
  # 检查递减前的引用计数是否大于零;若大于零则跳过 delete
  4009be: jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  # 调用 operator delete
  4009c0: jmp    400790 <_ZdlPv@plt> # 尾调用

  # 非原子版本从这里开始,这才是
  # 总会被执行的代码
  4009c5: mov    eax,DWORD PTR [rdi+0x10]
  4009c8: lea    edx,[rax-0x1]
  4009cb: mov    DWORD PTR [rdi+0x10],edx
  # 跳回上面的 test eax,eax 检查;若递减前的引用计数
  # 不大于零,则调用 operator delete
  4009ce: jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>
需要注意的是,你所担心的
lock xadd
代码从未被执行。代码中先有一条
mov eax,0x0
,紧接着是
test rax,rax; je
——这个测试总是成立,跳转也总会发生,因为
rax
始终为零。

这里发生的事情是:
__gnu_cxx::__atomic_add_dispatch
的实现会先检查进程是否确实是单线程的。如果确实是单线程的,那么像
__atomic_add_dispatch
这样的操作就不必使用昂贵的原子指令,只需使用普通的非原子加法。它的判断方法是检查 pthreads 函数
__pthread_key_create
的地址:如果该地址为零,说明没有链接
pthread
库,因此进程必定是单线程的。在本例中,这个 pthread 函数的地址在链接时被解析为
0
(编译命令行上没有
-lpthread
),这就是
mov eax,0x0
的来源。到了链接阶段,再根据这一信息做优化已经太迟了,所以这段原子加法(此处为递减)代码残留了下来,但永远不会被执行。有关此机制的详细信息,请参阅 libstdc++ 的 ext/atomicity.h 与 gthr-posix.h(其中的 __gthread_active_p())。

实际执行的代码是函数的最后一部分,从
4009c5
开始。这段代码同样递减引用计数,只不过是以非原子方式进行的。在这两种实现之间做选择的检查,大概取决于进程是否是多线程的,例如是否链接了
-lpthread
。不管出于什么原因,这个检查在
__exchange_and_add_dispatch
内部的实现方式使编译器无法真正删除该分支的原子部分,即使在某些情况下(例如本例)已经知道它永远不会被执行。
00000000004009a0 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10>:

  # the next two lines implement the check:
  # if (__builtin_expect(this != &_S_empty_rep(), false))
  # which is an empty string optimization. The S_empty_rep singleton
  # is at address 0x6011a0 and if the current buffer points to that
  # we are done (execute the ret)
  4009a0: cmp    rdi,0x6011a0
  4009a7: jne    4009aa <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0xa>
  4009a9: ret

  # now we are in the implementation of
  # __gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, -1)
  # which dispatches either to an atomic version of the add function
  # or the non-atomic version, depending on the value of `eax` which
  # is always directly set to zero, so the non-atomic version is 
  # *always called* (see details below)
  4009aa: mov    eax,0x0
  4009af: test   rax,rax
  4009b2: je     4009c5 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x25>

  # this is the atomic version of the decrement you were concerned about
  # but we never execute this code because the test above always jumps
  # to 4009c5 (the non-atomic version)
  4009b4: or     eax,0xffffffff
  4009b7: lock xadd DWORD PTR [rdi+0x10],eax
  4009bc: test   eax,eax
  # check if the result of the xadd was zero, if not skip the delete
  4009be: jg     4009a9 <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x9>
  # the delete call
  4009c0: jmp    400790 <_ZdlPv@plt> # tailcall

  # the non-atomic version starts here, this is the code that is 
  # always executed
  4009c5: mov    eax,DWORD PTR [rdi+0x10]
  4009c8: lea    edx,[rax-0x1]
  4009cb: mov    DWORD PTR [rdi+0x10],edx
  # this jumps up to the test eax,eax check which calls operator delete
  # if the refcount was zero
  4009ce: jmp    4009bc <_ZNSs4_Rep10_M_disposeERKSaIcE.isra.10+0x1c>