C Linux上线程本地变量访问的速度有多快_C_Linux_Multithreading_Gcc_X86 64

C Linux上线程本地变量访问的速度有多快

c linux multithreading gcc

C Linux上线程本地变量访问的速度有多快,c,linux,multithreading,gcc,x86-64,C,Linux,Multithreading,Gcc,X86 64,Linux中访问线程本地变量的速度有多快。从gcc编译器生成的代码中，我可以看到它使用了fs段寄存器。因此，显然，访问线程局部变量不应该花费额外的周期。然而，我一直在读关于线程局部变量访问缓慢的恐怖故事。怎么会？当然，有时不同的编译器使用的方法不同于使用fs段寄存器，但是通过fs段寄存器访问线程局部变量是否也很慢？ Linux中访问线程本地变量的速度有多快？这取决于很多事情。某些处理器（i*86）具有特殊段（fs，或gs处于x86_64模式）。其他处理器没有（但通常它们会保留一个寄存器，用于

Linux中访问线程本地变量的速度有多快？从gcc编译器生成的代码中，我可以看到它使用了

fs

段寄存器。因此，显然，访问线程局部变量不应该花费额外的周期

然而，我一直在读关于线程局部变量访问缓慢的恐怖故事。怎么会？当然，有时不同的编译器使用的方法不同于使用

fs

段寄存器，但是通过

fs

段寄存器访问线程局部变量是否也很慢

Linux中访问线程本地变量的速度有多快

这取决于很多事情

某些处理器（

i*86

）具有特殊段（

fs

，或

gs

处于

x86_64

模式）。其他处理器没有（但通常它们会保留一个寄存器，用于访问当前线程，并且使用该专用寄存器很容易找到

TLS

）

在

i*86

上，使用

fs

，访问速度几乎与直接内存访问一样快

我一直在读关于线程局部变量访问缓慢的恐怖故事

如果你能提供一些类似恐怖故事的链接，这会有所帮助。如果没有这些链接，就不可能知道他们的作者是否知道他们在谈论什么

然而，我一直在读关于线程局部变量访问缓慢的恐怖故事。为什么

让我用一个例子来演示Linux x86_64上线程局部变量有多慢

没有

__thread

变量，没有变慢

#include "stdio.h"
#include "math.h"

// Benchmark variable; __thread gives every thread its own copy.
__thread double tlvar;

// The separate declaration carries the noinline attribute so that the
// compiler does not inline get_value() into the benchmark loop.
double get_value(void) __attribute__ ((noinline));

/* Returns the calling thread's current value of tlvar. */
double get_value(void)
{
  return tlvar;
}

/*
 * Benchmark driver: sets tlvar to 1.0, then calls get_value() one billion
 * times, accumulating the square root of each result, and prints the sum.
 *
 * Returns 0 so that a completed run is reported as success to the shell.
 */
int main(void)
{
  enum { ITERATIONS = 1000000000 };
  double f = 0.0;

  tlvar = 1.0;
  for (int i = 0; i < ITERATIONS; i++)
  {
    // Every iteration goes through the non-inlined accessor.
    f += sqrt(get_value());
  }
  printf("f = %f\n", f);
  return 0;
}

我将使用此测试的性能作为基础

    #include "stdio.h"
    #include "math.h"

    // Ordinary global (not thread-local): this variant is the baseline.
    double tlvar;

    // The separate declaration carries the noinline attribute so that the
    // compiler does not inline get_value() into the benchmark loop.
    double get_value(void) __attribute__ ((noinline));

    /* Returns the current value of the global tlvar. */
    double get_value(void)
    {
      return tlvar;
    }
    /*
     * Baseline benchmark driver: sets the global tlvar to 1.0, then calls
     * get_value() one billion times, accumulating the square root of each
     * result, and prints the sum.
     *
     * Returns 0 so that a completed run is reported as success to the shell.
     */
    int main(void)
    {
      enum { ITERATIONS = 1000000000 };
      double f = 0.0;

      tlvar = 1.0;
      for (int i = 0; i < ITERATIONS; i++)
      {
         // Every iteration goes through the non-inlined accessor.
         f += sqrt(get_value());
      }
      printf("f = %f\n", f);
      return 0;
    }

可执行文件（不在共享库中）中有

__thread

变量，仍然没有变慢

#include "stdio.h"
#include "math.h"

// Thread-local benchmark variable defined in the executable itself.
__thread double tlvar;

// The separate declaration carries the noinline attribute so that the
// compiler does not inline get_value() into the benchmark loop.
double get_value(void) __attribute__ ((noinline));

/* Returns the calling thread's current value of tlvar. */
double get_value(void)
{
  return tlvar;
}

/*
 * Benchmark driver for the thread-local-in-executable case: sets tlvar to
 * 1.0, calls get_value() one billion times while summing the square roots,
 * and prints the total.
 *
 * Returns 0 so that a completed run is reported as success to the shell.
 */
int main(void)
{
  enum { ITERATIONS = 1000000000 };
  double f = 0.0;

  tlvar = 1.0;
  for (int i = 0; i < ITERATIONS; i++)
  {
    // Every iteration goes through the non-inlined accessor.
    f += sqrt(get_value());
  }
  printf("f = %f\n", f);
  return 0;
}

因此，很明显，当

__thread 变量在可执行文件中时，它的速度与普通全局变量一样快


有一个 __thread
变量，它位于共享库中，速度变慢
#include "stdio.h"
#include "math.h"

// Benchmark variable; __thread gives every thread its own copy.
__thread double tlvar;

// The separate declaration carries the noinline attribute so that the
// compiler does not inline get_value() into the benchmark loop.
double get_value(void) __attribute__ ((noinline));

/* Returns the calling thread's current value of tlvar. */
double get_value(void)
{
  return tlvar;
}

/*
 * Benchmark driver: sets tlvar to 1.0, then calls get_value() one billion
 * times, accumulating the square root of each result, and prints the sum.
 *
 * Returns 0 so that a completed run is reported as success to the shell.
 */
int main(void)
{
  enum { ITERATIONS = 1000000000 };
  double f = 0.0;

  tlvar = 1.0;
  for (int i = 0; i < ITERATIONS; i++)
  {
    // Every iteration goes through the non-inlined accessor.
    f += sqrt(get_value());
  }
  printf("f = %f\n", f);
  return 0;
}

可执行文件：
共享库：
它的运行速度几乎慢了两倍：
$ time ./inet_test_main
f = 1000000000.000000

real    0m9.978s
user    0m9.906s
sys     0m0.004s

最后，这是 perf
报告的内容——__tls_get_addr 占用了 21% 的 CPU 时间：
$ perf report --stdio
#
# Events: 10K cpu-clock
#
# Overhead         Command        Shared Object              Symbol
# ........  ..............  ...................  ..................
#
    58.05%  inet_test_main  libinet_test_lib.so  [.] test
    21.15%  inet_test_main  ld-2.12.so           [.] __tls_get_addr
    10.69%  inet_test_main  libinet_test_lib.so  [.] get_value
     5.07%  inet_test_main  libinet_test_lib.so  [.] get_value@plt
     4.82%  inet_test_main  libinet_test_lib.so  [.] __tls_get_addr@plt
     0.23%  inet_test_main  [kernel.kallsyms]    [k] 0xffffffffa0165b75


因此，正如您所看到的，当线程局部变量位于共享库中时（声明为静态且仅在共享库中使用）速度相当慢。如果共享库中的线程局部变量很少被访问，那么性能就不会有问题。如果像在这个测试中一样经常使用它，那么开销将非常大
评论中提到的文档讨论了四种可能的TLS访问模型。坦白地说，我不明白什么时候使用“initial-exec TLS 模型”，但是对于其他三个模型，只有当 __thread
变量在可执行文件中并且可以从可执行文件访问时，才可以避免调用 __tls_get_addr()。有没有人有动力读一读这篇文章，并用一个简短的回答来概括它？“恐怖故事”可能来自通过 pthread_setspecific 实现的TSS（线程特定存储）。TSS比TLS慢，但如果做得不好的话，它会比TLS慢很多。我可以给你讲一个关于非线程局部变量（一个简单的整数计数器）慢的恐怖故事：它被几个线程修改，并且由于缓存窥探而使系统慢得像在爬行。把它改成线程局部变量，并在最后对所有线程的局部值求和，这给了我100倍左右的加速！我正在使用的工具有着完全相同的问题，我也像你一样解决了它，也就是说，使用线程局部变量 :)！干杯！恐怖故事？没问题：我在一个嵌入式MIPS平台上工作过，在这个平台上，每次访问线程本地存储都会导致非常慢的内核调用。您可以在该平台上每秒进行大约8000次TLS访问。为所有这些测试 +1，很棒！然而，每次操作5纳秒并不是我所说的真正的慢。它与函数调用处于同一数量级，因此除非访问线程局部变量实际上是您唯一要做的事情，否则它永远不会成为问题。线程同步通常要昂贵得多。如果您可以通过使用线程本地存储来避免这种情况，那么您就获得了巨大的收益。在共享库中，您可以使用 -ftls-model=initial-exec 或 __attribute__((tls_model("initial-exec")))，但您必须非常小心。它会破坏dlopen，并且共享对象加载的顺序也变得很重要，因为如果已经加载了太多的静态或动态TLS对象，则设置了 STATIC_TLS 标志的ELF可能无法加载。（即应首先加载静态TLS对象）
$ cat inet_test_main.c
#include "stdio.h"
#include "math.h"
/* Benchmark entry point; its definition is listed in inet_test_lib.c. */
int test(void);

/*
 * Runs the benchmark through test() and exits.
 *
 * Returns 0 so that a completed run is reported as success to the shell.
 */
int main(void)
{
   test();
   return 0;
}

$ cat inet_test_lib.c
#include "stdio.h"
#include "math.h"

// Thread-local benchmark variable, private to this translation unit.
static __thread double tlvar;

// The separate declaration carries the noinline attribute so that the
// compiler does not inline get_value() into the benchmark loop.
double get_value(void) __attribute__ ((noinline));

/* Returns the calling thread's current value of tlvar. */
double get_value(void)
{
  return tlvar;
}

/*
 * Benchmark body: stores 1.0 in tlvar, then performs one billion calls to
 * get_value(), summing the square root of each returned value, and prints
 * the sum. Always returns 1.
 */
int test()
{
  double total = 0.0;
  int remaining = 1000000000;

  tlvar = 1.0;
  while (remaining-- > 0)
  {
    // One non-inlined accessor call per iteration.
    total += sqrt(get_value());
  }
  printf("f = %f\n", total);
  return 1;
}

Dump of assembler code for function get_value:
=> 0x00007ffff7dfc6d0 <+0>:     lea    0x200329(%rip),%rdi        # 0x7ffff7ffca00
   0x00007ffff7dfc6d7 <+7>:     callq  0x7ffff7dfc5c8 <__tls_get_addr@plt>
   0x00007ffff7dfc6dc <+12>:    movsd  0x0(%rax),%xmm0
   0x00007ffff7dfc6e4 <+20>:    retq
End of assembler dump.

(gdb) disas __tls_get_addr
Dump of assembler code for function __tls_get_addr:
   0x0000003c40a114d0 <+0>:     push   %rbx
   0x0000003c40a114d1 <+1>:     mov    %rdi,%rbx
=> 0x0000003c40a114d4 <+4>:     mov    %fs:0x8,%rdi
   0x0000003c40a114dd <+13>:    mov    0x20fa74(%rip),%rax        # 0x3c40c20f58 <_rtld_local+3928>
   0x0000003c40a114e4 <+20>:    cmp    %rax,(%rdi)
   0x0000003c40a114e7 <+23>:    jne    0x3c40a11505 <__tls_get_addr+53>
   0x0000003c40a114e9 <+25>:    xor    %esi,%esi
   0x0000003c40a114eb <+27>:    mov    (%rbx),%rdx
   0x0000003c40a114ee <+30>:    mov    %rdx,%rax
   0x0000003c40a114f1 <+33>:    shl    $0x4,%rax
   0x0000003c40a114f5 <+37>:    mov    (%rax,%rdi,1),%rax
   0x0000003c40a114f9 <+41>:    cmp    $0xffffffffffffffff,%rax
   0x0000003c40a114fd <+45>:    je     0x3c40a1151b <__tls_get_addr+75>
   0x0000003c40a114ff <+47>:    add    0x8(%rbx),%rax
   0x0000003c40a11503 <+51>:    pop    %rbx
   0x0000003c40a11504 <+52>:    retq
   0x0000003c40a11505 <+53>:    mov    (%rbx),%rdi
   0x0000003c40a11508 <+56>:    callq  0x3c40a11200 <_dl_update_slotinfo>
   0x0000003c40a1150d <+61>:    mov    %rax,%rsi
   0x0000003c40a11510 <+64>:    mov    %fs:0x8,%rdi
   0x0000003c40a11519 <+73>:    jmp    0x3c40a114eb <__tls_get_addr+27>
   0x0000003c40a1151b <+75>:    callq  0x3c40a11000 <tls_get_addr_tail>
   0x0000003c40a11520 <+80>:    jmp    0x3c40a114ff <__tls_get_addr+47>
End of assembler dump.

$ time ./inet_test_main
f = 1000000000.000000

real    0m9.978s
user    0m9.906s
sys     0m0.004s

$ perf report --stdio
#
# Events: 10K cpu-clock
#
# Overhead         Command        Shared Object              Symbol
# ........  ..............  ...................  ..................
#
    58.05%  inet_test_main  libinet_test_lib.so  [.] test
    21.15%  inet_test_main  ld-2.12.so           [.] __tls_get_addr
    10.69%  inet_test_main  libinet_test_lib.so  [.] get_value
     5.07%  inet_test_main  libinet_test_lib.so  [.] get_value@plt
     4.82%  inet_test_main  libinet_test_lib.so  [.] __tls_get_addr@plt
     0.23%  inet_test_main  [kernel.kallsyms]    [k] 0xffffffffa0165b75