Linux kernel 在x86内核中获取TSC速率_Linux Kernel_Tsc

Linux kernel 在x86内核中获取TSC速率

linux-kernel

Linux kernel 在x86内核中获取TSC速率,linux-kernel,tsc,Linux Kernel,Tsc,我有一个运行在Atom上的嵌入式Linux系统，这是一个足够新的CPU，有一个不变的TSC（时间戳计数器），内核在启动时测量其频率。我在自己的代码中使用TSC来保持时间（避免内核调用），我的启动代码测量TSC速率，但我宁愿只使用内核的测量值。有没有办法从内核中检索这个？它不在/proc/cpuinfo的任何地方。TSC速率与/proc/cpuinfo中的“cpu MHz”直接相关。实际上，更好的数字是“bogomips”。原因是，虽然TSC的频率是最大CPU频率，但在调用时，当前的“CPU Mh

我有一个运行在Atom上的嵌入式Linux系统，这是一个足够新的CPU，有一个不变的TSC（时间戳计数器），内核在启动时测量其频率。我在自己的代码中使用TSC来保持时间（避免内核调用），我的启动代码测量TSC速率，但我宁愿只使用内核的测量值。有没有办法从内核中检索这个？它不在/proc/cpuinfo的任何地方。

TSC速率与

/proc/cpuinfo

中的“cpu MHz”直接相关。实际上，更好的数字是“bogomips”。原因是，虽然TSC的频率是最大CPU频率，但在调用时，当前的“CPU Mhz”可能会有所不同

bogomips值在引导时计算。您需要根据核心数和处理器计数（即超线程数）来调整此值，这将为您提供[分数]MHz。这就是我用来做你想做的事

要获取处理器计数，请查找最后一行“processor:”。处理器计数为该行的值 +1。称之为“cpu_count”

要获得核心数量，任意一行“cpu cores:”都可以。核心数就是该行的值。称之为“core_count”

因此，公式是：

// Number of hardware threads per core: logical processor count divided by
// the core count (left as the processor count when core_count is 0, which
// also avoids a division by zero).
smt_count = cpu_count;
if (core_count)
    smt_count /= core_count;
// bogomips is divided by the per-core thread count to obtain the frequency.
// NOTE(review): scale_factor is not defined in this snippet -- presumably it
// converts the bogomips figure (MHz-like) to kHz; confirm against the real code.
cpu_freq_in_khz = (bogomips * scale_factor) / smt_count;

这是从我的实际代码中提取的，如下所示

这是我实际使用的代码。您无法直接使用它，因为它依赖于我的样板文件，但它应该会给您一些想法，特别是如何计算

// syslgx/tvtsc -- system time routines (RDTSC)

#include <tgb.h>
#include <zprt.h>

tgb_t systvinit_tgb[] = {
    { .tgb_val = 1, .tgb_tag = "cpu_mhz" },
    { .tgb_val = 2, .tgb_tag = "bogomips" },
    { .tgb_val = 3, .tgb_tag = "processor" },
    { .tgb_val = 4, .tgb_tag = "cpu_cores" },
    { .tgb_val = 5, .tgb_tag = "clflush_size" },
    { .tgb_val = 6, .tgb_tag = "cache_alignment" },
    TGBEOT
};

// _systvinit -- get CPU speed
// Parses /proc/cpuinfo line by line, collecting the highest "cpu MHz" and
// "bogomips" values (both converted to kHz by _systvinitkhz), the processor
// and core counts, and the cache flush/alignment sizes.  The counts and cache
// sizes are stored through SYS->sys_cpucnt; the resulting frequency is handed
// to systvkhz().  Failures are reported through sysfault().
// NOTE(review): the code after each sysfault() call assumes it does not
// return (e.g. xfsrc is used right after the NULL check) -- confirm.
// NOTE(review): _systvinitkhz is used here before its definition; presumably
// a prototype exists elsewhere in the real file -- confirm.
static void
_systvinit(void)
{
    const char *file;
    const char *dlm;
    XFIL *xfsrc;
    int matchflg;
    char *cp;
    char *cur;
    char *rhs;
    char lhs[1000];
    tgb_pc tgb;
    syskhz_t khzcpu;
    syskhz_t khzbogo;
    syskhz_t khzcur;
    sysmpi_p mpi;

    file = "/proc/cpuinfo";

    xfsrc = fopen(file,"r");
    if (xfsrc == NULL)
        sysfault("systvinit: unable to open '%s' -- %s\n",file,xstrerror());

    // delimiters used when extracting the value token
    dlm = " \t";

    // running maxima of the two candidate frequencies
    khzcpu = 0;
    khzbogo = 0;

    // NOTE(review): SYSZAPME presumably zeroes the structure -- confirm
    mpi = &SYS->sys_cpucnt;
    SYSZAPME(mpi);

    // (1) look for "cpu MHz : 3192.515" (preferred)
    // (2) look for "bogomips : 3192.51" (alternate)
    // FIXME/CAE -- on machines with speed-step, bogomips may be preferred (or
    // disable it)
    while (1) {
        // read the next line; stop at EOF/error
        cp = fgets(lhs,sizeof(lhs),xfsrc);
        if (cp == NULL)
            break;

        // strip newline
        cp = strchr(lhs,'\n');
        if (cp != NULL)
            *cp = 0;

        // look for symbol value divider
        // lines without a ':' carry no key/value pair and are skipped
        cp = strchr(lhs,':');
        if (cp == NULL)
            continue;

        // split symbol and value
        *cp = 0;
        rhs = cp + 1;

        // strip trailing whitespace from symbol
        for (cp -= 1;  cp >= lhs;  --cp) {
            if (! XCTWHITE(*cp))
                break;
            *cp = 0;
        }

        // convert "foo bar" into "foo_bar"
        // so the key can be compared against the tags in systvinit_tgb
        for (cp = lhs;  *cp != 0;  ++cp) {
            if (XCTWHITE(*cp))
                *cp = '_';
        }

        // match on interesting data
        // (case-insensitive; matchflg stays 0 when the key is not in the table)
        matchflg = 0;
        for (tgb = systvinit_tgb;  TGBMORE(tgb);  ++tgb) {
            if (strcasecmp(lhs,tgb->tgb_tag) == 0) {
                matchflg = tgb->tgb_val;
                break;
            }
        }
        if (! matchflg)
            continue;

        // look for the value
        // only the first blank/tab delimited token after the ':' is used
        cp = strtok_r(rhs,dlm,&cur);
        if (cp == NULL)
            continue;

        zprt(ZPXHOWSETUP,"_systvinit: GRAB/%d lhs='%s' cp='%s'\n",
            matchflg,lhs,cp);

        // process the value
        // NOTE: because of Intel's speed step, take the highest cpu speed
        switch (matchflg) {
        case 1:  // genuine CPU speed
            khzcur = _systvinitkhz(cp);
            if (khzcur > khzcpu)
                khzcpu = khzcur;
            break;

        case 2:  // the consolation prize
            khzcur = _systvinitkhz(cp);

            // we've seen some "wild" values
            // anything above 10000000 kHz is ignored
            if (khzcur > 10000000)
                break;

            if (khzcur > khzbogo)
                khzbogo = khzcur;
            break;

        case 3:  // remember # of cpu's so we can adjust bogomips
            // each "processor" line overwrites the previous one, so the
            // final count is the last index seen plus one
            mpi->mpi_cpucnt = atoi(cp);
            mpi->mpi_cpucnt += 1;
            break;

        case 4:  // remember # of cpu cores so we can adjust bogomips
            mpi->mpi_corecnt = atoi(cp);
            break;

        case 5:  // cache flush size
            mpi->mpi_cshflush = atoi(cp);
            break;

        case 6:  // cache alignment
            mpi->mpi_cshalign = atoi(cp);
            break;
        }
    }

    fclose(xfsrc);

    // we want to know the number of hyperthreads
    // (processors per core; left as the processor count if no core count
    // was found, which also avoids dividing by zero)
    mpi->mpi_smtcnt = mpi->mpi_cpucnt;
    if (mpi->mpi_corecnt)
        mpi->mpi_smtcnt /= mpi->mpi_corecnt;

    zprt(ZPXHOWSETUP,"_systvinit: FINAL khzcpu=%d khzbogo=%d mpi_cpucnt=%d mpi_corecnt=%d mpi_smtcnt=%d mpi_cshalign=%d mpi_cshflush=%d\n",
        khzcpu,khzbogo,mpi->mpi_cpucnt,mpi->mpi_corecnt,mpi->mpi_smtcnt,
        mpi->mpi_cshalign,mpi->mpi_cshflush);

    // both cache parameters must have been found in /proc/cpuinfo
    if ((mpi->mpi_cshalign == 0) || (mpi->mpi_cshflush == 0))
        sysfault("_systvinit: cache parameter fault\n");

    // single-pass block: "break" leaves it as soon as a usable speed is known
    do {
        // use the best reference
        // FIXME/CAE -- with speed step, bogomips is better
        // (the "cpu MHz" preference below is compiled out, so the bogomips
        // derived value always replaces khzcpu)
#if 0
        if (khzcpu != 0)
            break;
#endif

        // bogomips divided by the per-core thread count
        khzcpu = khzbogo;
        if (mpi->mpi_smtcnt)
            khzcpu /= mpi->mpi_smtcnt;
        if (khzcpu != 0)
            break;

        sysfault("_systvinit: unable to obtain cpu speed\n");
    } while (0);

    // publish the result
    systvkhz(khzcpu);

    zprt(ZPXHOWSETUP,"_systvinit: EXIT\n");
}

// _systvinitkhz -- decode value
// Converts a decimal string such as "3192.515" (MHz with a fractional part)
// into an integer number of kHz without using floating point: the digits
// before and after the dot are concatenated, converted with atol(), and then
// scaled so that exactly three fractional digits are represented.
//
// A string without a '.' is treated as having no fractional digits, and
// input longer than the internal buffer is truncated instead of overflowing.
//
// str -- NUL terminated value token (not modified)
// RETURNS: CPU freq in khz
static syskhz_t
_systvinitkhz(char *str)
{
    char *src;
    char *dst;
    char *end;
    int rhscnt;
    char bf[100];
    syskhz_t khz;

    zprt(ZPXHOWSETUP,"_systvinitkhz: ENTER str='%s'\n",str);

    dst = bf;
    end = bf + sizeof(bf) - 1;  // last slot is reserved for the terminator
    src = str;

    // get lhs of lhs.rhs
    for (;  (*src != 0) && (*src != '.') && (dst < end);  ++src, ++dst)
        *dst = *src;

    // skip over the dot -- only if there is one, otherwise we would step
    // past the string terminator and read out of bounds
    if (*src == '.')
        ++src;

    // get rhs of lhs.rhs and determine how many rhs digits we have
    rhscnt = 0;
    for (;  (*src != 0) && (dst < end);  ++src, ++dst, ++rhscnt)
        *dst = *src;

    *dst = 0;

    khz = atol(bf);
    zprt(ZPXHOWSETUP,"_systvinitkhz: PRESCALE bf='%s' khz=%d rhscnt=%d\n",
        bf,khz,rhscnt);

    // scale down (e.g. we got xxxx.yyyy)
    for (;  rhscnt > 3;  --rhscnt)
        khz /= 10;

    // scale up (e.g. we got xxxx.yy--bogomips does this)
    for (;  rhscnt < 3;  ++rhscnt)
        khz *= 10;

    zprt(ZPXHOWSETUP,"_systvinitkhz: EXIT khz=%d\n",khz);

    return khz;
}

另一种方法是读取 /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq。该文件的内容是以kHz为单位的最大CPU频率。其他CPU核心也有类似的文件。对于大多数正常的主板来说，这些文件应该是相同的（例如，由相同型号的芯片组成的主板，不要试图混合使用i7和Atom）。否则，你就必须以每个核心为基础跟踪信息，这很快就会变得一团糟

给定的目录还有其他有趣的文件。例如，如果您的处理器有“速度步长”[其他一些文件可以告诉您]，您可以通过将

performance

写入

scaling_governor

文件来强制实现最大性能。这将禁用速度步进的使用

如果处理器没有 constant_tsc，那么必须禁用速度步进[并以最大速率运行内核]才能获得准确的测量值

我看了一下，似乎没有一种内置的方法可以直接从内核获取此信息

然而，tsc_khz（我猜这就是您想要的）是由内核导出的。您可以编写一个小型内核模块，公开一个sysfs接口，并使用它从用户空间读取

tsc_khz

的值

如果编写内核模块不可行，也许可以使用一些黑魔法™ 直接从内核内存空间读取该值。解析内核二进制文件或

System.map

文件，找到

tsc_khz

符号的位置并从中读取。当然，只有在内核配置了适当的选项的情况下，这才是可能的

最后，从阅读报告来看，TSC在某些平台上可能不稳定。我不太了解x86 arch的内部工作原理，但这可能是您需要考虑的因素。

BPFtrace 作为root用户，您可以使用bpftrace检索内核的TSC速率：

# bpftrace -e 'BEGIN { printf("%u\n", *kaddr("tsc_khz")); exit(); }' | tail -n 1

（在CentOS 7和Fedora 29上进行了测试）

这是在 arch/x86/kernel/tsc.c 中定义、导出并维护/校准的值

GDB 或者，也可以作为root用户，从

/proc/kcore

读取，例如：

# gdb /dev/null /proc/kcore -ex 'x/uw 0x'$(grep '\<tsc_khz\>' /proc/kallsyms \
    | cut -d' ' -f1) -batch 2>/dev/null | tail -n 1 | cut -f2

当然，您也可以编写一个小型内核模块，通过

/sys

伪文件系统提供对

tsc_khz

的访问。更妙的是，已经有人这么做了：有一个现成的 tsc_freq_khz 模块可用。有了它，以下命令应该可以工作：

# modprobe tsc_freq_khz
$ cat /sys/devices/system/cpu/cpu0/tsc_freq_khz

（在Fedora29上测试，读取sysfs文件不需要root）

内核消息 如果以上方法都不可行，您可以从内核日志中解析TSC速率。但这很快就会变得难看，因为在不同的硬件和内核上会看到不同类型的消息，例如在Fedora 29 i7系统上：

$ journalctl --boot | grep 'kernel: tsc:' -i | cut -d' ' -f5-
kernel: tsc: Detected 2800.000 MHz processor
kernel: tsc: Detected 2808.000 MHz TSC

但在Fedora 29 Intel Atom上：

kernel: tsc: Detected 2200.000 MHz processor

在CentOS 7 i5系统上时：

kernel: tsc: Fast TSC calibration using PIT
kernel: tsc: Detected 1895.542 MHz processor
kernel: tsc: Refined TSC clocksource calibration: 1895.614 MHz

性能值 Linux内核还没有提供读取TSC速率的API。但它确实提供了一个用于获取可用于将TSC计数转换为纳秒的

mult

和

shift

值的方法。这些值是从

tsc_khz

-中导出的，其中

tsc_khz

被初始化和校准。它们与用户空间共享

使用perf API并访问共享页面的示例程序：

#include <asm/unistd.h>
#include <inttypes.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Invoke the perf_event_open system call through syscall(), forwarding all
 * five arguments unchanged.
 *
 * Returns whatever syscall() returns; the caller in this example treats -1
 * as failure.
 */
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
           int cpu, int group_fd, unsigned long flags)
{
    long ret;

    ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);

    return ret;
}

在Fedora29上进行了测试，它也适用于非root用户

这些值可用于使用如下函数将TSC计数转换为纳秒：

/*
 * Compute (cyc * mult) >> shift using a 128-bit intermediate so the product
 * cannot overflow, then truncate the result to 64 bits.
 */
static uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mult, uint32_t shift)
{
    return (uint64_t)(((__uint128_t)cyc * mult) >> shift);
}

如果处理器有速度步进且没有“恒定”TSC，则TSC随当前CPU时钟速度而变化。所有现代x86处理器都有 constant_tsc（正如OP提到的那样）。顺便说一句，必要的信息[如我的回答中所述]在

/proc/cpuinfo

中，我喜欢编写内核模块的想法，但由于我从未这样做过，我认为我倾向于相信bogomips/2值。在我的四核1.6GHz Atom上，TSC计数为1.6GHz，但bogomips表示为3.2GHz。在我的四核3.5GHz i7-4770K上，TSC计数为3.5GHz，但bogomips表示为7GHz。我添加了更多解释[以及代码]。试着调整一下。我已经使用这个代码段20年了，所以它是[通常：-）]正确的。您需要“cores”行和最后一个“processor”行值来计算SMT[HyperRead]计数。Bogomips基于超线程（例如，时钟频率为3和2超线程，bogo将为3*2或6），因此我们需要将bogo除以超线程计数以获得频率这是一个不错的理论，但我的四核原子不进行超线程，并且仍然显示Bogomips va

#include <asm/unistd.h>
#include <inttypes.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * perf_event_open -- issue the perf_event_open system call via syscall().
 * The arguments are passed through as-is and the raw syscall() result is
 * returned (main() below checks it against -1).
 */
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
           int cpu, int group_fd, unsigned long flags)
{
    const long result = syscall(__NR_perf_event_open, hw_event, pid, cpu,
                                group_fd, flags);
    return result;
}

/*
 * Open a (disabled) perf event for the calling thread, map its metadata page
 * and print the time_mult / time_shift values published there.
 *
 * Returns 0 on success and 1 if the event cannot be opened, the page cannot
 * be mapped, or the page reports that user-space time conversion is not
 * available (cap_user_time != 1).
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    struct perf_event_attr pe = {
        .type = PERF_TYPE_HARDWARE,
        .size = sizeof(struct perf_event_attr),
        .config = PERF_COUNT_HW_INSTRUCTIONS,
        .disabled = 1,
        .exclude_kernel = 1,
        .exclude_hv = 1
    };
    int fd = perf_event_open(&pe, 0, -1, -1, 0);
    if (fd == -1) {
        perror("perf_event_open failed");
        return 1;
    }

    /* Map exactly one page; fall back to 4 KiB if the page size is unknown. */
    long page_size = sysconf(_SC_PAGESIZE);
    if (page_size <= 0)
        page_size = 4*1024;

    void *addr = mmap(NULL, (size_t)page_size, PROT_READ, MAP_SHARED, fd, 0);
    /* mmap() reports failure with MAP_FAILED, never with a NULL pointer. */
    if (addr == MAP_FAILED) {
        perror("mmap failed");
        close(fd);
        return 1;
    }

    const struct perf_event_mmap_page *pc = addr;
    int rc = 0;
    if (pc->cap_user_time != 1) {
        fprintf(stderr, "Perf system doesn't support user time\n");
        rc = 1;
    } else {
        printf("%16s   %5s\n", "mult", "shift");
        printf("%16" PRIu32 "   %5" PRIu16 "\n", pc->time_mult, pc->time_shift);
    }

    /* Release the mapping and the event on every path. */
    munmap(addr, (size_t)page_size);
    close(fd);
    return rc;
}

/*
 * Multiply a 64-bit cycle count by a 32-bit multiplier and shift the result
 * right by `shift` bits.  The multiplication is carried out in 128 bits so
 * no high bits are lost before the shift; the return value is the low 64
 * bits of the shifted product.
 */
static uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mult, uint32_t shift)
{
    const __uint128_t product = (__uint128_t)cyc * mult;

    return (uint64_t)(product >> shift);
}