C L2硬件预取器真的有用吗?

C L2硬件预取器真的有用吗?,c,performance,assembly,x86-64,avx,C,Performance,Assembly,X86 64,Avx,我在威士忌湖i7-8565U上,分析性能计数器和复制512 KiB数据的时间(比二级缓存大小大两倍),并对二级硬件预取器的工作产生了一些误解 在中,有MSR0x1A4的位0用于控制L2 HW预取器(1用于禁用) 考虑以下基准: memcopy.h: void *avx_memcpy_forward_lsls(void *restrict, const void *restrict, size_t); memcopy.S: avx_memcpy_forward_lsls: shr rd

我在 Whiskey Lake i7-8565U 上分析性能计数器,并测量复制 512 KiB 数据(L2 缓存大小的两倍)所需的时间,结果对 L2 硬件预取器的工作方式产生了一些困惑

在 Intel 手册第 4 卷(MSR 列表)中,MSR
0x1A4
的位 0 用于控制 L2 硬件预取器(置 1 表示禁用)


考虑以下基准:

memcopy.h

/*
 * Copy a buffer front-to-back with 32-byte AVX loads and stores.
 * Arguments are (destination, source, size in bytes); the routine itself is
 * written in assembly in memcopy.S.
 *
 * The assembly uses aligned moves (vmovdqa) and advances 64 bytes per loop
 * pass, so both pointers must be 32-byte aligned and the size is expected to
 * be a non-zero multiple of 64.
 *
 * NOTE(review): the assembly body never writes rax, so the returned pointer
 * looks unspecified -- confirm before relying on the return value.
 */
void *avx_memcpy_forward_lsls(void *restrict, const void *restrict, size_t);
memcopy.S

/*
 * avx_memcpy_forward_lsls(dest, src, size)
 *
 * Forward copy in load/store/load/store order: each loop pass moves 64 bytes
 * as two 32-byte aligned AVX transfers (ymm0, then ymm1).
 *
 * Presumably called with the System V AMD64 convention (rdi = dest,
 * rsi = src, rdx = size in bytes) -- confirm against the build target.
 *
 * The loop tests its bound only after the body has run, so at least 64 bytes
 * are copied even when size is 0, and a size that is not a multiple of 64 is
 * effectively rounded up to the next 64-byte chunk.
 *
 * NOTE(review): rax is never written although the C prototype returns
 * void *, and there is no vzeroupper before ret -- confirm both are intended.
 */
avx_memcpy_forward_lsls:
    shr rdx, 0x3                            /* byte count -> number of 8-byte units */
    xor rcx, rcx                            /* rcx = running offset, in 8-byte units */
avx_memcpy_forward_loop_lsls:
    vmovdqa ymm0, [rsi + 8*rcx]             /* load  bytes  0..31 of the current chunk */
    vmovdqa [rdi + rcx*8], ymm0             /* store bytes  0..31 */
    vmovdqa ymm1, [rsi + 8*rcx + 0x20]      /* load  bytes 32..63 */
    vmovdqa [rdi + rcx*8 + 0x20], ymm1      /* store bytes 32..63 */
    add rcx, 0x08                           /* advance 8 units = 64 bytes */
    cmp rdx, rcx
    ja avx_memcpy_forward_loop_lsls         /* unsigned: keep going while rdx > rcx */
    ret
main.c

#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include "memcopy.h"

/* Outer-loop count passed to every benchmark run. */
#define ITERATIONS 1000
/*
 * Size in bytes of each copy buffer (512 KiB). The expansion is parenthesized
 * so the macro keeps its value inside larger expressions such as
 * `total / BUF_SIZE`.
 */
#define BUF_SIZE (512 * 1024)

/*
 * Source and destination buffers for the copy benchmark. They are 64-byte
 * aligned, which also satisfies the 32-byte alignment that the aligned AVX
 * moves in the copy routine require.
 */
_Alignas(64) char src[BUF_SIZE];
_Alignas(64) char dest[BUF_SIZE];

/*
 * Forward declaration; the definition appears further down in this file.
 * Executes `runs` benchmark runs of `fn` over (dest, src, sz), each run
 * using `run_iterations` outer iterations, and prints a line after each run.
 *
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation in C -- consider renaming.
 */
static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz);

/*
 * Print "Benchmarking <fn>" (the fn argument is turned into text with #fn)
 * and then pass every argument unchanged to __run_benchmark().
 * The do/while(0) wrapper makes the expansion behave as a single statement.
 */
#define run_benchmark(runs, run_iterations, fn, dest, src, sz) \
    do{\
        printf("Benchmarking " #fn "\n");\
        __run_benchmark(runs, run_iterations, fn, dest, src, sz);\
    }while(0)

/*
 * Fill the source buffer with random bytes from /dev/urandom, then benchmark
 * avx_memcpy_forward_lsls with 20 runs of ITERATIONS iterations each.
 *
 * Returns EXIT_FAILURE if the random source cannot be opened or fully read,
 * so the benchmark never runs on a partially initialized buffer.
 */
int main(void){
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0){
        perror("open /dev/urandom");
        return EXIT_FAILURE;
    }

    /* read() may return fewer bytes than requested, so loop until full. */
    size_t filled = 0;
    while(filled < sizeof src){
        ssize_t got = read(fd, src + filled, sizeof src - filled);
        if(got <= 0){
            if(got < 0){
                perror("read /dev/urandom");
            }else{
                fprintf(stderr, "read /dev/urandom: unexpected end of file\n");
            }
            close(fd);
            return EXIT_FAILURE;
        }
        filled += (size_t)got;
    }
    close(fd);

    run_benchmark(20, ITERATIONS, avx_memcpy_forward_lsls, dest, src, BUF_SIZE);
    return 0;
}

/*
 * Invoke the copy routine `fn(dest, src, sz)` 32 times for every outer
 * iteration, i.e. 32 * iterations calls in total. Nothing is called when
 * `iterations` is 0. The return value of `fn` is ignored.
 */
static inline void benchmark_copy_function(unsigned iterations, void *(*fn)(void *, const void *, size_t),
                                               void *restrict dest, const void *restrict src, size_t sz){
    enum { CALLS_PER_ITERATION = 32 };

    for(unsigned remaining = iterations; remaining > 0; remaining--){
        for(unsigned call = 0; call < CALLS_PER_ITERATION; call++){
            fn(dest, src, sz);
        }
    }
}

/*
 * Run the copy benchmark `runs` times. Each run hands `run_iterations`, `fn`
 * and the buffers to benchmark_copy_function() and then prints
 * "Run <k> finished", with k counting from 1.
 *
 * The counter starts at 0 and is compared with `<`, so the loop terminates
 * for every value of `runs`, including UINT_MAX. The run number is printed
 * with %u because it is unsigned.
 */
static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz){
    for(unsigned completed = 0; completed < runs; completed++){
        benchmark_copy_function(run_iterations, fn, dest, src, sz);
        printf("Run %u finished\n", completed + 1);
    }
}
Run:

$ taskset -c 0 sudo ../profile.sh ./bin 

 Performance counter stats for './bin':

    10 486 164 071      L1-dcache-loads                                               (12,13%)
    10 461 354 384      L1-dcache-load-misses     #   99,76% of all L1-dcache hits    (12,05%)
    10 481 930 413      L1-dcache-stores                                              (12,05%)
    10 461 136 686      l1d.replacement                                               (12,12%)
    31 466 394 422      l1d_pend_miss.fb_full                                         (12,11%)
   211 853 643 294      l1d_pend_miss.pending                                         (12,09%)
     1 759 204 317      LLC-loads                                                     (12,16%)
            31 007      LLC-load-misses           #    0,00% of all LL-cache hits     (12,16%)
     3 154 901 630      LLC-stores                                                    (6,19%)
    15 867 315 545      l2_rqsts.all_pf                                               (9,22%)
                 0      sw_prefetch_access.t1_t2                                      (12,22%)
         1 393 306      l2_lines_out.useless_hwpf                                     (12,16%)
     3 549 170 919      l2_rqsts.pf_hit                                               (12,09%)
    12 356 247 643      l2_rqsts.pf_miss                                              (12,06%)
                 0      load_hit_pre.sw_pf                                            (12,09%)
     3 159 712 695      l2_rqsts.rfo_hit                                              (12,06%)
     1 207 642 335      l2_rqsts.rfo_miss                                             (12,02%)
     4 366 526 618      l2_rqsts.all_rfo                                              (12,06%)
     5 240 013 774      offcore_requests.all_data_rd                                     (12,06%)
    19 936 657 118      offcore_requests.all_requests                                     (12,09%)
     1 761 660 763      offcore_response.demand_data_rd.any_response                                     (12,12%)
       287 044 397      bus-cycles                                                    (12,15%)
    36 816 767 779      resource_stalls.any                                           (12,15%)
    36 553 997 653      resource_stalls.sb                                            (12,15%)
    38 035 066 210      uops_retired.stall_cycles                                     (12,12%)
    24 766 225 119      uops_executed.stall_cycles                                     (12,09%)
    40 478 455 041      uops_issued.stall_cycles                                      (12,05%)
    24 497 256 548      cycle_activity.stalls_l1d_miss                                     (12,02%)
    12 611 038 018      cycle_activity.stalls_l2_miss                                     (12,09%)
        10 228 869      cycle_activity.stalls_l3_miss                                     (12,12%)
    24 707 614 483      cycle_activity.stalls_mem_any                                     (12,22%)
    24 776 110 104      cycle_activity.stalls_total                                     (12,22%)
    48 914 478 241      cycles                                                        (12,19%)

      12,155774555 seconds time elapsed

      11,984577000 seconds user
       0,015984000 seconds sys
II.

MSR:

$ sudo rdmsr -p 0 0x1A4
0
$ sudo rdmsr -p 0 0x1A4
1
Run:

$ taskset -c 0 sudo ../profile.sh ./bin

 Performance counter stats for './bin':

    10 508 027 832      L1-dcache-loads                                               (12,05%)
    10 463 643 206      L1-dcache-load-misses     #   99,58% of all L1-dcache hits    (12,09%)
    10 481 296 605      L1-dcache-stores                                              (12,12%)
    10 444 854 468      l1d.replacement                                               (12,15%)
    29 287 445 744      l1d_pend_miss.fb_full                                         (12,17%)
   205 569 630 707      l1d_pend_miss.pending                                         (12,17%)
     5 103 444 329      LLC-loads                                                     (12,17%)
            33 406      LLC-load-misses           #    0,00% of all LL-cache hits     (12,17%)
     9 567 917 742      LLC-stores                                                    (6,08%)
     1 157 237 980      l2_rqsts.all_pf                                               (9,12%)
                 0      sw_prefetch_access.t1_t2                                      (12,17%)
           301 471      l2_lines_out.useless_hwpf                                     (12,17%)
       218 528 985      l2_rqsts.pf_hit                                               (12,17%)
       938 735 722      l2_rqsts.pf_miss                                              (12,17%)
                 0      load_hit_pre.sw_pf                                            (12,17%)
         4 096 281      l2_rqsts.rfo_hit                                              (12,17%)
     4 972 640 931      l2_rqsts.rfo_miss                                             (12,17%)
     4 976 006 805      l2_rqsts.all_rfo                                              (12,17%)
     5 175 544 191      offcore_requests.all_data_rd                                     (12,17%)
    15 772 124 082      offcore_requests.all_requests                                     (12,17%)
     5 120 635 892      offcore_response.demand_data_rd.any_response                                     (12,17%)
       292 980 395      bus-cycles                                                    (12,17%)
    37 592 020 151      resource_stalls.any                                           (12,14%)
    37 317 091 982      resource_stalls.sb                                            (12,11%)
    38 121 826 730      uops_retired.stall_cycles                                     (12,08%)
    25 430 699 605      uops_executed.stall_cycles                                     (12,04%)
    41 416 190 037      uops_issued.stall_cycles                                      (12,04%)
    25 326 579 070      cycle_activity.stalls_l1d_miss                                     (12,04%)
    25 019 148 253      cycle_activity.stalls_l2_miss                                     (12,03%)
         7 384 770      cycle_activity.stalls_l3_miss                                     (12,03%)
    25 442 709 033      cycle_activity.stalls_mem_any                                     (12,03%)
    25 406 897 956      cycle_activity.stalls_total                                     (12,03%)
    49 877 044 086      cycles                                                        (12,03%)

      12,231406658 seconds time elapsed

      12,226386000 seconds user
       0,004000000 seconds sys

我注意到这个计数器:

12 611 038 018 cycle_activity.stalls_l2_miss
v/s
25 019 148 253 cycle_activity.stalls_l2_miss

这表明通过 MSR 禁用 L2 硬件预取器的设置确实生效了。此外,与 L2/LLC 相关的其他计数器也存在显著差异,而且这些差异在多次运行中都可以重现。问题在于
总时间和周期数几乎没有区别:

48 914 478 241 cycles
v/s
49 877 044 086 cycles

12,155774555 seconds time elapsed
v/s
12,231406658 seconds time elapsed

问题:
L2 未命中的开销是否被其他性能瓶颈掩盖了?
如果是,您能建议查看哪些计数器来弄清这一点吗?

是的,L2 streamer(流式预取器)在很多时候都非常有用

memcpy 没有任何计算延迟需要隐藏,因此我猜它可以依靠乱序执行资源(ROB 容量)来消化 L2 未命中增多带来的额外加载延迟——至少在本例中如此:工作集大小中等(1 MiB),能放进 L3,因此所有访问都在 L3 命中,不需要预取就能实现 L3 命中

而且仅有的指令就是加载/存储(以及循环开销),因此乱序执行窗口中包含了相当靠前的按需加载(demand load)

我不确定 L2 空间预取器和 L1d 预取器在这里是否有帮助


可以用一个预测来检验这一假设:把数组变大,使访问出现 L3 未命中;一旦乱序执行不足以隐藏一直访问到 DRAM 的加载延迟,您很可能就会看到总体时间上的差异。此时,硬件预取在更靠前的位置提前触发会有一定帮助

硬件预取的另一大好处是,它可以跟上您的计算速度,因此您能获得 L2 命中。(指循环中的计算具有中等长度、但非循环携带的依赖链的情况。)

当 ROB 容量没有其他压力时,仅靠按需加载(demand load)和乱序执行,就能在很大程度上用满可用的(单线程)内存带宽


还请注意,在 Intel CPU 上,每次缓存未命中都可能导致依赖它的 uop 在后端(从 RS/调度器)被重放:在预期数据到达的时刻,L1d 未命中和 L2 未命中各重放一次。此后,核心在等待数据从 L3 到达期间,似乎会乐观地反复发射这些 uop

(见和)


这里说的并不是缓存未命中的加载本身;在本例中是存储指令,更具体地说,是走端口 4 的 store-data uop。不过这在这里无关紧要:使用 32 字节存储且瓶颈在 L3 带宽上,意味着我们远未达到每周期 1 个端口 4 uop。

是的,L2 硬件预取器非常有用

例如,请看我在自己的机器(i7-6700HQ)上运行得到的以下结果。第一列是所有预取器都开启时的结果,第二列是关闭 L2 streamer(其他预取器仍保持开启)时的结果

此测试使用 32 MiB 的源缓冲区和目标缓冲区,它们比我机器上的 L3 大得多,因此主要测试的是一直未命中到 DRAM 的情况

==========================================================================
== Memory bandwidth tests                                               ==
==                                                                      ==
== Note 1: 1MB = 1000000 bytes                                          ==
== Note 2: Results for 'copy' tests show how many bytes can be          ==
==         copied per second (adding together read and writen           ==
==         bytes would have provided twice higher numbers)              ==
== Note 3: 2-pass copy means that we are using a small temporary buffer ==
==         to first fetch data into it, and only then write it to the   ==
==         destination (source -> L1 cache, L1 cache -> destination)    ==
== Note 4: If sample standard deviation exceeds 0.1%, it is shown in    ==
==         brackets                                                     ==
==========================================================================

                                                       L2 streamer ON            OFF
 C copy backwards                                     :   7962.4 MB/s    4430.5 MB/s
 C copy backwards (32 byte blocks)                    :   7993.5 MB/s    4467.0 MB/s
 C copy backwards (64 byte blocks)                    :   7989.9 MB/s    4438.0 MB/s
 C copy                                               :   8503.1 MB/s    4466.6 MB/s
 C copy prefetched (32 bytes step)                    :   8729.2 MB/s    4958.4 MB/s
 C copy prefetched (64 bytes step)                    :   8730.7 MB/s    4958.4 MB/s
 C 2-pass copy                                        :   6171.2 MB/s    3368.7 MB/s
 C 2-pass copy prefetched (32 bytes step)             :   6193.1 MB/s    4104.2 MB/s
 C 2-pass copy prefetched (64 bytes step)             :   6198.8 MB/s    4101.6 MB/s
 C fill                                               :  13372.4 MB/s   10610.5 MB/s
 C fill (shuffle within 16 byte blocks)               :  13379.4 MB/s   10547.5 MB/s
 C fill (shuffle within 32 byte blocks)               :  13365.8 MB/s   10636.9 MB/s
 C fill (shuffle within 64 byte blocks)               :  13588.7 MB/s   10588.3 MB/s
 -
 standard memcpy                                      :  11550.7 MB/s    8216.3 MB/s
 standard memset                                      :  23188.7 MB/s   22686.8 MB/s
 -
 MOVSB copy                                           :   9458.4 MB/s    6523.7 MB/s
 MOVSD copy                                           :   9474.5 MB/s    6510.7 MB/s
 STOSB fill                                           :  23329.0 MB/s   22901.5 MB/s
 SSE2 copy                                            :   9073.1 MB/s    4970.3 MB/s
 SSE2 nontemporal copy                                :  12647.1 MB/s    7492.5 MB/s
 SSE2 copy prefetched (32 bytes step)                 :   9106.0 MB/s    5069.8 MB/s
 SSE2 copy prefetched (64 bytes step)                 :   9113.5 MB/s    5063.1 MB/s
 SSE2 nontemporal copy prefetched (32 bytes step)     :  11770.8 MB/s    7453.4 MB/s
 SSE2 nontemporal copy prefetched (64 bytes step)     :  11937.1 MB/s    7712.1 MB/s
 SSE2 2-pass copy                                     :   7092.8 MB/s    4355.2 MB/s
 SSE2 2-pass copy prefetched (32 bytes step)          :   7001.4 MB/s    4585.1 MB/s
 SSE2 2-pass copy prefetched (64 bytes step)          :   7055.1 MB/s    4557.9 MB/s
 SSE2 2-pass nontemporal copy                         :   5043.2 MB/s    3263.3 MB/s
 SSE2 fill                                            :  14087.3 MB/s   10947.1 MB/s
 SSE2 nontemporal fill                                :  33134.5 MB/s   32774.3 MB/s
在这些测试中,开启 L2 streamer 从未导致变慢,而且通常接近快一倍

通常,您可能会在结果中注意到以下模式:

  • 复制(copy)通常比填充(fill)受影响更大
  • 标准 memset 和 STOSB fill(在这个平台上本质上是同一回事)受影响最小,开启预取的结果只比关闭时快几个百分点
  • 标准 memcpy 可能是这里唯一使用 32 字节 AVX 指令的复制实现,它是受影响最小的复制之一,但开启预取仍比关闭时快约 40%

我还试着打开和关闭其他三个预取器,但它们对这个基准测试几乎没有可测量的影响。

根据经验法则:任何未实现的内存拷贝都是内存受限的,即使它只命中一级缓存。任何内存访问的开销都比 CPU 计算“二加二”所需的开销高得多。在您的情况下,甚至使用了 AVX 指令来减少每个复制字节所需的指令数。无论数据位于何处(L1、L2、LLC、内存),相应存储层级的吞吐量都将是您的瓶颈。@St.Antario:嗯?这说不通;您的程序是内存受限的(memory bound),因此不存在前端瓶颈,所以 LSD 与此无关。(LSD 的作用只是避免从 uop 缓存中重新取指,从而节省一些功耗。)在退役(retire)之前,它们仍然占用 ROB 中的条目。它们没有那么重要,但也不可忽略。把数组变大,使访问出现 L3 未命中,您很可能就会看到差异。我使用
16MiB
缓冲区和
10
迭代运行了大量测试,确实获得了
141868883秒
vs
43731360909秒
46,76% of all LL-cache hits
vs
99,32% of all LL-cache hits
1 028 664 372 LLC-loads
vs
1 587 454 298 LLC-loads
@St.Antario:通过寄存器重命名!这是乱序执行最关键的部分之一,尤其是在寄存器紧缺的 ISA(如 x86)上。参见相关链接。顺便说一句,通常应该先做 2 次加载、再做 2 次存储,而不是按“加载/存储、加载/存储”交替进行。这样更有机会避免……