C 程序完成时分支预测器条目是否失效？_C_Performance_X86_Branch Prediction_Spectre

C 程序完成时分支预测器条目是否失效？

c performance x86

C 程序完成时分支预测器条目是否失效？,c,performance,x86,branch-prediction,spectre,C,Performance,X86,Branch Prediction,Spectre,我试图了解分支预测器条目何时无效以下是我做的实验：代码1： start_measure_branch_mispred() while(X times): if(something something): do_useless() endif endwhile end_measurement() store_difference() 因此，我多次运行这段代码。我可以看到，在第一次运行之后，预测失误率会降低。分支预测器学习如何正确预测。但是，如果我一次又一次地运行这个实验（即通过向终端

我试图了解分支预测器条目何时无效

以下是我做的实验：

代码1：

start_measure_branch_mispred()
while(X times):
 if(something something):
  do_useless()
 endif
endwhile
end_measurement()
store_difference()

因此，我多次运行这段代码。我可以看到，在第一次运行之后，预测失误率会降低。分支预测器学习如何正确预测。但是，如果我一次又一次地运行这个实验（即通过向终端写入

./experiment

），那么所有的第一次迭代都是从高预测失误率开始的。因此，在每次执行时，那些

条件分支的分支预测单元将失效。我正在使用nokaslr
，并且禁用了ASLR
。我也在一个孤立的核上做了这个实验。我已经做了几次这个实验，以确保这是行为（也就是说，不是因为噪音）
我的问题是：在程序停止执行后，CPU会使分支预测单元失效吗？或者这是什么原因
我做的第二个实验是：
代码2：
do:
    start_measure_branch_mispred()
    while(X times):
      if(something something):
        do_useless()
      endif
    endwhile
    end_measurement()
    store_difference()
while(cpu core == 1)

在这个实验中，我从两个不同的终端运行不同的进程。第一个被固定在核心1
上，这样它将在核心1上运行，它将做这个实验，直到我停止它（杀死它）。然后，我从另一个终端运行第二个进程，并将该进程固定到不同的内核上。由于此进程位于不同的核心中，因此它将只执行do while循环1次。如果第二个进程被固定到第一个进程的同级核心（相同的物理核心），我发现在第一次迭代中，第二个进程的猜测几乎是正确的。如果我将第二个进程固定到另一个不是第一个进程兄弟的内核上，那么第二个进程的第一次迭代会产生更高的预测失误。这是预期的结果，因为相同物理核上的虚拟核共享相同的分支预测单元（这是我的假设）。因此，第二个过程有利于经过训练的分支预测单元，因为它们具有相同的虚拟地址并映射到相同的分支预测单元条目
据我所知，由于CPU尚未结束第一个进程（即在核心1上执行忙循环的进程），分支预测条目仍然保留，第二个进程可以从中受益。但是，在第一个实验中，从一次运行到下一次运行，我得到了更高的预测失误
编辑：应其他用户的要求，代码如下。您需要下载性能事件（performance events）的头文件 linux-perf-events.h
编译：$(CXX) -std=c++11 -O0 main.cpp -lpthread -o experiment
代码：
#include "linux-perf-events.h"

#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

// some array
int arr8[8] = {1,1,0,0,0,1,0,1};

/**
 * Pin the calling thread to a single logical CPU.
 *
 * @param core_id Zero-based index of the logical CPU to run on.
 * @return 0 on success; EINVAL when core_id is outside
 *         [0, number of online processors); otherwise the error number
 *         returned by pthread_setaffinity_np().
 */
int pin_thread_to_core(int core_id){
    const long num_cores = sysconf(_SC_NPROCESSORS_ONLN);

    // Return immediately on a bad index: falling through would let the
    // affinity call below overwrite the EINVAL result.
    if (core_id < 0 || core_id >= num_cores)
        return EINVAL;

    // Build a CPU mask that contains only the requested core.
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);

    return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}

/**
 * Count branch mispredictions over repeated runs of a small data-dependent loop.
 *
 * Pins the calling thread to `cpuid`, then runs 10 trials. Each trial walks
 * `howmany` down to zero, branching on arr8[howmany & 0x7], with the
 * PERF_COUNT_HW_BRANCH_MISSES counter active between start() and end().
 * When `cpuid` is 5 the 10-trial batch is repeated forever (busy-loop core).
 *
 * @param cpuid         Logical CPU to pin to and to count events on.
 * @param howmany       Number of loop iterations per trial.
 * @param branch_misses Output array with room for at least 11 ints:
 *                      [0..9] receive the per-trial miss counts, [10]
 *                      receives a dummy value that keeps the work observable.
 *
 * On an affinity failure an error line is printed and the function returns
 * without writing to `branch_misses`.
 */
void measurement(int cpuid, uint64_t howmany, int* branch_misses){
    const size_t kTrials = 10;
    const int kBusyLoopCore = 5; // the core that does busy loop

    const int retval = pin_thread_to_core(cpuid);
    if(retval){
        // pthread functions report failure through their return value and do
        // not set errno, so the returned code is what must be decoded.
        printf("Affinity error: %s\n", strerror(retval));
        return;
    }

    std::vector<int> evts;
    evts.push_back(PERF_COUNT_HW_BRANCH_MISSES); // You might have a different performance event!

    LinuxEvents<PERF_TYPE_HARDWARE> unified(evts, cpuid); // You need to change the constructor in the performance counter so that it will count the events in the given cpuid

    // Zero-initialised scratch area owned by the vector (released on every
    // exit path). The timed loop still writes through a raw pointer.
    std::vector<uint64_t> storage(howmany + 1);
    uint64_t *buffer = storage.data();
    uint64_t * const buffer_org = buffer;  // for restoring
    const uint64_t howmany_org = howmany;  // for restoring

    std::vector<unsigned long long> results(evts.size());

    do{
        for(size_t trial = 0; trial < kTrials; trial++) {

            unified.start();
            // the while loop will be executed howmany times
            int res;
            while(howmany){
                res = arr8[howmany & 0x7]; // do the sequence howmany/8 times
                if(res){
                    *buffer++ = res;
                }
                howmany--;
            }
            unified.end(results);
            // store misses
            branch_misses[trial] = static_cast<int>(results[0]);
            // restore for next iteration
            buffer = buffer_org;
            howmany = howmany_org;
        }
    }while(cpuid == kBusyLoopCore);

    // get rid of optimization: fold a value written by the loop into the
    // output. Guard the index so short runs never read past the buffer.
    const uint64_t sink = storage.size() > 3 ? storage[3] : 0;
    howmany = (howmany + 1) * sink;
    branch_misses[10] = static_cast<int>(howmany); // last entry is reserved for this dummy operation
}
// Writes the one-line command-line help to stdout.
void usage(){
    static const char kHelp[] = "Run with ./experiment X \t where X is the core number\n";
    fputs(kHelp, stdout);
}
/**
 * Entry point: ./experiment X, where X is the logical core to run on.
 *
 * Runs measurement() on the requested core and prints the branch-miss count
 * of each of the 10 trials.
 *
 * @return 0 after printing the table; 1 when the core argument is missing
 *         or is not a non-negative decimal integer.
 */
int main(int argc, char *argv[]) {
    if(argc < 2){
        usage();
        return 1;
    }

    // Parse strictly: reject empty, non-numeric, negative or oversized input
    // rather than silently falling back to core 0.
    char *end = nullptr;
    const long parsed = strtol(argv[1], &end, 10);
    if(end == argv[1] || *end != '\0' || parsed < 0 || parsed > INT_MAX){
        usage();
        return 1;
    }
    const int cpuid = static_cast<int>(parsed);

    const int exp = 16; // howmany: loop iterations per trial

    // Zero-filled so the table below is well-defined even when measurement()
    // returns early without writing any counts.
    int results[11] = {};

    measurement(cpuid, exp, results);

    printf("%d measurements\n", exp);

    printf("Trial\t\t\tBranchMiss\n");
    for (size_t trial = 0; trial < 10; trial++)
    {
        printf("%zu\t\t\t%d\n", trial, results[trial]);
    }
    return 0;
}

#包括“linux perf events.h”
#包括
#包括
#包括
#包括
#包括
#包括
//一些数组
int arr8[8]={1,1,0,0,0,1,0,1}；
int引脚_线程_到_内核（int内核_id）{
内部检索；
int num_cores=sysconf（_SC_NPROCESSORS_ONLN）；
if（核心id<0 | |核心id>=num_核心）
retval=EINVAL；
cpu\u设置\u t cpuset；
CPU_零（&cpuset）；
CPU设置（核心id和cpuset）；
retval=pthread_setaffinity_np（pthread_self（）、sizeof（cpu_set_t）和cpuset）；
返回返回；
}
无效度量（int cpuid，uint64_t多少，int*分支未命中）{
int retval=针螺纹到针芯（cpuid）；
如果（返回）{
printf（“关联错误：%s\n”，strerror（errno））；
返回；
}
std：：向量evts；
evts.push_back（PERF_COUNT_HW_BRANCH_MISSES）；//您可能有不同的性能事件！
LinuxEvents统一（evts，cpuid）；//您需要更改性能计数器中的构造函数，以便它对给定cpuid中的事件进行计数
uint64_t*buffer=新的uint64_t[数量+1]；
uint64\u t*buffer\u org；//用于恢复
缓冲区组织=缓冲区；
uint64\u t howmany\u org=howmany；//用于恢复
std：：矢量结果；
results.resize（evts.size（））；
做{
用于（尺寸试验=0；试验<10；试验++）{
unified.start（）；
//while循环将在内部循环时间内执行
国际关系；
虽然（有多少人）{
res=arr8[howmany&0x7]；//按顺序执行多少次/8次
如果（res）{
*缓冲区+++=res；
}       
有多少——；
}
统一。结束（结果）；
//商店失窃
分支_未命中[试验]=结果[0]；
//为下一次迭代恢复
缓冲区=缓冲区组织；
howmany=组织的数量；
}
}while（cpuid==5）；//执行忙循环的核心
//摆脱优化
多少=（多少+1）*缓冲区[3]；
branch_misses[10]=howmount；//最后一个条目是为这个伪操作保留的
删除[]缓冲区；
}
无效用法（）{
printf（“与./experiment X\t一起运行，其中X是核心编号\n”）；
}
int main（int argc，char*argv[]）{
//因为我已经隔离了第11个核心，所以将affinity设置为
如果（argc==1）{
用法（）；
返回1；
}
int exp=16；//有多少个
int结果[11]；
int cpuid=atoi（argv[1]）；
测量（cpuid、实验、结果）；
printf（“%d个测量值\n”，exp）；
printf（“试用\t\t\t\n”）；
用于（尺寸试验=0；试验<10；试验++）
{
printf（“%zu\t\t\t%d\n”，试验，结果[试验]；
}
返回0；
}

如果您想尝试第一个实验，只需运行两次 ./experiment 1 。每次运行的执行过程与代码1相同
如果要尝试第二个实验，请打开两个终端，在第一个终端中运行 ./experiment X ，在第二个终端中运行 ./experiment Y ，其中X和Y是cpuid
请注意，您可能没有相同的性能事件计数器。另外，请注意，您可能需要更改忙循环（busy loop）中的cpuid
CPU是否在t之后使分支预测单元失效
int main(int arg){ // arg is the iteration
   pin_thread_to_isolated_core()
   for i=0 to arg:
     measurement()
     std::this_thread::sleep_for(std::chrono::milliseconds(1)); // I put this as it is
   endfor
   printresults() // print after all measurements are completed
}

void measurement(){
   initialization()
   for i=0 to 10:
      start_measurement()
      while(X times) // for the results below, X is 32
        a = arr8[an element] //sequence of 8,
        if(a is odd)
           do_sth()
        endif
      endwhile
      end_measurement()
      store_difference()
   endfor
}

Trial           BranchMiss
RUN:1
    0           16
    1           28
    2           3
    3           1
    ....  continues as 1
RUN:2
    0           16   // CPU forgets the sequence
    1           30
    2           2
    3           1
    ....  continues as 1
RUN:3
    0           16
    1           27
    2           4
    3           1
    ....  continues as 1

RUN:2
    0           1   
    1           1
    ....  continues as 1
RUN:3
    0           1
    1           1
    ....  continues as 1

// Revised measurement() (excerpt; "..." marks code omitted from the snippet).
// Each of the 4 trials counts down a fresh local copy (tmp) of howmany, so
// howmany itself is not modified by the timed loop.
void measurement(int cpuid, uint64_t howmany, int* branch_misses) {
    ...
        for(size_t trial = 0; trial < 4; trial++) {

            // Counting is active between start() and end().
            unified.start();
            int res;
            for(uint64_t tmp = howmany; tmp; tmp--) {
                res = arr8[tmp & 0x7]; // low 3 bits of tmp select one of 8 table entries
                if(res){ // data-dependent branch: taken only for non-zero entries
                    *buffer++ = res;
                }
            }
            unified.end(results);
            // NOTE(review): buffer is advanced above; presumably it is reset
            // in the omitted code before the next trial -- confirm.
            ...
        }
    ...
}

// Revised main() (excerpt; "..." marks code omitted from the snippet).
// Calls measurement() three times within the same process, sleeping 1 ms
// after each call.
int main(int argc, char *argv[]) {
    ...
    for(int i = 0; i < 3; ++i) {
        measurement(cpuid, exp, results);
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    ...
}