如何在GEM5中使用ARM PMU？_Arm_Simulation_Cpu Registers_Arm64_Gem5

如何在GEM5中使用ARM PMU？

arm

如何在GEM5中使用ARM PMU？,arm,simulation,cpu-registers,arm64,gem5,Arm,Simulation,Cpu Registers,Arm64,Gem5,我在初始化gem5中的PMU时遇到问题，该系统使用 starter_fs.py 并以 --cpu hpi 运行。我遵照这篇文章的指示，设法解决了我的问题。我添加了补丁并配置了系统。我没有使用perf，而是尝试直接访问寄存器并读取它们。据我所见，GEM5只实现了部分PMU事件。我们能否添加其他事件？例如，EXC_TAKEN 尚未实现。是否可以按如下方式添加它们？ self.addEvent(ProbeEvent(self, 0x09, cpu, "EXC_TAKEN")) # 0x09: EXC_TAKEN 此外，读取PMU事件寄存器……

我在初始化gem5中的PMU时遇到问题，该系统使用 starter_fs.py 并以 --cpu hpi 运行。我遵照这篇文章的指示，设法解决了我的问题。我添加了补丁并配置了系统。我没有使用perf，而是尝试直接访问寄存器并读取它们。据我所见，GEM5只实现了部分PMU事件。我们能否添加其他事件？例如，EXC_TAKEN 尚未实现。是否可以按如下方式添加它们？
self.addEvent(ProbeEvent(self, 0x09, cpu, "EXC_TAKEN"))
# 0x09: EXC_TAKEN
此外，对于PMU事件寄存器，我已经能够读取它们并提取事件，但PMCCNTR周期计数寄存器始终返回零。gem5是如何递增该寄存器的？读取周期计数寄存器的步骤是什么？
我使用perf读取的代码如下：

/*
 * Dijkstra shortest-path benchmark instrumented with a perf_event counter.
 *
 * The program opens a raw PMU event through perf_event_open(2), loads a
 * NUM_NODES x NUM_NODES adjacency matrix from the file named on the command
 * line, runs one shortest-path query per node and prints the counter values
 * sampled before and after the workload.
 */
#define _GNU_SOURCE /* for syscall() */

#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#define NUM_NODES 100
#define NONE 9999 /* "no edge" / "no distance yet" / "no predecessor" marker */

/* Per-node search state: best known distance and predecessor on that path. */
struct _NODE {
    int iDist;
    int iPrev;
};
typedef struct _NODE NODE;

/* Singly linked FIFO work item. */
struct _QITEM {
    int iNode;
    int iDist;
    int iPrev;
    struct _QITEM *qNext;
};
typedef struct _QITEM QITEM;

QITEM *qHead = NULL;

int AdjMatrix[NUM_NODES][NUM_NODES];

int g_qCount = 0;
NODE rgnNodes[NUM_NODES];
int ch;
int iPrev, iNode;
int i, iCost, iDist;

/*
 * Path printing hook.
 *
 * NOTE(review): the printing statements were commented out in the original,
 * presumably to keep console I/O out of the measured region; the function
 * therefore only flushes stdout. Confirm before re-enabling output.
 */
void print_path(NODE *rgnNodes, int chNode)
{
    (void)rgnNodes;
    (void)chNode;
    fflush(stdout);
}

/*
 * Append a work item to the tail of the queue.
 * Terminates the process with status 1 if memory cannot be allocated.
 */
void enqueue(int iNode, int iDist, int iPrev)
{
    QITEM *qNew = malloc(sizeof *qNew);
    QITEM *qLast = qHead;

    if (!qNew) {
        fprintf(stderr, "Out of memory.\n");
        exit(1);
    }
    qNew->iNode = iNode;
    qNew->iDist = iDist;
    qNew->iPrev = iPrev;
    qNew->qNext = NULL;

    if (!qLast) {
        qHead = qNew;
    } else {
        while (qLast->qNext) {
            qLast = qLast->qNext;
        }
        qLast->qNext = qNew;
    }
    g_qCount++;
}

/*
 * Pop the head of the queue into the three output parameters.
 * Leaves the outputs untouched when the queue is empty.
 */
void dequeue(int *piNode, int *piDist, int *piPrev)
{
    QITEM *qKill = qHead;

    if (qHead) {
        *piNode = qHead->iNode;
        *piDist = qHead->iDist;
        *piPrev = qHead->iPrev;
        qHead = qHead->qNext;
        free(qKill);
        g_qCount--;
    }
}

/* Number of items currently queued. */
int qcount(void)
{
    return g_qCount;
}

/*
 * Compute shortest paths from chStart over AdjMatrix, filling rgnNodes.
 *
 * Returns 0 when chStart == chEnd, otherwise the cost recorded for chEnd
 * (NONE if chEnd was never reached). The original fell off the end of this
 * non-void function; callers that ignore the result are unaffected.
 */
int dijkstra(int chStart, int chEnd)
{
    for (ch = 0; ch < NUM_NODES; ch++) {
        rgnNodes[ch].iDist = NONE;
        rgnNodes[ch].iPrev = NONE;
    }

    if (chStart == chEnd) {
        return 0;
    }

    rgnNodes[chStart].iDist = 0;
    rgnNodes[chStart].iPrev = NONE;
    enqueue(chStart, 0, NONE);

    while (qcount() > 0) {
        dequeue(&iNode, &iDist, &iPrev);
        for (i = 0; i < NUM_NODES; i++) {
            iCost = AdjMatrix[iNode][i];
            if (iCost == NONE) {
                continue;
            }
            /* Relax the edge when it yields a first or a cheaper route. */
            if (rgnNodes[i].iDist == NONE ||
                rgnNodes[i].iDist > iCost + iDist) {
                rgnNodes[i].iDist = iDist + iCost;
                rgnNodes[i].iPrev = iNode;
                enqueue(i, iDist + iCost, iNode);
            }
        }
    }

    return rgnNodes[chEnd].iDist;
}

/*
 * Read the current 64-bit counter value from a perf event descriptor.
 * On any failure (invalid descriptor, error, short read) *value is left
 * unchanged, matching the original's behaviour of keeping the last sample.
 */
static void read_cycle_counter(int fd, uint64_t *value)
{
    uint64_t sample = 0;
    ssize_t got;

    if (fd == -1) {
        return;
    }
    got = read(fd, &sample, sizeof sample);
    if (got != (ssize_t)sizeof sample) {
        fprintf(stderr, "perf counter read fail: %s\n",
                got < 0 ? strerror(errno) : "short read");
        return;
    }
    *value = sample;
}

int main(int argc, char *argv[])
{
    uint64_t num_cycles_nominal = 0;
    uint64_t num_cycles_attack = 0;
    uint64_t counter_cpu_cycles = 0;
    struct perf_event_attr attr_cpu_cycles;
    int perf_fd_cpu_cycles;
    int row, col, weight;
    int src, dst;
    FILE *fp;

    if (argc < 2) {
        fprintf(stderr, "Usage: dijkstra <filename>\n");
        fprintf(stderr, "Only supports matrix size is #define'd.\n");
        return EXIT_FAILURE;
    }

    memset(&attr_cpu_cycles, 0, sizeof attr_cpu_cycles);
    attr_cpu_cycles.size = sizeof attr_cpu_cycles;
    attr_cpu_cycles.type = PERF_TYPE_RAW;
    attr_cpu_cycles.config = 0x11; /* raw PMU event number (CPU_CYCLES on Arm) */
    attr_cpu_cycles.exclude_kernel = 1;
    attr_cpu_cycles.exclude_hv = 1;
    attr_cpu_cycles.exclude_callchain_kernel = 1;

    /*
     * Open the file descriptor corresponding to this counter; counting
     * starts now. A failure is reported but not fatal, so the benchmark
     * still runs (the printed counter values then stay at zero).
     */
    perf_fd_cpu_cycles = (int)syscall(__NR_perf_event_open, &attr_cpu_cycles,
                                      0, -1, -1, 0);
    if (perf_fd_cpu_cycles == -1) {
        fprintf(stderr, "perf_event_open fail %d %d: %s\n",
                perf_fd_cpu_cycles, errno, strerror(errno));
    }

    /* Load the adjacency matrix. */
    fp = fopen(argv[1], "r");
    if (!fp) {
        fprintf(stderr, "Cannot open %s: %s\n", argv[1], strerror(errno));
        if (perf_fd_cpu_cycles != -1) {
            close(perf_fd_cpu_cycles);
        }
        return EXIT_FAILURE;
    }
    for (row = 0; row < NUM_NODES; row++) {
        for (col = 0; col < NUM_NODES; col++) {
            if (fscanf(fp, "%d", &weight) != 1) {
                fprintf(stderr, "Malformed matrix in %s at [%d][%d]\n",
                        argv[1], row, col);
                fclose(fp);
                if (perf_fd_cpu_cycles != -1) {
                    close(perf_fd_cpu_cycles);
                }
                return EXIT_FAILURE;
            }
            AdjMatrix[row][col] = weight;
        }
    }
    fclose(fp);

    /* Sample the counter before the workload. */
    read_cycle_counter(perf_fd_cpu_cycles, &counter_cpu_cycles);
    printf("Number of cpu_cycles before: %" PRIu64 "\n", counter_cpu_cycles);
    num_cycles_nominal = counter_cpu_cycles;

    /* Second sample; the delta to the first one is reported as "attack". */
    read_cycle_counter(perf_fd_cpu_cycles, &counter_cpu_cycles);
    printf("Number of cpu_cycles after attack: %" PRIu64 "\n",
           counter_cpu_cycles);
    num_cycles_attack = counter_cpu_cycles - num_cycles_nominal;

    /* One shortest-path query per node, to the node half a matrix away. */
    for (src = 0, dst = NUM_NODES / 2; src < NUM_NODES; src++, dst++) {
        dst = dst % NUM_NODES;
        dijkstra(src, dst);
    }

    read_cycle_counter(perf_fd_cpu_cycles, &counter_cpu_cycles);
    if (perf_fd_cpu_cycles != -1) {
        close(perf_fd_cpu_cycles);
    }
    printf("Number of cpu_cycles end: %" PRIu64 "\n", counter_cpu_cycles);
    num_cycles_nominal = counter_cpu_cycles - num_cycles_attack;

    printf("Number of cpu_cycles nominal: %" PRIu64 "\n", num_cycles_nominal);
    printf("Number of cpu_cycles attack: %" PRIu64 "\n", num_cycles_attack);

    return EXIT_SUCCESS;
}

#包括 #包括 #包括 #包括 #包括 #包括 #包括 #定义NUM_节点100 #定义无9999 结构节点 { 国际主义者； int iPrev； }; typedef结构节点；结构 { int-iNode；国际主义者； int iPrev；结构_QITEM*qNext； }; typedef struct_QITEM QITEM； QITEM*qHead=NULL； int AdjMatrix[NUM_NODES][NUM_NODES]； int g_qCount=0；节点rgnNodes[NUM_节点]； int-ch； int-iPrev，iNode； int i、iCost、iDist； void print\u路径（节点*rgnnode，int chNode） { if（rgnNodes[chNode].iPrev！=无） { //打印路径（rgnNodes，rgnNodes[chNode].iPrev）； } //printf（“%d”，chNode）； fflush（stdout）； } 无效队列（int-iNode、int-iDist、int-iPrev） { QITEM*qNew=（QITEM*）malloc（sizeof（QITEM））； QITEM*qLast=qHead；如果（！qNew） { //fprintf（stderr，“内存不足”。\n”）；出口（1）； } qNew->iNode=iNode； qNew->iDist=iDist； qNew->iPrev=iPrev； qNew->qNext=NULL；如果（！qLast） { qHead=qNew； } 其他的 { 而（qLast->qNext）qLast=qLast->qNext； qLast->qNext=qNew； } g_qCount++； //断言（g_qCount）； } 无效出列（int*piNode，int*piDist，int*piPrev） { QITEM*qKill=qHead； if（qHead） { //断言（g_qCount）； *piNode=qHead->iNode； *piDist=qHead->iDist； *piPrev=qHead->iPrev； qHead=qHead->qNext；免费（qKill）； g_qCount--； } } int qcount（无效） { 返回（g_qCount）； } 迪杰克斯特拉国际机场（chStart国际机场、chEnd国际机场） { 对于（ch=0；ch0） { 退出队列（&iNode，&iDist，&iPrev）；对于（i=0；i（iCost+iDist））） { rgnNodes[i].iDist=iDist+iCost； rgnNodes[i].iPrev=iNode；排队（i，iDist+iCost，iNode）； } } } } //printf（“最短路径成本为%d.”，rgnNodes[chEnd].iDist）； //printf（“路径为：”）； //打印路径（rgnNodes，chEnd）； //printf（“\n”）； } } int main（int argc，char*argv[]）{ int-diff=0； uint64\u t num\u循环\u标称=0； uint64\u t num\u cycles\u attack=0； uint64计数器cpu周期=0； //系统（“/加载模块”）； int i，j，k；文件*fp；静态int性能fd cpu周期；静态结构性能事件属性属性cpu周期； attr\u cpu\u cycles.size=sizeof（attr\u cpu\u cycles）； attr_cpu_cycles.exclude_kernel=1；属性cpu周期。排除hv=1； attr\u cpu\u cycles.exclude\u callchain\u kernel=1； attr\u cpu\u cycles.type=性能类型\u原始； attr_cpu_cycles.config=0x11； /*打开与此计数器对应的文件描述符。计数器应该从这个时候开始*/ 如果（（性能fd\u cpu\u周期=系统调用（\uu NR\u性能事件\u打开，&attr\u cpu\u周期，0，-1，-1，0））=-1） fprintf（标准，“性能事件打开失败%d%d:%s\n”，性能fd\u 
cpu\u周期，错误号，strerror（错误号））；如果（argc…… 对于缺失的事件，已经提交了一个工单。顺便说一句：我很快会尝试实现其中一些。与已实现的事件一样，您必须先在CPU源代码中找到这些事件应当递增的正确位置，并在那里添加一个探测点（probe point）。对于已实现的事件，我已经可以按照 man perf_event_open 中的示例通过内核API使用它们，该API会间接访问PMU。内核可能做了一些必要的初始化。如果可能，直接使用 perf_event_open 即可。您好，谢谢您的回答。我的问题是，一些计数器没有递增，即周期（cycles）和指令（instructions）事件虽然已经实现，但并不递增。我举一个例子：'./dijkstra' 的性能计数器统计