如何在GEM5中使用ARM PMU?
我在初始化 gem5 中的 PMU 时遇到问题,该全系统仿真使用 starter_fs.py 并指定 --cpu hpi。我按照这篇文章的说明操作,设法解决了最初的问题:我打了补丁并配置了系统。我没有使用 perf,而是尝试直接访问并读取 PMU 寄存器。据我所见,gem5 只实现了部分 PMU 事件。我们能否自行添加其他事件?例如,EXC_TAKEN 尚未实现。下面是添加它的正确方法吗? self.addEvent(ProbeEvent(self, 0x09, cpu, "EXC_TAKEN")) # 0x09: EXC_TAKEN 此外,我已经能够读取 PMU 事件计数寄存器并得到事件计数,但周期计数寄存器 PMCCNTR 始终返回零。gem5 是如何递增该寄存器的?读取周期计数寄存器的步骤是什么? 我使用 perf 读取的代码如下: 如何在GEM5中使用ARM PMU?,arm,simulation,cpu-registers,arm64,gem5,Arm,Simulation,Cpu Registers,Arm64,Gem5,我在初始化 gem5 中的 PMU 时遇到问题,该全系统仿真使用 starter_fs.py 并指定 --cpu hpi。我按照这篇文章的说明操作,设法解决了最初的问题:我打了补丁并配置了系统。我没有使用 perf,而是尝试直接访问并读取 PMU 寄存器。据我所见,gem5 只实现了部分 PMU 事件。我们能否自行添加其他事件?例如,EXC_TAKEN 尚未实现。下面是添加它的正确方法吗? self.addEvent(ProbeEvent(self, 0x09, cpu, "EXC_TAKEN")) # 0x09: EXC_TAKEN 此外,读取 PMU 事件寄存器
#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
/* Number of graph nodes; also the side length of the square AdjMatrix. */
#define NUM_NODES 100
/* Sentinel value: "no distance / no predecessor" in NODE fields and
   "no edge" in AdjMatrix entries. */
#define NONE 9999
/* Per-node search state kept in rgnNodes[]. */
typedef struct _NODE
{
    int iDist;   /* best distance found so far, or NONE */
    int iPrev;   /* node this one was reached from, or NONE */
} NODE;
/* One entry of the singly linked work queue. */
typedef struct _QITEM
{
    int iNode;              /* node to expand */
    int iDist;              /* distance recorded when the item was queued */
    int iPrev;              /* node the item was reached from */
    struct _QITEM *qNext;   /* next queued item, or NULL at the tail */
} QITEM;
/* First item of the work queue; NULL while the queue is empty. */
QITEM *qHead = NULL;

/* Edge costs indexed as AdjMatrix[from][to]; NONE marks a missing edge. */
int AdjMatrix[NUM_NODES][NUM_NODES];

/* Number of items currently in the work queue. */
int g_qCount = 0;

/* Per-node distance / predecessor table filled in by dijkstra(). */
NODE rgnNodes[NUM_NODES];

/* File-scope scratch variables used by dijkstra(). */
int ch;
int iPrev;
int iNode;
int i;
int iCost;
int iDist;
/*
 * Path printer whose output is currently disabled.
 *
 * The recursive walk over predecessors and the printf of each node are
 * switched off, so the only remaining effects are a read of
 * rgnNodes[chNode].iPrev and a flush of stdout.
 */
void print_path (NODE *rgnNodes, int chNode)
{
    const int hasPredecessor = (rgnNodes[chNode].iPrev != NONE);

    if (hasPredecessor)
    {
        /* Recursion into the predecessor is disabled. */
    }
    /* Printing of chNode is disabled. */
    fflush(stdout);
}
/*
 * Append a new item holding (iNode, iDist, iPrev) to the tail of the
 * global work queue and bump g_qCount.
 *
 * Terminates the process with exit status 1 if the allocation fails.
 */
void enqueue (int iNode, int iDist, int iPrev)
{
    QITEM *item = malloc(sizeof *item);
    QITEM **link;

    if (item == NULL)
    {
        exit(1);
    }

    item->iNode = iNode;
    item->iDist = iDist;
    item->iPrev = iPrev;
    item->qNext = NULL;

    /* Find the NULL link that ends the list (qHead itself when empty). */
    for (link = &qHead; *link != NULL; link = &(*link)->qNext)
    {
    }
    *link = item;

    g_qCount++;
}
void dequeue (int *piNode, int *piDist, int *piPrev)
{
QITEM *qKill = qHead;
if (qHead)
{
// ASSERT(g_qCount);
*piNode = qHead->iNode;
*piDist = qHead->iDist;
*piPrev = qHead->iPrev;
qHead = qHead->qNext;
free(qKill);
g_qCount--;
}
}
/* Report how many items are currently in the work queue. */
int qcount (void)
{
    return g_qCount;
}
/*
 * Compute shortest-path costs from chStart over AdjMatrix and record them,
 * together with each node's predecessor, in rgnNodes[].
 *
 * rgnNodes[] is reset to NONE on every call. Nodes are expanded in FIFO
 * order through enqueue()/dequeue(); a node is re-queued whenever a cheaper
 * route to it is found.
 *
 * chStart  index of the source node
 * chEnd    index of the destination node
 *
 * Returns the cost recorded for chEnd, 0 when chStart == chEnd, or NONE
 * when chEnd was not reached or either index lies outside
 * [0, NUM_NODES).
 *
 * Uses the file-scope scratch variables ch, i, iCost, iDist, iNode and
 * iPrev, exactly as before.
 */
int dijkstra(int chStart, int chEnd)
{
    for (ch = 0; ch < NUM_NODES; ch++)
    {
        rgnNodes[ch].iDist = NONE;
        rgnNodes[ch].iPrev = NONE;
    }

    /* Both values are used as array indices below; refuse bad ones. */
    if (chStart < 0 || chStart >= NUM_NODES ||
        chEnd < 0 || chEnd >= NUM_NODES)
    {
        return NONE;
    }

    if (chStart == chEnd)
    {
        /* Already at the destination: nothing to search. */
        return 0;
    }

    rgnNodes[chStart].iDist = 0;
    rgnNodes[chStart].iPrev = NONE;
    enqueue (chStart, 0, NONE);

    while (qcount() > 0)
    {
        dequeue (&iNode, &iDist, &iPrev);
        for (i = 0; i < NUM_NODES; i++)
        {
            iCost = AdjMatrix[iNode][i];
            if (iCost == NONE)
            {
                continue;   /* no edge iNode -> i */
            }
            if ((NONE == rgnNodes[i].iDist) ||
                (rgnNodes[i].iDist > (iCost + iDist)))
            {
                rgnNodes[i].iDist = iDist + iCost;
                rgnNodes[i].iPrev = iNode;
                enqueue (i, iDist + iCost, iNode);
            }
        }
    }

    /* The function is declared int, so it must return a value. */
    return rgnNodes[chEnd].iDist;
}
/*
 * Read the current 64-bit value of a perf counter into *value.
 *
 * Returns 0 on success and -1 on failure; *value is left untouched on
 * failure. A negative fd (counter never opened) fails silently because the
 * open error has already been reported by the caller.
 */
static int read_cycle_counter(int fd, uint64_t *value)
{
    uint64_t raw = 0;
    ssize_t got;

    if (fd < 0)
    {
        return -1;
    }

    got = read(fd, &raw, sizeof raw);
    if (got != (ssize_t) sizeof raw)
    {
        fprintf(stderr, "perf counter read fail: %s\n",
                got < 0 ? strerror(errno) : "short read");
        return -1;
    }

    *value = raw;
    return 0;
}

/*
 * Load a NUM_NODES x NUM_NODES adjacency matrix from the file named by
 * argv[1], run dijkstra() for 100 start/end pairs and report raw PMU
 * counter readings taken through perf_event_open.
 *
 * Returns 0 on success and 1 when the arguments or the input file are
 * unusable. A failure to open the counter is reported but is not fatal;
 * the counter values are then printed as 0.
 */
int main(int argc, char *argv[]) {
    uint64_t num_cycles_nominal = 0;
    uint64_t num_cycles_attack = 0;
    uint64_t counter_cpu_cycles = 0;
    int i, j, k;
    FILE *fp;
    int perf_fd_cpu_cycles;
    struct perf_event_attr attr_cpu_cycles;

    /* argv[1] is dereferenced below, so it has to exist. */
    if (argc < 2) {
        fprintf(stderr, "Usage: dijkstra <filename>\n");
        fprintf(stderr, "Only supports matrix size is #define'd.\n");
        return 1;
    }

    memset(&attr_cpu_cycles, 0, sizeof(attr_cpu_cycles));
    attr_cpu_cycles.size = sizeof(attr_cpu_cycles);
    attr_cpu_cycles.exclude_kernel = 1;
    attr_cpu_cycles.exclude_hv = 1;
    attr_cpu_cycles.exclude_callchain_kernel = 1;
    attr_cpu_cycles.type = PERF_TYPE_RAW;
    /* Raw PMU event number; going by the variable names it is meant to be
       the CPU cycle event -- confirm against the target PMU. */
    attr_cpu_cycles.config = 0x11;

    /* Open the file descriptor corresponding to this counter. The counter
       should start at this moment. */
    perf_fd_cpu_cycles =
        (int) syscall(__NR_perf_event_open, &attr_cpu_cycles, 0, -1, -1, 0);
    if (perf_fd_cpu_cycles == -1) {
        int err = errno;   /* save before any other call can clobber it */
        fprintf(stderr, "perf_event_open fail %d %d: %s\n",
                perf_fd_cpu_cycles, err, strerror(err));
    }

    /* open the adjacency matrix file */
    fp = fopen(argv[1], "r");
    if (fp == NULL) {
        fprintf(stderr, "cannot open %s: %s\n", argv[1], strerror(errno));
        if (perf_fd_cpu_cycles >= 0) {
            close(perf_fd_cpu_cycles);
        }
        return 1;
    }

    /* fill the whole matrix from the file */
    for (i = 0; i < NUM_NODES; i++) {
        for (j = 0; j < NUM_NODES; j++) {
            if (fscanf(fp, "%d", &k) != 1) {
                fprintf(stderr, "bad or missing matrix entry [%d][%d] in %s\n",
                        i, j, argv[1]);
                fclose(fp);
                if (perf_fd_cpu_cycles >= 0) {
                    close(perf_fd_cpu_cycles);
                }
                return 1;
            }
            AdjMatrix[i][j] = k;
        }
    }
    fclose(fp);

    /* First sample: everything counted since the counter was opened. */
    read_cycle_counter(perf_fd_cpu_cycles, &counter_cpu_cycles);
    printf("Number of cpu_cycles before: %" PRIu64 "\n", counter_cpu_cycles);
    num_cycles_nominal = counter_cpu_cycles;

    /* Second sample, taken immediately after the first. */
    read_cycle_counter(perf_fd_cpu_cycles, &counter_cpu_cycles);
    printf("Number of cpu_cycles after attack: %" PRIu64 "\n", counter_cpu_cycles);
    num_cycles_attack = counter_cpu_cycles - num_cycles_nominal;

    /* finds 100 shortest paths between nodes (NUM_NODES is 100) */
    for (i = 0, j = NUM_NODES / 2; i < 100; i++, j++) {
        j = j % NUM_NODES;
        dijkstra(i, j);
    }

    /* Final sample, then release the counter. */
    read_cycle_counter(perf_fd_cpu_cycles, &counter_cpu_cycles);
    if (perf_fd_cpu_cycles >= 0) {
        close(perf_fd_cpu_cycles);
    }
    printf("Number of cpu_cycles end: %" PRIu64 "\n", counter_cpu_cycles);
    num_cycles_nominal = counter_cpu_cycles - num_cycles_attack;
    printf("Number of cpu_cycles nominal: %" PRIu64 "\n", num_cycles_nominal);
    printf("Number of cpu_cycles attack: %" PRIu64 "\n", num_cycles_attack);
    return 0;
}
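#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#define NUM_NODES 100
#define NONE 9999
struct _NODE
{
int iDist;
int iPrev;
};
typedef struct _NODE NODE;
struct _QITEM
{
int iNode;
int iDist;
int iPrev;
struct _QITEM *qNext;
};
typedef struct _QITEM QITEM;
QITEM *qHead = NULL;
int AdjMatrix[NUM_NODES][NUM_NODES];
int g_qCount = 0;
NODE rgnNodes[NUM_NODES];
int ch;
int iPrev, iNode;
int i, iCost, iDist;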
void print_path (NODE *rgnNodes, int chNode)
{
if (rgnNodes[chNode].iPrev != NONE)
{
//print_path(rgnNodes, rgnNodes[chNode].iPrev);
}
//printf (" %d", chNode);
fflush(stdout);
}
void enqueue (int iNode, int iDist, int iPrev)
{
QITEM *qNew = (QITEM *) malloc(sizeof(QITEM));
QITEM *qLast = qHead;
if (!qNew)
{
//fprintf(stderr, "Out of memory.\n");
exit(1);
}
qNew->iNode = iNode;
qNew->iDist = iDist;
qNew->iPrev = iPrev;
qNew->qNext = NULL;
if (!qLast)
{
qHead = qNew;
}
else
{
while (qLast->qNext) qLast = qLast->qNext;
qLast->qNext = qNew;
}
g_qCount++;
// ASSERT(g_qCount);
}
void dequeue (int *piNode, int *piDist, int *piPrev)
{
QITEM *qKill = qHead;
if (qHead)
{
// ASSERT(g_qCount);
*piNode = qHead->iNode;
*piDist = qHead->iDist;
*piPrev = qHead->iPrev;
qHead = qHead->qNext;
free(qKill);
g_qCount--;
}
}
int qcount (void)
{
return(g_qCount);
}
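int dijkstra(int chStart, int chEnd)
{
for (ch = 0; ch < NUM_NODES; ch++)
{
rgnNodes[ch].iDist = NONE;
rgnNodes[ch].iPrev = NONE;
}
if (chStart == chEnd)
{
//printf("Shortest path is 0 in cost. Just stay where you are.\n");
}
else
{
rgnNodes[chStart].iDist = 0;
rgnNodes[chStart].iPrev = NONE;
enqueue (chStart, 0, NONE);
while (qcount() > 0)
{
dequeue (&iNode, &iDist, &iPrev);
for (i = 0; i < NUM_NODES; i++)
{
if ((iCost = AdjMatrix[iNode][i]) != NONE)
{
if ((NONE == rgnNodes[i].iDist) ||
(rgnNodes[i].iDist > (iCost + iDist)))
{
rgnNodes[i].iDist = iDist + iCost;
rgnNodes[i].iPrev = iNode;
enqueue (i, iDist + iCost, iNode);
}
}
}
}
//printf("Shortest path is %d in cost. ", rgnNodes[chEnd].iDist);
//printf("Path is: ");
//print_path(rgnNodes, chEnd);
//printf("\n");
}
}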
int main(int argc, char *argv[]) {
int diff = 0;
uint64_t num_cycles_nominal=0;
uint64_t num_cycles_attack=0;
uint64_t counter_cpu_cycles = 0;
//system("./load-module");
int i,j,k;
FILE *fp;
static int perf_fd_cpu_cycles;
static struct perf_event_attr attr_cpu_cycles;
attr_cpu_cycles.size = sizeof(attr_cpu_cycles);
attr_cpu_cycles.exclude_kernel = 1;
attr_cpu_cycles.exclude_hv = 1;
attr_cpu_cycles.exclude_callchain_kernel = 1;
attr_cpu_cycles.type = PERF_TYPE_RAW;
attr_cpu_cycles.config = 0x11;
/* Open the file descriptor corresponding to this counter. The counter
should start at this moment. */
if ((perf_fd_cpu_cycles = syscall(__NR_perf_event_open, &attr_cpu_cycles, 0, -1, -1, 0)) == -1)
fprintf(stderr, "perf_event_open fail %d %d: %s\n", perf_fd_cpu_cycles, errno, strerror(errno));
if (argc<2) { …
对于尚未实现的事件,已经创建了一个工单(ticket)。顺便说一句:我很快会尝试实现其中一些。与已实现的事件一样,您必须先在 CPU 源代码中找到应当递增这些事件计数的正确位置,并在那里添加一个探测点(probe point)。对于已实现的事件,我已经可以借助 man perf_event_open 中的示例、通过内核 API 读取它们,
该内核 API 会间接调用 PMU。内核可能会执行一些必要的初始化。如果可能,直接使用 perf_event_open 即可。
您好,谢谢您的回答。我的问题是,有些计数器没有递增。例如,周期(cycles)和指令(instructions)事件已经实现,但并不递增。我举一个例子:“./dijkstra”的性能计数器统计