Linux模块:性能计数器不起作用
发布时间:2020-12-13 23:10:08 所属栏目:Linux 来源:网络整理
导读:我想监视最后一级缓存中的缓存请求数量.我写了一个 Linux模块,基于 here 的教程来获取该信息. 它可以编译和运行,但输出结果总是0.换句话说,当我使用rdmsr时,它总是给我edx = 0,eax = 0.我甚至尝试了tutorial中的演示代码,输出仍为0. 这个问题已经困扰了我整整一个星期.
我想监视最后一级缓存中的缓存请求数量.我写了一个 Linux模块,基于 here 的教程来获取该信息.
它可以编译和运行,但输出结果总是0.换句话说,当我使用rdmsr时,它总是给我edx = 0,eax = 0.我甚至尝试了tutorial中的演示代码,输出仍为0. 我坚持这个问题整整一个星期.谁能帮助我指出我在程序中犯的错误? 我知道有一些现有程序在做同样的事情,但我必须知道如何自己编写代码,因为我想在Xen管理程序中监视缓存请求.我不能在Xen中使用这些工具,除非我将这些工具合并到Xen的管理程序中,这似乎更有用. /* * Record the cache miss rate of Intel Sandybridge cpu * To confirm the event is correctly set! */ #include <linux/module.h> /* Needed by all modules */ #include <linux/kernel.h> /* Needed for KERN_INFO */ /*4 Performance Counters Selector for %ecx in insn wrmsr*/ #define PERFEVTSEL0 0x186 #define PERFEVTSEL1 0x187 #define PERFEVTSEL2 0x188 #define PERFEVTSEL3 0x189 /*4 MSR Performance Counter for the above selector*/ #define PMC0 0xc1 #define PMC1 0xc2 #define PMC2 0xc2 #define PMC3 0xc3 /*Intel Software Developer Manual Page 2549*/ /*L1I L1D cache events has not been confirmed!*/ /*L1 Instruction Cache Performance Tuning Events*/ #define L1I_ALLHIT_EVENT 0x80 #define L1I_ALLHIT_MASK 0x01 #define L1I_ALLMISS_EVENT 0x80 /*confirmed*/ #define L1I_ALLMISS_MASK 0x02 /*confirmed*/ /*L1 Data Cache Performance Tuning Events*/ /*Intel does not have the ALLREQ Miss mask; have to add LD_miss and ST_miss*/ #define L1D_ALLREQ_EVENT 0x43 #define L1D_ALLREQ_MASK 0x01 #define L1D_LDMISS_EVENT 0x40 #define L1D_LDMISS_MASK 0x01 #define L1D_STMISS_EVENT 0x28 #define L1D_STMISS_MASK 0x01 /*L2 private cache for each core*/ /*confirmed*/ #define L2_ALLREQ_EVENT 0x24 #define L2_ALLREQ_MASK L2_ALLCODEREQ_MASK /*0xFF*/ #define L2_ALLMISS_EVENT 0x24 #define L2_ALLMISS_MASK L2_ALLCODEMISS_MASK /*0xAA*/ #define L2_ALLCODEREQ_MASK 0x30 #define L2_ALLCODEMISS_MASK 0x20 /*L3 shared cache*/ /*confirmed*/ /*Use the last level cache event and mask*/ #define L3_ALLREQ_EVENT 0x2E #define L3_ALLREQ_MASK 0x4F #define L3_ALLMISS_EVENT 0x2E #define L3_ALLMISS_MASK 0x41 #define USR_BIT (0x01UL << 16) #define OS_BIT (0x01UL << 17) #define SET_MSR_USR_BIT(eax) eax |= USR_BIT #define CLEAR_MSR_USR_BIT(exa) eax &= (~USR_BIT) #define SET_MSR_OS_BIT(eax) eax |= OS_BIT #define CLEAR_MSR_OS_BIT(eax) 
eax &= (~OS_BIT) #define SET_EVENT_MASK(eax,event,umask) eax |= (event | (umask << 8)) /*MSR EN flag: when set start the counter!*/ //#define MSR_ENFLAG (0x1<<22) #define MSR_ENFLAG (0x1<<22) /* 32bit insn v3*/ static inline void rtxen_write_msr(uint32_t eax,uint32_t ecx) { /*clear counter first*/ __asm__ __volatile__ ("movl %0,%%ecxnt" "xorl %%edx,%%edxnt" "xorl %%eax,%%eaxnt" "wrmsrnt" : /* no outputs */ : "m" (ecx) : "eax","ecx","edx" /* all clobbered */); eax |= MSR_ENFLAG; __asm__("movl %0,%%ecxnt" /* ecx contains the number of the MSR to set */ "xorl %%edx,%%edxnt"/* edx contains the high bits to set the MSR to */ "movl %1,%%eaxnt" /* eax contains the log bits to set the MSR to */ "wrmsrnt" : /* no outputs */ : "m" (ecx),"m" (eax) : "eax","edx" /* clobbered */); } static inline void rtxen_read_msr(uint32_t* ecx,uint32_t *eax,uint32_t* edx) { __asm__ __volatile__( "rdmsr" :"=d" (*edx),"=a" (*eax) :"c"(*ecx) ); } static inline void delay(void ) { char tmp[1000]; int i; for( i = 0; i < 1000; i++ ) { tmp[i] = i * 2; } } enum cache_level { UOPS,L1I,L1D,L2,L3 }; int init_module(void) { enum cache_level op; uint32_t eax,edx,ecx; uint64_t l3_all; op = UOPS; switch(op) { case UOPS: eax = 0x0001010E; eax |= MSR_ENFLAG; ecx = 0x187; printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x,ecx=%#010xn",eax,ecx); rtxen_write_msr(eax,ecx); ecx = 0xc2; eax = 1; edx = 2; rtxen_read_msr(&ecx,&eax,&edx); printk(KERN_INFO "UOPS Demo: read_msr: edx=%#010x,eax=%#010xn",eax); break; case L3: eax = 0; SET_MSR_USR_BIT(eax); SET_MSR_OS_BIT(eax); SET_EVENT_MASK(eax,L3_ALLREQ_EVENT,L3_ALLREQ_MASK); eax |= MSR_ENFLAG; ecx = PERFEVTSEL2; printk(KERN_INFO "before wrmsr: eax=%#010x,ecx); printk(KERN_INFO "after wrmsr: eax=%#010x,ecx); printk(KERN_INFO "L3 all request set MSR PMC2n"); printk(KERN_INFO "delay by access an arrayn"); delay(); ecx = PMC2; eax = 1; edx = 2; printk(KERN_INFO "rdmsr: ecx=%#010xn",ecx); rtxen_read_msr(&ecx,&edx); /*need to pass into address!*/ l3_all = ( ((uint64_t) edx 
<< 32) | eax ); printk(KERN_INFO "rdmsr: L3 all request is %llu (%#010lx)n",l3_all,(unsigned long)l3_all); break; default: printk(KERN_INFO "operation not implemented yetn"); } /* * A non 0 return means init_module failed; module can't be loaded. */ return 0; } void cleanup_module(void) { printk(KERN_INFO "Goodbye world 1.n"); } 我的结果是: [ 1780.946584] UOPS Demo: write_msr: eax=0x0001010e,ecx=0x00000187 [ 1780.946590] UOPS Demo: read_msr: edx=0x00000000,eax=0x00000000 [ 1818.595055] Goodbye world 1. [ 1821.153947] UOPS Demo: write_msr: eax=0x0041010e,ecx=0x00000187 [ 1821.153950] UOPS Demo: read_msr: edx=0x00000000,eax=0x00000000 解决方法
我终于在@Manuel Selva的帮助下解决了这个问题!
设置perf的正确流程.柜台是: 步骤1:设置msr并通过设置eax中的EN位来启用计数器; 第2步:写入msr停止计数器 第3步:阅读计数器 我错过了第2步,这就是为什么它总是给我0.如果我想在停止之前读取计数器,则报告0是有意义的. switch语句的正确代码如下: switch(op) { case UOPS: eax = 0x0051010E; eax |= MSR_ENFLAG; ecx = 0x187; printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x,ecx); //stop counting eax = 0x0011010E; rtxen_write_msr(eax,L3_ALLREQ_MASK); eax |= MSR_ENFLAG; eax |= (1<<20); //INT bit: counter overflow ecx = PERFEVTSEL2; printk(KERN_INFO "before wrmsr: eax=%#010x,ecx); printk(KERN_INFO "L3 all request set MSR PMC2n"); printk(KERN_INFO "delay by access an arrayn"); delay(); eax &= (~MSR_ENFLAG); rtxen_write_msr(eax,ecx); printk(KERN_INFO "stop the counter,eax); ecx = PMC2; eax = 1; edx = 2; printk(KERN_INFO "rdmsr: ecx=%#010xn",(unsigned long)l3_all); break; default: printk(KERN_INFO "operation not implemented yetn"); } (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |