On Fri, Apr 07, 2017 at 06:47:43PM +0800, Jin Yao wrote:
> Perf already has support for disassembling the branch instruction
> and using the branch type for filtering. The patch just records
> the branch type in perf_branch_entry.
>
> Before recording, the patch converts the x86 branch classification
> to common branch classification and compute for checking if the
> branches cross 4K or 2MB areas. It's an approximate computing for
> crossing 4K page or 2MB page.
The changelog is completely empty of rationale. Why do we care? Not
having the binary is a very bad reason; you can't do much of anything
if that's missing.

> @@ -923,6 +933,84 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
>  	return ret;
>  }
>
> +static int
> +common_branch_type(int type, u64 from, u64 to)
> +{
> +	int ret;
> +
> +	type = type & (~(X86_BR_KERNEL | X86_BR_USER));
> +
> +	switch (type) {
> +	case X86_BR_CALL:
> +	case X86_BR_ZERO_CALL:
> +		ret = PERF_BR_CALL;
> +		break;
> +
> +	case X86_BR_RET:
> +		ret = PERF_BR_RET;
> +		break;
> +
> +	case X86_BR_SYSCALL:
> +		ret = PERF_BR_SYSCALL;
> +		break;
> +
> +	case X86_BR_SYSRET:
> +		ret = PERF_BR_SYSRET;
> +		break;
> +
> +	case X86_BR_INT:
> +		ret = PERF_BR_INT;
> +		break;
> +
> +	case X86_BR_IRET:
> +		ret = PERF_BR_IRET;
> +		break;
> +
> +	case X86_BR_IRQ:
> +		ret = PERF_BR_IRQ;
> +		break;
> +
> +	case X86_BR_ABORT:
> +		ret = PERF_BR_FAR_BRANCH;
> +		break;
> +
> +	case X86_BR_JCC:
> +		if (to > from)
> +			ret = PERF_BR_JCC_FWD;
> +		else
> +			ret = PERF_BR_JCC_BWD;
> +		break;

This seems like superfluous information; we already get to and from,
so this comparison is pointless.

The rest looks like something you could implement more simply with a
lookup table; see the first sketch at the end of this mail.

> +
> +	case X86_BR_JMP:
> +		ret = PERF_BR_JMP;
> +		break;
> +
> +	case X86_BR_IND_CALL:
> +		ret = PERF_BR_IND_CALL;
> +		break;
> +
> +	case X86_BR_IND_JMP:
> +		ret = PERF_BR_IND_JMP;
> +		break;
> +
> +	default:
> +		ret = PERF_BR_NONE;
> +	}
> +
> +	return ret;
> +}
> +
> +static bool
> +cross_area(u64 addr1, u64 addr2, int size)
> +{
> +	u64 align1, align2;
> +
> +	align1 = addr1 & ~(size - 1);
> +	align2 = addr2 & ~(size - 1);
> +
> +	return (align1 != align2) ? true : false;
> +}
> +
>  /*
>   * implement actual branch filter based on user demand.
>   * Hardware may not exactly satisfy that request, thus
> @@ -939,7 +1027,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>  	bool compress = false;
>
>  	/* if sampling all branches, then nothing to filter */
> -	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
> +	if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
> +	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
>  		return;
>
>  	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
> @@ -960,6 +1049,21 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>  			cpuc->lbr_entries[i].from = 0;
>  			compress = true;
>  		}
> +
> +		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) {
> +			cpuc->lbr_entries[i].type = common_branch_type(type,
> +								       from,
> +								       to);
> +			if (cross_area(from, to, AREA_2M))
> +				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_2M;
> +			else if (cross_area(from, to, AREA_4K))
> +				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_4K;
> +			else
> +				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_NONE;

This again is superfluous information; it is already fully contained
in to and from, which we have (see the second sketch at the end).

> +		} else {
> +			cpuc->lbr_entries[i].type = PERF_BR_NONE;
> +			cpuc->lbr_entries[i].cross = PERF_BR_CROSS_NONE;
> +		}
>  	}
>
>  	if (!compress)
> --
> 2.7.4
>
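
First sketch: something like the below is what I mean by a lookup
table. Completely untested, and it assumes the X86_BR_* flags keep
their current single-bit layout with USER/KERNEL in the low two bits,
so the table index is the bit number minus 2. PERF_BR_JCC here is a
made-up single conditional type standing in for the FWD/BWD pair,
which per the above should go away anyway.

static const int branch_map[] = {
	PERF_BR_CALL,		/* X86_BR_CALL */
	PERF_BR_RET,		/* X86_BR_RET */
	PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
	PERF_BR_SYSRET,		/* X86_BR_SYSRET */
	PERF_BR_INT,		/* X86_BR_INT */
	PERF_BR_IRET,		/* X86_BR_IRET */
	PERF_BR_JCC,		/* X86_BR_JCC (no fwd/bwd split) */
	PERF_BR_JMP,		/* X86_BR_JMP */
	PERF_BR_IRQ,		/* X86_BR_IRQ */
	PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
	PERF_BR_FAR_BRANCH,	/* X86_BR_ABORT */
	PERF_BR_NONE,		/* X86_BR_IN_TX */
	PERF_BR_NONE,		/* X86_BR_NO_TX */
	PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
	PERF_BR_NONE,		/* X86_BR_CALL_STACK */
	PERF_BR_IND_JMP,	/* X86_BR_IND_JMP */
};

static int common_branch_type(int type)
{
	int i;

	type >>= 2;	/* strip X86_BR_USER / X86_BR_KERNEL */

	if (type) {
		i = __ffs(type);
		if (i < ARRAY_SIZE(branch_map))
			return branch_map[i];
	}

	return PERF_BR_NONE;
}

Note this also drops the from/to arguments entirely, since per the
above the direction comparison belongs in the consumer, if anywhere.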
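
Second sketch, to illustrate why the cross field is redundant: any
consumer that has from and to can recompute everything it encodes.
Untested userspace illustration, with the same mask computation as
the patch's cross_area(); the PERF_BR_CROSS_* encoding is only shown
in comments.

#include <stdbool.h>
#include <stdint.h>

/* same alignment test as the patch's cross_area() */
static bool cross_area(uint64_t from, uint64_t to, uint64_t size)
{
	return (from & ~(size - 1)) != (to & ~(size - 1));
}

/* recompute the patch's 'cross' value from from/to alone */
static int branch_cross(uint64_t from, uint64_t to)
{
	if (cross_area(from, to, 2UL << 20))
		return 2;	/* PERF_BR_CROSS_2M */
	if (cross_area(from, to, 4UL << 10))
		return 1;	/* PERF_BR_CROSS_4K */
	return 0;		/* PERF_BR_CROSS_NONE */
}

/* likewise, forward vs backward is just: to > from */

So the hardware-facing code should not burn sample space on bits the
tool can trivially derive.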