On Fri, Aug 14, 2015 at 07:44:23PM -0500, Scott Wood wrote: > On Fri, 2015-08-14 at 15:13 +0800, Kevin Hao wrote: > > On Thu, Aug 13, 2015 at 10:39:19PM -0500, Scott Wood wrote: > > > On Thu, 2015-08-13 at 19:51 +0800, Kevin Hao wrote: > > > > I didn't find anything unusual. But I think we do need to order the > > > > load/store of esel_next when acquire/release tcd lock. For acquire, > > > > add a data dependency to order the loads of lock and esel_next. > > > > For release, even there already have a "isync" here, but it doesn't > > > > guarantee any memory access order. So we still need "lwsync" for > > > > the two stores for lock and esel_next. > > > > > > I was going to say that esel_next is just a hint and it doesn't really > > > matter > > > if we occasionally get the wrong value, unless it happens often enough to > > > cause more performance degradation than the lwsync causes. However, with > > > the > > > A-008139 workaround we do need to read the same value from esel_next both > > > times. It might be less costly to save/restore an additional register > > > instead of lwsync, though. > > > > I will try to get some benchmark number to compare which method is a bit > > better. > > Do you have any recommended benchmark for a case this is? > > lmbench lat_mem_rd with a stride chosen to maximize TLB misses. For the > uncontended case, one instance; for the contended case, two instances, one > pinned to each thread of a core.
I have tried the method to save/restore an additional register for the esel with the following codes: diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index a8b52b61043f..8267c1404050 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -69,9 +69,9 @@ #define EX_TLB_ESR ( 9 * 8) /* Level 0 and 2 only */ #define EX_TLB_SRR0 (10 * 8) #define EX_TLB_SRR1 (11 * 8) +#define EX_TLB_R9 (12 * 8) #ifdef CONFIG_BOOK3E_MMU_TLB_STATS -#define EX_TLB_R8 (12 * 8) -#define EX_TLB_R9 (13 * 8) +#define EX_TLB_R8 (13 * 8) #define EX_TLB_LR (14 * 8) #define EX_TLB_SIZE (15 * 8) #else diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index e4185581c5a7..8d184dd530c4 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -68,11 +68,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) ld r14,PACAPGD(r13) std r15,EX_TLB_R15(r12) std r10,EX_TLB_CR(r12) + std r9,EX_TLB_R9(r12) TLB_MISS_PROLOG_STATS .endm .macro tlb_epilog_bolted ld r14,EX_TLB_CR(r12) + ld r9,EX_TLB_R9(r12) ld r10,EX_TLB_R10(r12) ld r11,EX_TLB_R11(r12) ld r13,EX_TLB_R13(r12) @@ -297,6 +299,7 @@ itlb_miss_fault_bolted: * r13 = PACA * r11 = tlb_per_core ptr * r10 = crap (free to use) + * r9 = esel entry */ tlb_miss_common_e6500: crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ @@ -334,8 +337,8 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ * with tlbilx before overwriting. */ - lbz r15,TCD_ESEL_NEXT(r11) - rlwinm r10,r15,16,0xff0000 + lbz r9,TCD_ESEL_NEXT(r11) + rlwinm r10,r9,16,0xff0000 oris r10,r10,MAS0_TLBSEL(1)@h mtspr SPRN_MAS0,r10 isync @@ -429,15 +432,14 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) mtspr SPRN_MAS2,r15 tlb_miss_huge_done_e6500: - lbz r15,TCD_ESEL_NEXT(r11) lbz r16,TCD_ESEL_MAX(r11) lbz r14,TCD_ESEL_FIRST(r11) - rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */ - addi r15,r15,1 /* increment esel_next */ + rlwimi r10,r9,16,0x00ff0000 /* insert esel_next into MAS0 */ + addi r9,r9,1 /* increment esel_next */ mtspr SPRN_MAS0,r10 - cmpw r15,r16 - iseleq r15,r14,r15 /* if next == last use first */ - stb r15,TCD_ESEL_NEXT(r11) + cmpw r9,r16 + iseleq r9,r14,r9 /* if next == last use first */ + stb r9,TCD_ESEL_NEXT(r11) tlbwe The following is the benchmark number on a t2080rdb board: For uncontended case (taskset -c 0 lat_mem_rd 2048 2097152): origin lwsync r9 2.00000 1.958 1.958 1.958 3.00000 1.958 1.958 1.958 4.00000 1.958 1.958 1.958 6.00000 1.958 1.958 1.958 8.00000 1.958 1.958 1.958 12.00000 6.528 6.528 6.528 16.00000 6.528 6.528 6.528 24.00000 37.862 37.862 37.861 32.00000 37.862 37.862 37.862 48.00000 37.862 37.862 37.862 64.00000 37.862 37.862 37.862 96.00000 37.862 37.863 37.862 128.00000 221.621 232.067 222.927 192.00000 221.874 232.333 222.925 256.00000 221.623 232.066 222.927 384.00000 221.758 232.331 222.927 512.00000 221.621 232.165 222.926 768.00000 221.776 236.870 226.598 1024.00000 264.199 271.351 259.281 1536.00000 370.748 380.910 372.544 2048.00000 392.185 404.696 395.881 For contended case (taskset -c 0 lat_mem_rd 2048 2097152 & taskset -c 1 lat_mem_rd 2048 2097152 >/dev/null 2>&1): origin lwsync r9 2.00000 3.366 2.944 3.086 3.00000 2.915 3.256 3.095 4.00000 3.043 2.443 2.617 6.00000 2.742 3.367 2.629 8.00000 3.145 3.365 2.443 12.00000 18.267 11.885 13.736 16.00000 15.607 13.312 18.048 24.00000 37.856 37.208 37.855 32.00000 37.856 37.861 37.855 48.00000 37.856 37.861 37.855 64.00000 57.487 37.861 52.505 96.00000 270.445 229.641 143.241 128.00000 284.535 279.907 305.540 192.00000 275.491 298.282 295.592 256.00000 265.155 331.212 259.096 384.00000 276.084 291.023 251.282 512.00000 275.852 335.602 267.074 768.00000 289.682 354.521 312.180 1024.00000 344.968 355.977 343.725 1536.00000 410.961 454.381 412.790 2048.00000 392.984 461.818 397.185 It seems that the using of an additional register do have better performance in both cases. Thanks, Kevin
pgpmLKSpYaU6_.pgp
Description: PGP signature
_______________________________________________ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev