On Fri, Aug 14, 2015 at 07:44:23PM -0500, Scott Wood wrote:
> On Fri, 2015-08-14 at 15:13 +0800, Kevin Hao wrote:
> > On Thu, Aug 13, 2015 at 10:39:19PM -0500, Scott Wood wrote:
> > > On Thu, 2015-08-13 at 19:51 +0800, Kevin Hao wrote:
> > > > I didn't find anything unusual. But I think we do need to order the
> > > > load/store of esel_next when acquire/release tcd lock. For acquire,
> > > > add a data dependency to order the loads of lock and esel_next.
> > > > For release, even there already have a "isync" here, but it doesn't
> > > > guarantee any memory access order. So we still need "lwsync" for
> > > > the two stores for lock and esel_next.
> > > 
> > > I was going to say that esel_next is just a hint and it doesn't really 
> > > matter 
> > > if we occasionally get the wrong value, unless it happens often enough to 
> > > cause more performance degradation than the lwsync causes.  However, with 
> > > the 
> > > A-008139 workaround we do need to read the same value from esel_next both 
> > > times.  It might be less costly to save/restore an additional register 
> > > instead of lwsync, though.
> > 
> > I will try to get some benchmark number to compare which method is a bit 
> > better.
> > Do you have any recommended benchmark for a case this is?
> 
> lmbench lat_mem_rd with a stride chosen to maximize TLB misses.  For the 
> uncontended case, one instance; for the contended case, two instances, one 
> pinned to each thread of a core.

I have tried the method to save/restore an additional register for the esel
with the following codes:

diff --git a/arch/powerpc/include/asm/exception-64e.h 
b/arch/powerpc/include/asm/exception-64e.h
index a8b52b61043f..8267c1404050 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -69,9 +69,9 @@
 #define EX_TLB_ESR     ( 9 * 8) /* Level 0 and 2 only */
 #define EX_TLB_SRR0    (10 * 8)
 #define EX_TLB_SRR1    (11 * 8)
+#define EX_TLB_R9      (12 * 8)
 #ifdef CONFIG_BOOK3E_MMU_TLB_STATS
-#define EX_TLB_R8      (12 * 8)
-#define EX_TLB_R9      (13 * 8)
+#define EX_TLB_R8      (13 * 8)
 #define EX_TLB_LR      (14 * 8)
 #define EX_TLB_SIZE    (15 * 8)
 #else
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index e4185581c5a7..8d184dd530c4 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -68,11 +68,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
        ld      r14,PACAPGD(r13)
        std     r15,EX_TLB_R15(r12)
        std     r10,EX_TLB_CR(r12)
+       std     r9,EX_TLB_R9(r12)
        TLB_MISS_PROLOG_STATS
 .endm
 
 .macro tlb_epilog_bolted
        ld      r14,EX_TLB_CR(r12)
+       ld      r9,EX_TLB_R9(r12)
        ld      r10,EX_TLB_R10(r12)
        ld      r11,EX_TLB_R11(r12)
        ld      r13,EX_TLB_R13(r12)
@@ -297,6 +299,7 @@ itlb_miss_fault_bolted:
  * r13 = PACA
  * r11 = tlb_per_core ptr
  * r10 = crap (free to use)
+ * r9  = esel entry
  */
 tlb_miss_common_e6500:
        crmove  cr2*4+2,cr0*4+2         /* cr2.eq != 0 if kernel address */
@@ -334,8 +337,8 @@ BEGIN_FTR_SECTION           /* CPU_FTR_SMT */
         * with tlbilx before overwriting.
         */
 
-       lbz     r15,TCD_ESEL_NEXT(r11)
-       rlwinm  r10,r15,16,0xff0000
+       lbz     r9,TCD_ESEL_NEXT(r11)
+       rlwinm  r10,r9,16,0xff0000
        oris    r10,r10,MAS0_TLBSEL(1)@h
        mtspr   SPRN_MAS0,r10
        isync
@@ -429,15 +432,14 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
        mtspr   SPRN_MAS2,r15
 
 tlb_miss_huge_done_e6500:
-       lbz     r15,TCD_ESEL_NEXT(r11)
        lbz     r16,TCD_ESEL_MAX(r11)
        lbz     r14,TCD_ESEL_FIRST(r11)
-       rlwimi  r10,r15,16,0x00ff0000   /* insert esel_next into MAS0 */
-       addi    r15,r15,1               /* increment esel_next */
+       rlwimi  r10,r9,16,0x00ff0000    /* insert esel_next into MAS0 */
+       addi    r9,r9,1         /* increment esel_next */
        mtspr   SPRN_MAS0,r10
-       cmpw    r15,r16
-       iseleq  r15,r14,r15             /* if next == last use first */
-       stb     r15,TCD_ESEL_NEXT(r11)
+       cmpw    r9,r16
+       iseleq  r9,r14,r9               /* if next == last use first */
+       stb     r9,TCD_ESEL_NEXT(r11)
 
        tlbwe
 

The following is the benchmark number on a t2080rdb board:
For uncontended case (taskset -c 0 lat_mem_rd 2048 2097152):
                origin          lwsync          r9
2.00000         1.958           1.958           1.958
3.00000         1.958           1.958           1.958
4.00000         1.958           1.958           1.958
6.00000         1.958           1.958           1.958
8.00000         1.958           1.958           1.958
12.00000        6.528           6.528           6.528
16.00000        6.528           6.528           6.528
24.00000        37.862          37.862          37.861
32.00000        37.862          37.862          37.862
48.00000        37.862          37.862          37.862
64.00000        37.862          37.862          37.862
96.00000        37.862          37.863          37.862
128.00000       221.621         232.067         222.927
192.00000       221.874         232.333         222.925
256.00000       221.623         232.066         222.927
384.00000       221.758         232.331         222.927
512.00000       221.621         232.165         222.926
768.00000       221.776         236.870         226.598
1024.00000      264.199         271.351         259.281
1536.00000      370.748         380.910         372.544
2048.00000      392.185         404.696         395.881

For contended case (taskset -c 0 lat_mem_rd 2048 2097152 &
                    taskset -c 1 lat_mem_rd 2048 2097152 >/dev/null 2>&1):
                origin          lwsync          r9
2.00000         3.366           2.944           3.086
3.00000         2.915           3.256           3.095
4.00000         3.043           2.443           2.617
6.00000         2.742           3.367           2.629
8.00000         3.145           3.365           2.443
12.00000        18.267          11.885          13.736
16.00000        15.607          13.312          18.048
24.00000        37.856          37.208          37.855
32.00000        37.856          37.861          37.855
48.00000        37.856          37.861          37.855
64.00000        57.487          37.861          52.505
96.00000        270.445         229.641         143.241
128.00000       284.535         279.907         305.540
192.00000       275.491         298.282         295.592
256.00000       265.155         331.212         259.096
384.00000       276.084         291.023         251.282
512.00000       275.852         335.602         267.074
768.00000       289.682         354.521         312.180
1024.00000      344.968         355.977         343.725
1536.00000      410.961         454.381         412.790
2048.00000      392.984         461.818         397.185

It seems that the using of an additional register do have better performance in
both cases.

Thanks,
Kevin

Attachment: pgpmLKSpYaU6_.pgp
Description: PGP signature

_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to