POWER7 has a dedicated stream prefetcher that is pre-programmed via
dcbt rX,rY,0b010?0 instructions at the beginning of vmx_copy.  e6500
has no such prefetcher, so we fall back to issuing regular dcbt
instructions in-loop:

1. at __copy_tofrom_user_power7 entry, we prefetch the first
src and dest lines with dcbt and dcbtst, respectively.

2. for a short copy (16 bytes or less), we don't prefetch any further.

3. else (nonvmx_copy, vmx_copy, unaligned_vmx_copy), we prefetch
LINES_AHEAD lines up front; then, in the inner cacheline-wide
loops, we prefetch the line LINES_AHEAD ahead of the address
currently being copied; and finally we drop into a tail-end
cacheline-wide loop that does no prefetching for the last
LINES_AHEAD iterations (a rough C sketch follows this list).
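
For reference, here is a rough C-level sketch of the scheme above.  It is
illustrative only and not part of the patch: the real code does the copy
with cacheline-wide load/store groups in assembly, and the names
LINE_BYTES and copy_lines_prefetch are made up for the sketch
(LINE_BYTES stands in for L1_CACHE_BYTES).

#include <stddef.h>
#include <string.h>

#define LINE_BYTES  64          /* stands in for L1_CACHE_BYTES */
#define LINES_AHEAD 10          /* empirically chosen look-ahead */

static void copy_lines_prefetch(char *dst, const char *src, size_t lines)
{
	size_t i, ahead = lines < LINES_AHEAD ? lines : LINES_AHEAD;

	/* prime up to LINES_AHEAD lines of source and destination */
	for (i = 0; i < ahead; i++) {
		__builtin_prefetch(src + i * LINE_BYTES, 0);    /* ~dcbt   */
		__builtin_prefetch(dst + i * LINE_BYTES, 1);    /* ~dcbtst */
	}

	/* main loop: prefetch the line LINES_AHEAD past the copy point */
	for (i = 0; i + LINES_AHEAD < lines; i++) {
		__builtin_prefetch(src + (i + LINES_AHEAD) * LINE_BYTES, 0);
		__builtin_prefetch(dst + (i + LINES_AHEAD) * LINE_BYTES, 1);
		memcpy(dst + i * LINE_BYTES, src + i * LINE_BYTES, LINE_BYTES);
	}

	/* tail: last LINES_AHEAD (or fewer) lines, no further prefetch */
	for (; i < lines; i++)
		memcpy(dst + i * LINE_BYTES, src + i * LINE_BYTES, LINE_BYTES);
}

The assembly versions additionally handle the leading sub-cacheline
alignment and the VMX/VPERM copy variants.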

LINES_AHEAD has been chosen emperically to be 10 based on generally
best results on an important target benchmark: 1MB netperf TCP_CRR
runs*, relative to a stock 3.19 kernel with FTR_VMX_COPY turned off
for e6500 (i.e., without this patchseries):

                throughput      mean latency
single-thread   27%             21.2%
8-thread        6.08%           5.8%

The POWER7/BOOK3S_64 code is left completely unaffected by these e6500
changes: we don't have a POWER7 to benchmark with.

Lastly, this patch includes some enhancements made after making
copyuser_power7.S 64-byte-cacheline friendly: the register stack saves
are removed for the non-128-byte-cacheline case, and the branch labels
are renumbered.

* specifically: netperf -v 0 -B "-b 8 -D" -H $dest_ip -l 20 -t TCP_CRR -P 0 -- 
-b 8 -D -r 64,1000000

Signed-off-by: Kim Phillips <kim.phill...@freescale.com>
---
 arch/powerpc/lib/copyuser_power7.S | 254 ++++++++++++++++++++++++++++++++-----
 1 file changed, 221 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/lib/copyuser_power7.S 
b/arch/powerpc/lib/copyuser_power7.S
index 2d22e58..54b70fe 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -14,6 +14,7 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  * Copyright (C) IBM Corporation, 2011
+ * Copyright Freescale Semiconductor, 2015
  *
  * Author: Anton Blanchard <an...@au.ibm.com>
  */
@@ -63,9 +64,11 @@
 
 
 .Ldo_err4:
+#if L1_CACHE_BYTES >= 128
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
+#endif
 .Ldo_err3:
        bl      exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
@@ -74,6 +77,7 @@
 #endif /* CONFIG_ALTIVEC */
 
 .Ldo_err2:
+#if L1_CACHE_BYTES >= 128
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
@@ -81,6 +85,7 @@
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
+#endif
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
 .Lexit:
@@ -93,6 +98,10 @@
 
 
 _GLOBAL(__copy_tofrom_user_power7)
+#ifdef CONFIG_PPC_BOOK3E_64
+       dcbt    0,r4
+       dcbt    0,r3
+#endif
 #ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,4096
@@ -139,12 +148,13 @@ err1;     stw     r0,0(r3)
 
 3:     sub     r5,r5,r6
        cmpldi  r5,L1_CACHE_BYTES
-       blt     5f
+       blt     9f
 
        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
+#if L1_CACHE_BYTES >= 128
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
@@ -152,14 +162,43 @@ err1;     stw     r0,0(r3)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
+#endif
        std     r0,STACKFRAMESIZE+16(r1)
 
-       srdi    r6,r5,L1_CACHE_SHIFT
-       mtctr   r6
+#ifdef CONFIG_PPC_BOOK3E_64
+#define LINES_AHEAD 10
+       clrrdi  r6,r4,L1_CACHE_SHIFT
+       clrrdi  r9,r3,L1_CACHE_SHIFT
+       srdi    r7,r5,L1_CACHE_SHIFT    /* length in cachelines,
+                                        * capped at LINES_AHEAD
+                                        */
+       cmpldi  r7,LINES_AHEAD
+       ble     4f
+       li      r7,LINES_AHEAD
+4:     mtctr   r7
+
+5:     addi    r6,r6,L1_CACHE_BYTES
+       dcbt    0,r6
+       addi    r9,r9,L1_CACHE_BYTES
+       dcbtst  0,r9
+
+       bdnz    5b
+
+       li      r14,LINES_AHEAD*L1_CACHE_BYTES
+#endif
+
+       srdi    r15,r5,L1_CACHE_SHIFT
+#ifdef CONFIG_PPC_BOOK3E_64
+       cmpldi  r15,LINES_AHEAD
+       ble     7f      /* don't prefetch if cachelines <= LINES_AHEAD*/
+       subi    r15,r15,LINES_AHEAD     /* otherwise, r15 <- r15 - LINES_AHEAD */
+#endif
+
+       mtctr   r15
 
        /* Now do cacheline sized loads and stores. */
        .align  5
-4:
+6:
 err2;  ld      r0,0(r4)
 err2;  ld      r6,8(r4)
 err2;  ld      r7,16(r4)
@@ -179,6 +218,9 @@ err2;       ld      r20,112(r4)
 err2;  ld      r21,120(r4)
 #endif
        addi    r4,r4,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+       dcbt    r14,r4
+#endif
 err2;  std     r0,0(r3)
 err2;  std     r6,8(r3)
 err2;  std     r7,16(r3)
@@ -198,12 +240,47 @@ err2;     std     r20,112(r3)
 err2;  std     r21,120(r3)
 #endif
        addi    r3,r3,L1_CACHE_BYTES
-       bdnz    4b
+#ifdef CONFIG_PPC_BOOK3E_64
+       dcbtst  r14,r3
+#endif
+       bdnz    6b
+
+#ifdef CONFIG_PPC_BOOK3E_64
+       srdi    r7,r5,L1_CACHE_SHIFT    /* length in cachelines */
+       subf    r15,r15,r7              /* r15 = r7 - r15 */
+
+7:
+       mtctr   r15
+
+       /* remaining cacheline sized loads and stores, without prefetches. */
+       .align  5
+8:
+err2;  ld      r0,0(r4)
+err2;  ld      r6,8(r4)
+err2;  ld      r7,16(r4)
+err2;  ld      r8,24(r4)
+err2;  ld      r9,32(r4)
+err2;  ld      r10,40(r4)
+err2;  ld      r11,48(r4)
+err2;  ld      r12,56(r4)
+       addi    r4,r4,L1_CACHE_BYTES
+err2;  std     r0,0(r3)
+err2;  std     r6,8(r3)
+err2;  std     r7,16(r3)
+err2;  std     r8,24(r3)
+err2;  std     r9,32(r3)
+err2;  std     r10,40(r3)
+err2;  std     r11,48(r3)
+err2;  std     r12,56(r3)
+       addi    r3,r3,L1_CACHE_BYTES
+       bdnz    8b
+#endif
 
        clrldi  r5,r5,(64-L1_CACHE_SHIFT)
 
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
+#if L1_CACHE_BYTES >= 128
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
@@ -211,14 +288,15 @@ err2;     std     r21,120(r3)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
+#endif
        addi    r1,r1,STACKFRAMESIZE
 
        /* Up to L1_CACHE_BYTES - 1 to go */
-5:     srdi    r6,r5,4
+9:     srdi    r6,r5,4
        mtocrf  0x01,r6
 
 #if L1_CACHE_BYTES >= 128
-6:     bf      cr7*4+1,7f
+10:    bf      cr7*4+1,11f
 err1;  ld      r0,0(r4)
 err1;  ld      r6,8(r4)
 err1;  ld      r7,16(r4)
@@ -240,7 +318,7 @@ err1;       std     r12,56(r3)
 #endif
 
        /* Up to 63B to go */
-7:     bf      cr7*4+2,8f
+11:    bf      cr7*4+2,12f
 err1;  ld      r0,0(r4)
 err1;  ld      r6,8(r4)
 err1;  ld      r7,16(r4)
@@ -253,7 +331,7 @@ err1;       std     r8,24(r3)
        addi    r3,r3,32
 
        /* Up to 31B to go */
-8:     bf      cr7*4+3,9f
+12:    bf      cr7*4+3,13f
 err1;  ld      r0,0(r4)
 err1;  ld      r6,8(r4)
        addi    r4,r4,16
@@ -261,12 +339,12 @@ err1;     std     r0,0(r3)
 err1;  std     r6,8(r3)
        addi    r3,r3,16
 
-9:     clrldi  r5,r5,(64-4)
+13:    clrldi  r5,r5,(64-4)
 
        /* Up to 15B to go */
 .Lshort_copy:
        mtocrf  0x01,r5
-       bf      cr7*4+0,12f
+       bf      cr7*4+0,14f
 err1;  lwz     r0,0(r4)        /* Less chance of a reject with word ops */
 err1;  lwz     r6,4(r4)
        addi    r4,r4,8
@@ -274,23 +352,23 @@ err1;     stw     r0,0(r3)
 err1;  stw     r6,4(r3)
        addi    r3,r3,8
 
-12:    bf      cr7*4+1,13f
+14:    bf      cr7*4+1,15f
 err1;  lwz     r0,0(r4)
        addi    r4,r4,4
 err1;  stw     r0,0(r3)
        addi    r3,r3,4
 
-13:    bf      cr7*4+2,14f
+15:    bf      cr7*4+2,16f
 err1;  lhz     r0,0(r4)
        addi    r4,r4,2
 err1;  sth     r0,0(r3)
        addi    r3,r3,2
 
-14:    bf      cr7*4+3,15f
+16:    bf      cr7*4+3,17f
 err1;  lbz     r0,0(r4)
 err1;  stb     r0,0(r3)
 
-15:    li      r3,0
+17:    li      r3,0
        blr
 
 .Lunwind_stack_nonvmx_copy:
@@ -310,6 +388,7 @@ err1;       stb     r0,0(r3)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0
 
+#ifdef CONFIG_PPC_BOOK3S_64
        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
@@ -342,6 +421,30 @@ err1;      stb     r0,0(r3)
        eieio
        dcbt    r0,r8,0b01010   /* all streams GO */
 .machine pop
+#else
+       /*
+        * We prefetch both the source and destination using regular touch
+        * instructions.
+        */
+       clrrdi  r6,r4,L1_CACHE_SHIFT
+       clrrdi  r9,r3,L1_CACHE_SHIFT
+       srdi    r7,r5,L1_CACHE_SHIFT    /* length in cachelines,
+                                        * capped at LINES_AHEAD
+                                        */
+       cmpldi  r7,LINES_AHEAD
+       ble     2f
+       li      r7,LINES_AHEAD
+2:     mtctr   r7
+
+3:     addi    r6,r6,L1_CACHE_BYTES
+       dcbt    0,r6
+       addi    r9,r9,L1_CACHE_BYTES
+       dcbtst  0,r9
+
+       bdnz    3b
+
+       li      r8,LINES_AHEAD*L1_CACHE_BYTES
+#endif
 
        beq     cr1,.Lunwind_stack_nonvmx_copy
 
@@ -426,6 +529,14 @@ err3;      stvx    vr0,r3,r11
 7:     sub     r5,r5,r6
        srdi    r6,r5,L1_CACHE_SHIFT
 
+#ifdef CONFIG_PPC_BOOK3E_64
+       cmpldi  r6,LINES_AHEAD
+       ble     12f     /* don't prefetch if cachelines <= LINES_AHEAD*/
+       subi    r6,r6,LINES_AHEAD       /* otherwise, r6 <- r6 - LINES_AHEAD*/
+       li      r8,LINES_AHEAD*L1_CACHE_BYTES
+#endif
+
+#if L1_CACHE_BYTES >= 128
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
@@ -434,6 +545,7 @@ err3;       stvx    vr0,r3,r11
        li      r14,80
        li      r15,96
        li      r16,112
+#endif
 
        mtctr   r6
 
@@ -454,6 +566,9 @@ err4;       lvx     vr1,r4,r15
 err4;  lvx     vr0,r4,r16
 #endif
        addi    r4,r4,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+       dcbt    r8,r4
+#endif
 err4;  stvx    vr7,r0,r3
 err4;  stvx    vr6,r3,r9
 err4;  stvx    vr5,r3,r10
@@ -465,11 +580,39 @@ err4;     stvx    vr1,r3,r15
 err4;  stvx    vr0,r3,r16
 #endif
        addi    r3,r3,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+       dcbtst  r8,r3
+#endif
        bdnz    8b
 
+#ifdef CONFIG_PPC_BOOK3E_64
+       srdi    r7,r5,L1_CACHE_SHIFT    /* length in cachelines */
+       subf    r6,r6,r7                /* r6 = r7 - r6 */
+
+12:
+       mtctr   r6
+
+       /* remaining cacheline sized loads and stores, without prefetches.  */
+       .align  5
+13:
+err4;  lvx     vr7,r0,r4
+err4;  lvx     vr6,r4,r9
+err4;  lvx     vr5,r4,r10
+err4;  lvx     vr4,r4,r11
+       addi    r4,r4,L1_CACHE_BYTES
+err4;  stvx    vr7,r0,r3
+err4;  stvx    vr6,r3,r9
+err4;  stvx    vr5,r3,r10
+err4;  stvx    vr4,r3,r11
+       addi    r3,r3,L1_CACHE_BYTES
+       bdnz    13b
+#endif
+
+#if L1_CACHE_BYTES >= 128
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
+#endif
 
        /* Up to L1_CACHE_BYTES - 1 to go */
        clrldi  r5,r5,(64-L1_CACHE_SHIFT)
@@ -477,7 +620,7 @@ err4;       stvx    vr0,r3,r16
        mtocrf  0x01,r6
 
 #if L1_CACHE_BYTES >= 128
-       bf      cr7*4+1,9f
+       bf      cr7*4+1,14f
 err3;  lvx     vr3,r0,r4
 err3;  lvx     vr2,r4,r9
 err3;  lvx     vr1,r4,r10
@@ -490,7 +633,7 @@ err3;       stvx    vr0,r3,r11
        addi    r3,r3,64
 #endif
 
-9:     bf      cr7*4+2,10f
+14:    bf      cr7*4+2,15f
 err3;  lvx     vr1,r0,r4
 err3;  lvx     vr0,r4,r9
        addi    r4,r4,32
@@ -498,38 +641,38 @@ err3;     stvx    vr1,r0,r3
 err3;  stvx    vr0,r3,r9
        addi    r3,r3,32
 
-10:    bf      cr7*4+3,11f
+15:    bf      cr7*4+3,16f
 err3;  lvx     vr1,r0,r4
        addi    r4,r4,16
 err3;  stvx    vr1,r0,r3
        addi    r3,r3,16
 
        /* Up to 15B to go */
-11:    clrldi  r5,r5,(64-4)
+16:    clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
-       bf      cr7*4+0,12f
+       bf      cr7*4+0,17f
 err3;  ld      r0,0(r4)
        addi    r4,r4,8
 err3;  std     r0,0(r3)
        addi    r3,r3,8
 
-12:    bf      cr7*4+1,13f
+17:    bf      cr7*4+1,18f
 err3;  lwz     r0,0(r4)
        addi    r4,r4,4
 err3;  stw     r0,0(r3)
        addi    r3,r3,4
 
-13:    bf      cr7*4+2,14f
+18:    bf      cr7*4+2,19f
 err3;  lhz     r0,0(r4)
        addi    r4,r4,2
 err3;  sth     r0,0(r3)
        addi    r3,r3,2
 
-14:    bf      cr7*4+3,15f
+19:    bf      cr7*4+3,20f
 err3;  lbz     r0,0(r4)
 err3;  stb     r0,0(r3)
 
-15:    addi    r1,r1,STACKFRAMESIZE
+20:    addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
 
 .Lvmx_unaligned_copy:
@@ -620,6 +763,14 @@ err3;      stvx    vr11,r3,r11
 7:     sub     r5,r5,r6
        srdi    r6,r5,L1_CACHE_SHIFT
 
+#ifdef CONFIG_PPC_BOOK3E_64
+       cmpldi  r6,LINES_AHEAD
+       ble     9f            /* don't prefetch if cachelines <= LINES_AHEAD*/
+       subi    r6,r6,LINES_AHEAD     /* otherwise, r6 <- r6 - LINES_AHEAD*/
+       li      r8,LINES_AHEAD*L1_CACHE_BYTES
+#endif
+
+#if L1_CACHE_BYTES >= 128
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
@@ -628,6 +779,7 @@ err3;       stvx    vr11,r3,r11
        li      r14,80
        li      r15,96
        li      r16,112
+#endif
 
        mtctr   r6
 
@@ -659,6 +811,9 @@ err4;       lvx     vr0,r4,r16
        VPERM(vr15,vr1,vr0,vr16)
 #endif
        addi    r4,r4,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+       dcbt    r8,r4
+#endif
 err4;  stvx    vr8,r0,r3
 err4;  stvx    vr9,r3,r9
 err4;  stvx    vr10,r3,r10
@@ -670,11 +825,44 @@ err4;     stvx    vr14,r3,r15
 err4;  stvx    vr15,r3,r16
 #endif
        addi    r3,r3,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+       dcbtst  r8,r3
+#endif
        bdnz    8b
 
+#ifdef CONFIG_PPC_BOOK3E_64
+       srdi    r7,r5,L1_CACHE_SHIFT    /* length in cachelines */
+       subf    r6,r6,r7                /* r6 = r7 - r6 */
+
+9:
+       mtctr   r6
+
+       /* remaining cacheline sized loads and stores, without prefetches.  */
+       .align  5
+10:
+err4;  lvx     vr7,r0,r4
+       VPERM(vr8,vr0,vr7,vr16)
+err4;  lvx     vr6,r4,r9
+       VPERM(vr9,vr7,vr6,vr16)
+err4;  lvx     vr5,r4,r10
+       VPERM(vr10,vr6,vr5,vr16)
+err4;  lvx     vr0,r4,r11
+       VPERM(vr11,vr5,vr0,vr16)
+       addi    r4,r4,L1_CACHE_BYTES
+err4;  stvx    vr8,r0,r3
+err4;  stvx    vr9,r3,r9
+err4;  stvx    vr10,r3,r10
+err4;  stvx    vr11,r3,r11
+       addi    r3,r3,L1_CACHE_BYTES
+
+       bdnz    10b
+#endif
+
+#if L1_CACHE_BYTES >= 128
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
+#endif
 
        /* Up to L1_CACHE_BYTES - 1 to go */
        clrldi  r5,r5,(64-L1_CACHE_SHIFT)
@@ -682,7 +870,7 @@ err4;       stvx    vr15,r3,r16
        mtocrf  0x01,r6
 
 #if L1_CACHE_BYTES >= 128
-       bf      cr7*4+1,9f
+       bf      cr7*4+1,11f
 err3;  lvx     vr3,r0,r4
        VPERM(vr8,vr0,vr3,vr16)
 err3;  lvx     vr2,r4,r9
@@ -699,7 +887,7 @@ err3;       stvx    vr11,r3,r11
        addi    r3,r3,64
 #endif
 
-9:     bf      cr7*4+2,10f
+11:    bf      cr7*4+2,12f
 err3;  lvx     vr1,r0,r4
        VPERM(vr8,vr0,vr1,vr16)
 err3;  lvx     vr0,r4,r9
@@ -709,7 +897,7 @@ err3;       stvx    vr8,r0,r3
 err3;  stvx    vr9,r3,r9
        addi    r3,r3,32
 
-10:    bf      cr7*4+3,11f
+12:    bf      cr7*4+3,13f
 err3;  lvx     vr1,r0,r4
        VPERM(vr8,vr0,vr1,vr16)
        addi    r4,r4,16
@@ -717,10 +905,10 @@ err3;     stvx    vr8,r0,r3
        addi    r3,r3,16
 
        /* Up to 15B to go */
-11:    clrldi  r5,r5,(64-4)
+13:    clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
-       bf      cr7*4+0,12f
+       bf      cr7*4+0,14f
 err3;  lwz     r0,0(r4)        /* Less chance of a reject with word ops */
 err3;  lwz     r6,4(r4)
        addi    r4,r4,8
@@ -728,22 +916,22 @@ err3;     stw     r0,0(r3)
 err3;  stw     r6,4(r3)
        addi    r3,r3,8
 
-12:    bf      cr7*4+1,13f
+14:    bf      cr7*4+1,15f
 err3;  lwz     r0,0(r4)
        addi    r4,r4,4
 err3;  stw     r0,0(r3)
        addi    r3,r3,4
 
-13:    bf      cr7*4+2,14f
+15:    bf      cr7*4+2,16f
 err3;  lhz     r0,0(r4)
        addi    r4,r4,2
 err3;  sth     r0,0(r3)
        addi    r3,r3,2
 
-14:    bf      cr7*4+3,15f
+16:    bf      cr7*4+3,17f
 err3;  lbz     r0,0(r4)
 err3;  stb     r0,0(r3)
 
-15:    addi    r1,r1,STACKFRAMESIZE
+17:    addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
-- 
2.3.3
