The current 64-bit csum_partial_copy_generic function is based on the 32-bit version and was never optimized for 64 bit. This patch takes the 64-bit memcpy and adapts it to compute the checksum as it copies. It has been tested on a variety of input sizes and alignments on Power5 and Power6 processors and gives correct output for all cases tested. It also runs 20-55% faster than the implementation it replaces, depending on size, alignment, and processor.
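As a rough illustration (not part of the patch), the arithmetic the new assembly performs can be sketched in C: the adde/addze chain is an add with end-around carry, and the rldicl/add/srdi tail folds the 64-bit accumulator into a 32-bit ones' complement sum. The helper names below (csum_add64, csum_fold64, copy_and_sum) are made up for the sketch, which assumes 8-byte-aligned buffers with no tail bytes.

#include <stdint.h>
#include <stddef.h>

/* Add with end-around carry, like the adde/addze chain in the asm. */
static uint64_t csum_add64(uint64_t sum, uint64_t x)
{
	uint64_t r = sum + x;
	return r + (r < x);		/* feed the carry bit back in */
}

/* Fold a 64-bit accumulator to 32 bits, mirroring rldicl/add/srdi:
 * add the two 32-bit halves with end-around carry. */
static uint32_t csum_fold64(uint64_t sum)
{
	uint64_t rot = (sum << 32) | (sum >> 32);
	return (uint32_t)((rot + sum) >> 32);
}

/* Copy and sum a buffer of ndwords aligned doublewords. */
static uint32_t copy_and_sum(void *dst, const void *src, size_t ndwords,
			     uint32_t initial)
{
	const uint64_t *s = src;
	uint64_t *d = dst;
	uint64_t sum = initial;
	size_t i;

	for (i = 0; i < ndwords; i++) {
		d[i] = s[i];
		sum = csum_add64(sum, s[i]);
	}
	return csum_fold64(sum);
}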

I think there is still some room for improvement in the unaligned case, but given that it is much faster than what we have now I figured I'd send it out.
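The value returned in r3 is still a 32-bit ones' complement sum, as the comment added below notes; callers fold it to 16 bits and invert it to get the final TCP/IP checksum. A minimal sketch of that last step (essentially what csum_fold() does; the helper name here is made up):

#include <stdint.h>

/* Fold a 32-bit ones' complement sum to 16 bits and invert it to get
 * the final TCP/IP checksum. */
static uint16_t fold_to_16_and_invert(uint32_t sum)
{
	sum = (sum >> 16) + (sum & 0xffff);	/* fold high half into low */
	sum += sum >> 16;			/* add back any carry */
	return (uint16_t)~sum;
}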

Signed-off-by: Joel Schopp <[EMAIL PROTECTED]>

Index: 2.6.26/arch/powerpc/lib/checksum_64.S
===================================================================
--- 2.6.26.orig/arch/powerpc/lib/checksum_64.S
+++ 2.6.26/arch/powerpc/lib/checksum_64.S
@@ -22,8 +22,7 @@
  * len is in words and is always >= 5.
  *
  * In practice len == 5, but this is not guaranteed.  So this code does not
- * attempt to use doubleword instructions.
- */
+ * attempt to use doubleword instructions. */
 _GLOBAL(ip_fast_csum)
        lwz     r0,0(r3)
        lwzu    r5,4(r3)
@@ -122,108 +121,286 @@ _GLOBAL(csum_partial)
  * to *src_err or *dst_err respectively, and (for an error on
  * src) zeroes the rest of dst.
  *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky.  For now we'll leave it until we instrument it somehow.
+ * This returns a 32-bit ones' complement sum that can be folded to 16 bits
+ * and inverted to produce a 16-bit TCP/IP checksum.
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-       addic   r0,r6,0
-       subi    r3,r3,4
-       subi    r4,r4,4
-       srwi.   r6,r5,2
-       beq     3f              /* if we're doing < 4 bytes */
-       andi.   r9,r4,2         /* Align dst to longword boundary */
-       beq+    1f
-81:    lhz     r6,4(r3)        /* do 2 bytes to get aligned */
-       addi    r3,r3,2
-       subi    r5,r5,2
-91:    sth     r6,4(r4)
-       addi    r4,r4,2
-       addc    r0,r0,r6
-       srwi.   r6,r5,2         /* # words to do */
+       std     r7,48(r1)       /* save the error pointers ... */
+       std     r8,56(r1)       /* ... for the exception handlers */
+       PPC_MTOCRF      0x01,r5
+       cmpldi  cr1,r5,16
+       neg     r11,r4          # LS 3 bits = # bytes to 8-byte dest bdry
+       andi.   r11,r11,7
+       dcbt    0,r3
+       blt     cr1,.Lshort_copy
+       bne     .Ldst_unaligned
+.Ldst_aligned:
+       andi.   r0,r3,7
+       addi    r4,r4,-16
+       bne     .Lsrc_unaligned
+       srdi    r10,r5,4                /* src and dst aligned */
+80:    ld      r9,0(r3)
+       addi    r3,r3,-8
+       mtctr   r10
+       andi.   r5,r5,7
+       bf      cr7*4+0,2f
+       addi    r4,r4,8
+       addi    r3,r3,8
+       mr      r12,r9
+       blt     cr1,3f
+1:
+81:    ld      r9,8(r3)
+82:    std     r12,8(r4)
+       adde    r6,r6,r12       /* add to checksum */
+2:
+83:    ldu     r12,16(r3)
+84:    stdu    r9,16(r4)
+       adde    r6,r6,r9        /* add to checksum */
+       bdnz    1b
+3:
+85:    std     r12,8(r4)
+       adde    r6,r6,r12       /* add to checksum */
        beq     3f
-1:     mtctr   r6
-82:    lwzu    r6,4(r3)        /* the bdnz has zero overhead, so it should */
-92:    stwu    r6,4(r4)        /* be unnecessary to unroll this loop */
-       adde    r0,r0,r6
-       bdnz    82b
-       andi.   r5,r5,3
-3:     cmpwi   0,r5,2
-       blt+    4f
-83:    lhz     r6,4(r3)
+       addi    r4,r4,16
+       ld      r9,8(r3)
+.Ldo_tail:
+       bf      cr7*4+1,1f
+       rotldi  r9,r9,32
+86:    stw     r9,0(r4)
+       adde    r6,r6,r9        /* add to checksum */
+       addi    r4,r4,4
+1:     bf      cr7*4+2,2f
+       rotldi  r9,r9,16
+87:    sth     r9,0(r4)
+       adde    r6,r6,r9        /* add to checksum */
+       addi    r4,r4,2
+2:     bf      cr7*4+3,3f
+       rotldi  r9,r9,8
+88:    stb     r9,0(r4)
+       adde    r6,r6,r9        /* add to checksum */
+3:     addze   r6,r6           /* add in final carry (unlikely with 64-bit regs) */
+        rldicl  r9,r6,32,0    /* fold 64 bit value */
+        add     r3,r9,r6
+        srdi    r3,r3,32
+       blr                     /* return sum */
+
+.Lsrc_unaligned:
+       srdi    r11,r5,3
+       addi    r5,r5,-16
+       subf    r3,r0,r3
+       srdi    r7,r5,4
+       sldi    r10,r0,3
+       cmpdi   cr6,r11,3
+       andi.   r5,r5,7
+       mtctr   r7
+       subfic  r12,r10,64
+       add     r5,r5,r0
+
+       bt      cr7*4+0,0f
+
+115:   ld      r9,0(r3)        # 3+2n loads, 2+2n stores
+116:   ld      r0,8(r3)
+       sld     r11,r9,r10
+117:   ldu     r9,16(r3)
+       srd     r7,r0,r12
+       sld     r8,r0,r10
+       or      r7,r7,r11
+       blt     cr6,4f
+118:   ld      r0,8(r3)
+       # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r11 & r12
+       b       2f
+
+0:
+113:   ld      r0,0(r3)        # 4+2n loads, 3+2n stores
+114:   ldu     r9,8(r3)
+       sld     r8,r0,r10
+       addi    r4,r4,-8
+       blt     cr6,5f
+119:   ld      r0,8(r3)
+       mr      r7,r12          /* need more registers */
+       srd     r12,r9,r12
+       sld     r11,r9,r10
+120:   ldu     r9,16(r3)
+       or      r12,r8,r12
+       srd     r7,r0,r7        /* lost value but can recreate from r10 */
+       sld     r8,r0,r10
+       addi    r4,r4,16
+       beq     cr6,3f
+
+       # d0=(s0<<|s1>>) in r12, s1<< in r11, s2>> in r7, s2<< in r8, s3 in r9
+1:     or      r7,r7,r11
+89:    ld      r0,8(r3)
+90:    std     r12,8(r4)
+       adde    r6,r6,r12       /* add to checksum */
+2:     subfic  r12,r10,64      /* recreate value from r10 */
+       srd     r12,r9,r12
+       sld     r11,r9,r10
+91:    ldu     r9,16(r3)
+       or      r12,r8,r12
+92:    stdu    r7,16(r4)
+       adde    r6,r6,r7        /* add to checksum */
+       subfic  r7,r10,64       /* recreate value from r10 */
+       srd     r7,r0,r7
+       sld     r8,r0,r10
+       bdnz    1b
+
+3:
+93:    std     r12,8(r4)
+       adde    r6,r6,r12       /* add to checksum */
+       or      r7,r7,r11
+4:
+94:    std     r7,16(r4)
+       adde    r6,r6,r7        /* add to checksum */
+5:     subfic  r12,r10,64      /* recreate value from r10 */
+       srd     r12,r9,r12
+       or      r12,r8,r12
+95:    std     r12,24(r4)
+       adde    r6,r6,r12       /* add to checksum */
+       beq     4f
+       cmpwi   cr1,r5,8
+       addi    r4,r4,32
+       sld     r9,r9,r10
+       ble     cr1,.Ldo_tail
+96:    ld      r0,8(r3)
+       srd     r7,r0,r12
+       or      r9,r7,r9
+       b       .Ldo_tail
+
+.Ldst_unaligned:
+       PPC_MTOCRF      0x01,r11                # put #bytes to 8B bdry into cr7
+       subf    r5,r11,r5
+       li      r10,0
+       cmpldi  cr1,r5,16
+       bf      cr7*4+3,1f
+97:    lbz     r0,0(r3)
+98:    stb     r0,0(r4)
+       adde    r6,r6,r0        /* add to checksum */
+       addi    r10,r10,1
+1:     bf      cr7*4+2,2f
+99:    lhzx    r0,r10,r3
+100:   sthx    r0,r10,r4
+       adde    r6,r6,r0        /* add to checksum */
+       addi    r10,r10,2
+2:     bf      cr7*4+1,3f
+101:   lwzx    r0,r10,r3
+102:   stwx    r0,r10,r4
+       adde    r6,r6,r0        /* add to checksum */
+3:     PPC_MTOCRF      0x01,r5
+       add     r3,r11,r3
+       add     r4,r11,r4
+       b       .Ldst_aligned
+
+.Lshort_copy:
+       bf      cr7*4+0,1f
+103:   lwz     r0,0(r3)
+104:   lwz     r9,4(r3)
+       addi    r3,r3,8
+105:   stw     r0,0(r4)
+106:   stw     r9,4(r4)
+       adde    r6,r6,r0
+       adde    r6,r6,r9
+       addi    r4,r4,8
+1:     bf      cr7*4+1,2f
+107:   lwz     r0,0(r3)
+       addi    r3,r3,4
+108:   stw     r0,0(r4)
+       adde    r6,r6,r0
+       addi    r4,r4,4
+2:     bf      cr7*4+2,3f
+109:   lhz     r0,0(r3)
        addi    r3,r3,2
-       subi    r5,r5,2
-93:    sth     r6,4(r4)
+110:   sth     r0,0(r4)
+       adde    r6,r6,r0
        addi    r4,r4,2
-       adde    r0,r0,r6
-4:     cmpwi   0,r5,1
-       bne+    5f
-84:    lbz     r6,4(r3)
-94:    stb     r6,4(r4)
-       slwi    r6,r6,8         /* Upper byte of word */
-       adde    r0,r0,r6
-5:     addze   r3,r0           /* add in final carry (unlikely with 64-bit regs) */
-        rldicl  r4,r3,32,0      /* fold 64 bit value */
-        add     r3,r4,r3
+3:     bf      cr7*4+3,4f
+111:   lbz     r0,0(r3)
+112:   stb     r0,0(r4)
+       adde    r6,r6,r0
+4:     addze   r6,r6           /* add in final carry (unlikely with 64-bit regs) */
+        rldicl  r9,r6,32,0    /* fold 64 bit value */
+        add     r3,r9,r6
         srdi    r3,r3,32
-       blr
+       blr                     /* return sum */

 /* These shouldn't go in the fixup section, since that would
    cause the ex_table addresses to get out of order. */

-       .globl src_error_1
-src_error_1:
-       li      r6,0
-       subi    r5,r5,2
-95:    sth     r6,4(r4)
-       addi    r4,r4,2
-       srwi.   r6,r5,2
-       beq     3f
-       mtctr   r6
-       .globl src_error_2
-src_error_2:
-       li      r6,0
-96:    stwu    r6,4(r4)
-       bdnz    96b
-3:     andi.   r5,r5,3
-       beq     src_error
-       .globl src_error_3
-src_error_3:
-       li      r6,0
-       mtctr   r5
-       addi    r4,r4,3
-97:    stbu    r6,1(r4)
-       bdnz    97b
+/* Load/store exception handlers */
        .globl src_error
 src_error:
-       cmpdi   0,r7,0
+       ld      r7,48(r1)       /* restore src_error */
+
+       li      r11,0
+       mtctr   r5              /* non-optimized zero-out that we hope ... */
+121:   stbu    r11,1(r4)       /* ... never to hit */
+       bdnz    121b
+       cmpdi   0,r7,0          /* if it isn't NULL write EFAULT into it */
        beq     1f
-       li      r6,-EFAULT
-       stw     r6,0(r7)
-1:     addze   r3,r0
+       li      r11,-EFAULT
+       stw     r11,0(r7)
+1:     addze   r3,r6           /* add any carry */
        blr

        .globl dst_error
 dst_error:
+       ld      r8,56(r1)       /* restore dst_error */
        cmpdi   0,r8,0
        beq     1f
-       li      r6,-EFAULT
-       stw     r6,0(r8)
-1:     addze   r3,r0
+       li      r11,-EFAULT
+       stw     r11,0(r8)
+1:     addze   r3,r6           /* add any carry */
        blr

 .section __ex_table,"a"
        .align  3
-       .llong  81b,src_error_1
-       .llong  91b,dst_error
-       .llong  82b,src_error_2
-       .llong  92b,dst_error
-       .llong  83b,src_error_3
-       .llong  93b,dst_error
-       .llong  84b,src_error_3
-       .llong  94b,dst_error
-       .llong  95b,dst_error
-       .llong  96b,dst_error
-       .llong  97b,dst_error
+       /*
+        * The numeric labels mark loads and stores for which we have
+        * to catch exceptions and recover.
+        */
+       .llong  80b,src_error
+       .llong  81b,src_error
+       .llong  82b,dst_error
+       .llong  83b,src_error
+       .llong  84b,dst_error
+       .llong  85b,dst_error
+       .llong  86b,dst_error
+       .llong  87b,dst_error
+       .llong  88b,dst_error
+       .llong  115b,src_error
+       .llong  116b,src_error
+       .llong  117b,src_error
+       .llong  118b,src_error
+       .llong  113b,src_error
+       .llong  114b,src_error
+       .llong  119b,src_error
+       .llong  120b,src_error
+       .llong  89b,src_error
+       .llong  90b,dst_error
+       .llong  91b,src_error
+       .llong  92b,dst_error
+       .llong  93b,dst_error
+       .llong  94b,dst_error
+       .llong  95b,dst_error
+       .llong  96b,src_error
+       .llong  97b,src_error
+       .llong  98b,dst_error
+       .llong  99b,src_error
+       .llong  100b,dst_error
+       .llong  101b,src_error
+       .llong  102b,dst_error
+       .llong  103b,src_error
+       .llong  104b,src_error
+       .llong  105b,dst_error
+       .llong  106b,dst_error
+       .llong  107b,src_error
+       .llong  108b,dst_error
+       .llong  109b,src_error
+       .llong  110b,dst_error
+       .llong  111b,src_error
+       .llong  112b,dst_error
+       .llong  121b,dst_error