On Fri, Jun 08, 2018 at 10:20:41AM +0000, Christophe Leroy wrote:
> The generic implementation of strlen() reads strings byte per byte.
> 
> This patch implements strlen() in assembly based on a read of entire
> words, in the same spirit as what some other arches and glibc do.
> 
> On a 8xx the time spent in strlen is reduced by 2/3 for long strings.
> 
> strlen() selftest on an 8xx provides the following values:
> 
> Before the patch (ie with the generic strlen() in lib/string.c):
> 
> len 256 : time = 0.803648
> len 16  : time = 0.062989
> len 4   : time = 0.026269
> 
> After the patch:
> 
> len 256 : time = 0.267791  ==>  66% improvement
> len 16  : time = 0.037902  ==>  41% improvement
> len 4   : time = 0.026124  ==>  no degradation
> 
> Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
> ---
> Not tested on PPC64.
> 
> Changes in v4:
>  - Added alignment of the loop
>  - doing the andc only if still not 0 as it happens only for bytes above 
> 0x7f which is pretty rare in a string
> 
> Changes in v3:
>  - Made it common to PPC32 and PPC64
> 
> Changes in v2:
>  - Moved handling of unaligned strings outside of the main path as it is very 
> unlikely.
>  - Removed the verification of the fourth byte in case none of the three 
> first ones are NUL.
> 
> 
>  arch/powerpc/include/asm/asm-compat.h |  4 +++
>  arch/powerpc/include/asm/string.h     |  1 +
>  arch/powerpc/lib/string.S             | 57 
> +++++++++++++++++++++++++++++++++++
>  3 files changed, 62 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/asm-compat.h 
> b/arch/powerpc/include/asm/asm-compat.h
> index 7f2a7702596c..0e99fe7570c0 100644
> --- a/arch/powerpc/include/asm/asm-compat.h
> +++ b/arch/powerpc/include/asm/asm-compat.h
> @@ -20,8 +20,10 @@
>  
>  /* operations for longs and pointers */
>  #define PPC_LL               stringify_in_c(ld)
> +#define PPC_LLU              stringify_in_c(ldu)
>  #define PPC_STL              stringify_in_c(std)
>  #define PPC_STLU     stringify_in_c(stdu)
> +#define PPC_ROTLI    stringify_in_c(rotldi)
>  #define PPC_LCMPI    stringify_in_c(cmpdi)
>  #define PPC_LCMPLI   stringify_in_c(cmpldi)
>  #define PPC_LCMP     stringify_in_c(cmpd)
> @@ -53,8 +55,10 @@
>  
>  /* operations for longs and pointers */
>  #define PPC_LL               stringify_in_c(lwz)
> +#define PPC_LLU              stringify_in_c(lwzu)
>  #define PPC_STL              stringify_in_c(stw)
>  #define PPC_STLU     stringify_in_c(stwu)
> +#define PPC_ROTLI    stringify_in_c(rotlwi)
>  #define PPC_LCMPI    stringify_in_c(cmpwi)
>  #define PPC_LCMPLI   stringify_in_c(cmplwi)
>  #define PPC_LCMP     stringify_in_c(cmpw)
> diff --git a/arch/powerpc/include/asm/string.h 
> b/arch/powerpc/include/asm/string.h
> index 9b8cedf618f4..8fdcb532de72 100644
> --- a/arch/powerpc/include/asm/string.h
> +++ b/arch/powerpc/include/asm/string.h
> @@ -13,6 +13,7 @@
>  #define __HAVE_ARCH_MEMCHR
>  #define __HAVE_ARCH_MEMSET16
>  #define __HAVE_ARCH_MEMCPY_FLUSHCACHE
> +#define __HAVE_ARCH_STRLEN
>  
>  extern char * strcpy(char *,const char *);
>  extern char * strncpy(char *,const char *, __kernel_size_t);
> diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
> index 4b41970e9ed8..238f61e2024f 100644
> --- a/arch/powerpc/lib/string.S
> +++ b/arch/powerpc/lib/string.S
> @@ -67,3 +67,60 @@ _GLOBAL(memchr)
>  2:   li      r3,0
>       blr
>  EXPORT_SYMBOL(memchr)
> +
> +_GLOBAL(strlen)
> +     andi.   r9, r3, (SZL - 1)
> +     addi    r10, r3, -SZL
> +     bne-    1f
> +2:   lis     r6, 0x8080
> +     ori     r6, r6, 0x8080          /* r6 = 0x80808080 (himagic) */
> +#ifdef CONFIG_PPC64
> +     rldimi  r6, r6, 32, 0           /* r6 = 0x8080808080808080 (himagic) */
> +#endif
> +     PPC_ROTLI  r7, r6, 1            /* r7 = 0x01010101(01010101) (lomagic)*/
> +     .balign IFETCH_ALIGN_BYTES
> +3:   PPC_LLU r9, SZL(r10)
> +     /* ((x - lomagic) & ~x & himagic) == 0 means no byte in x is NUL */
> +     subf    r8, r7, r9
> +     and.    r8, r8, r6
> +     beq+    3b
> +     andc.   r8, r8, r9
> +     beq+    3b
> +#ifdef CONFIG_PPC64
> +     rldicl. r8, r9, 8, 56
> +     beq     20f
> +     rldicl. r8, r9, 16, 56
> +     beq     21f
> +     rldicl. r8, r9, 24, 56
> +     beq     22f
> +     rldicl. r8, r9, 32, 56
> +     beq     23f
> +     addi    r10, r10, 4
> +#endif
> +     rlwinm. r8, r9, 0, 0xff000000
> +     beq     20f
> +     rlwinm. r8, r9, 0, 0x00ff0000
> +     beq     21f
> +     rlwinm. r8, r9, 0, 0x0000ff00
> +     beq     22f
> +23:  subf    r3, r3, r10

Actually these rlwinm. can likely be replaced by a single
cntlzw /cntlzd; for 32 bit something like:

        cntlzw  r8,r9
        subf    r3,r3,r10       
        srwi    r8,r8,3
        add     r3,r3,r8
        blr

and similar for 64 bit but with cntlzd.

        Gabriel


> +     addi    r3, r3, 3
> +     blr
> +22:  subf    r3, r3, r10
> +     addi    r3, r3, 2
> +     blr
> +21:  subf    r3, r3, r10
> +     addi    r3, r3, 1
> +     blr
> +19:  addi    r10, r10, (SZL - 1)
> +20:  subf    r3, r3, r10
> +     blr
> +
> +1:   lbz     r9, SZL(r10)
> +     addi    r10, r10, 1
> +     cmpwi   cr1, r9, 0
> +     andi.   r9, r10, (SZL - 1)
> +     beq     cr1, 19b
> +     bne     1b
> +     b       2b
> +EXPORT_SYMBOL(strlen)
> -- 
> 2.13.3
> 

Reply via email to