On 13/10/15 18:01, Andre Vieira wrote:
> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings 
> (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed 
> by ARM under Free BSD license.
> 
> The new aeabi_idiv routine is used to replace the one in 
> libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 
> wrapper. The new routine is under LGPLv3 license.

This is not under LGPLv3. It is under GPLv3 with the runtime library exception 
license; there's a difference. Assuming your licensing expectation is OK ... 
read on for more of a review.

> 
> The main advantage of this version is that it can improve the performance of 
> the aeabi_idiv function for Thumb1. This solution will also increase the code 
> size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
> 
> Make check passed for armv6-m.
> 
> libgcc/ChangeLog:
> 2015-08-10  Hale Wang  <hale.w...@arm.com>
>             Andre Vieira  <andre.simoesdiasvie...@arm.com>
> 
>   * config/arm/lib1funcs.S: Add new wrapper.
> 
> 0001-integer-division.patch
> 
> 
> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
> From: Andre Simoes Dias Vieira <andsi...@arm.com>
> Date: Fri, 21 Aug 2015 14:23:28 +0100
> Subject: [PATCH] new wrapper idivmod
> 
> ---
>  libgcc/config/arm/lib1funcs.S | 250 
> ++++++++++++++++++++++++++++++++++++------
>  1 file changed, 217 insertions(+), 33 deletions(-)
> 
> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
> index 
> 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816
>  100644
> --- a/libgcc/config/arm/lib1funcs.S
> +++ b/libgcc/config/arm/lib1funcs.S
> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>  #ifdef __ARM_EABI__
>  .macro THUMB_LDIV0 name signed
>  #if defined(__ARM_ARCH_6M__)
> -     .ifc \signed, unsigned
> -     cmp     r0, #0
> -     beq     1f
> -     mov     r0, #0
> -     mvn     r0, r0          @ 0xffffffff
> -1:
> -     .else
> -     cmp     r0, #0
> -     beq     2f
> -     blt     3f
> +
> +     push    {r0, lr}
>       mov     r0, #0
> -     mvn     r0, r0
> -     lsr     r0, r0, #1      @ 0x7fffffff
> -     b       2f
> -3:   mov     r0, #0x80
> -     lsl     r0, r0, #24     @ 0x80000000
> -2:
> -     .endif
> -     push    {r0, r1, r2}
> -     ldr     r0, 4f
> -     adr     r1, 4f
> -     add     r0, r1
> -     str     r0, [sp, #8]
> -     @ We know we are not on armv4t, so pop pc is safe.
> -     pop     {r0, r1, pc}
> -     .align  2
> -4:
> -     .word   __aeabi_idiv0 - 4b
> +     bl      SYM(__aeabi_idiv0)
> +     pop     {r1, pc}
> +

I'd still retain the comment about pop pc here, because there's often a 
misconception when merging armv4t and armv6m code.

>  #elif defined(__thumb2__)
>       .syntax unified
>       .ifc \signed, unsigned
> @@ -945,7 +923,170 @@ LSYM(Lover7):
>       add     dividend, work
>    .endif
>  LSYM(Lgot_result):
> -.endm        
> +.endm
> +
> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
> +/* If performance is preferred, the following functions are provided.  */
> +

Comment above #if please and also check elsewhere in patch.

> +/* Branch to div(n), and jump to label if curbit is lower than divisor.  */
> +.macro BranchToDiv n, label
> +     lsr     curbit, dividend, \n
> +     cmp     curbit, divisor
> +     blo     \label
> +.endm
> +
> +/* Body of div(n).  Shift the divisor by n bits and compare the divisor
> +   and dividend.  Update the dividend as the subtraction result.  */
> +.macro DoDiv n
> +     lsr     curbit, dividend, \n
> +     cmp     curbit, divisor
> +     bcc     1f
> +     lsl     curbit, divisor, \n
> +     sub     dividend, dividend, curbit
> +
> +1:   adc     result, result
> +.endm
> +
> +/* The body of division with positive divisor.  Unless the divisor is very
> +   big, shift it up in multiples of four bits, since this is the amount of
> +   unwinding in the main division loop.  Continue shifting until the divisor
> +   is larger than the dividend.  */
> +.macro THUMB1_Div_Positive
> +     mov     result, #0
> +     BranchToDiv #1, LSYM(Lthumb1_div1)
> +     BranchToDiv #4, LSYM(Lthumb1_div4)
> +     BranchToDiv #8, LSYM(Lthumb1_div8)
> +     BranchToDiv #12, LSYM(Lthumb1_div12)
> +     BranchToDiv #16, LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_large_positive):
> +     mov     result, #0xff
> +     lsl     divisor, divisor, #8
> +     rev     result, result
> +     lsr     curbit, dividend, #16
> +     cmp     curbit, divisor
> +     blo     1f
> +     asr     result, #8
> +     lsl     divisor, divisor, #8
> +     beq     LSYM(Ldivbyzero_waypoint)
> +
> +1:   lsr     curbit, dividend, #12
> +     cmp     curbit, divisor
> +     blo     LSYM(Lthumb1_div12)
> +     b       LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_loop):
> +     lsr     divisor, divisor, #8
> +LSYM(Lthumb1_div16):
> +     Dodiv   #15
> +     Dodiv   #14
> +     Dodiv   #13
> +     Dodiv   #12
> +LSYM(Lthumb1_div12):
> +     Dodiv   #11
> +     Dodiv   #10
> +     Dodiv   #9
> +     Dodiv   #8
> +     bcs     LSYM(Lthumb1_div_loop)
> +LSYM(Lthumb1_div8):
> +     Dodiv   #7
> +     Dodiv   #6
> +     Dodiv   #5
> +LSYM(Lthumb1_div5):
> +     Dodiv   #4
> +LSYM(Lthumb1_div4):
> +     Dodiv   #3
> +LSYM(Lthumb1_div3):
> +     Dodiv   #2
> +LSYM(Lthumb1_div2):
> +     Dodiv   #1
> +LSYM(Lthumb1_div1):
> +     sub     divisor, dividend, divisor
> +     bcs     1f
> +     cpy     divisor, dividend
> +
> +1:   adc     result, result
> +     cpy     dividend, result
> +     RET
> +
> +LSYM(Ldivbyzero_waypoint):
> +     b       LSYM(Ldiv0)
> +.endm
> +
> +/* The body of division with negative divisor.  Similar to
> +   THUMB1_Div_Positive except that the shift steps are in multiples
> +   of six bits.  */
> +.macro THUMB1_Div_Negative
> +     lsr     result, divisor, #31
> +     beq     1f
> +     neg     divisor, divisor
> +
> +1:   asr     curbit, dividend, #32
> +     bcc     2f
> +     neg     dividend, dividend
> +
> +2:   eor     curbit, result
> +     mov     result, #0
> +     cpy     ip, curbit
> +     BranchToDiv #4, LSYM(Lthumb1_div_negative4)
> +     BranchToDiv #8, LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_large):
> +     mov     result, #0xfc
> +     lsl     divisor, divisor, #6
> +     rev     result, result
> +     lsr     curbit, dividend, #8
> +     cmp     curbit, divisor
> +     blo     LSYM(Lthumb1_div_negative8)
> +
> +     lsl     divisor, divisor, #6
> +     asr     result, result, #6
> +     cmp     curbit, divisor
> +     blo     LSYM(Lthumb1_div_negative8)
> +
> +     lsl     divisor, divisor, #6
> +     asr     result, result, #6
> +     cmp     curbit, divisor
> +     blo     LSYM(Lthumb1_div_negative8)
> +
> +     lsl     divisor, divisor, #6
> +     beq     LSYM(Ldivbyzero_negative)
> +     asr     result, result, #6
> +     b       LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_negative_loop):
> +     lsr     divisor, divisor, #6
> +LSYM(Lthumb1_div_negative8):
> +     DoDiv   #7
> +     DoDiv   #6
> +     DoDiv   #5
> +     DoDiv   #4
> +LSYM(Lthumb1_div_negative4):
> +     DoDiv   #3
> +     DoDiv   #2
> +     bcs     LSYM(Lthumb1_div_negative_loop)
> +     DoDiv   #1
> +     sub     divisor, dividend, divisor
> +     bcs     1f
> +     cpy     divisor, dividend
> +
> +1:   cpy     curbit, ip
> +     adc     result, result
> +     asr     curbit, curbit, #1
> +     cpy     dividend, result
> +     bcc     2f
> +     neg     dividend, dividend
> +     cmp     curbit, #0
> +
> +2:   bpl     3f
> +     neg     divisor, divisor
> +
> +3:   RET
> +
> +LSYM(Ldivbyzero_negative):
> +     cpy     curbit, ip
> +     asr     curbit, curbit, #1
> +     bcc     LSYM(Ldiv0)
> +     neg     dividend, dividend
> +.endm
> +#endif /* ARM Thumb version.  */
> +
>  /* ------------------------------------------------------------------------ 
> */
>  /*           Start of the Real Functions                                 */
>  /* ------------------------------------------------------------------------ 
> */
> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>  
>       FUNC_START udivsi3
>       FUNC_ALIAS aeabi_uidiv udivsi3
> +#if defined(__OPTIMIZE_SIZE__)
>  
>       cmp     divisor, #0
>       beq     LSYM(Ldiv0)
> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>       pop     { work }
>       RET
>  
> +#else
> +     /* Implementation of aeabi_uidiv for ARMv6m.  This version is only
> +        used in ARMv6-M when we need an efficient implementation.  */
> +LSYM(udivsi3_skip_div0_test):
> +     THUMB1_Div_Positive
> +
> +#endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  
>       ARM_FUNC_START udivsi3
> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>  FUNC_START aeabi_uidivmod
>       cmp     r1, #0
>       beq     LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
>       push    {r0, r1, lr}
>       bl      LSYM(udivsi3_skip_div0_test)
>       POP     {r1, r2, r3}
>       mul     r2, r0
>       sub     r1, r1, r2
>       bx      r3
> +# else
> +     /* Both the quotient and remainder are calculated simultaneously
> +        in THUMB1_Div_Positive.  There is no need to calculate the
> +        remainder again here.  */
> +     b       LSYM(udivsi3_skip_div0_test)
> +     RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  ARM_FUNC_START aeabi_uidivmod
>       cmp     r1, #0
> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>       RET
>       
>  #else  /* ARM version.  */
> -     
> +
>       FUNC_START umodsi3
>  
>       subs    r2, r1, #1                      @ compare divisor with 1
> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>  
>  #if defined(__prefer_thumb__)
>  
> -     FUNC_START divsi3       
> +     FUNC_START divsi3
>       FUNC_ALIAS aeabi_idiv divsi3
> +#if defined(__OPTIMIZE_SIZE__)
>  
>       cmp     divisor, #0
>       beq     LSYM(Ldiv0)
> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>       blo     LSYM(Lgot_result)
>  
>       THUMB_DIV_MOD_BODY 0
> -     
> +
>       mov     r0, result
>       mov     work, ip
>       cmp     work, #0
> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>  LSYM(Lover12):
>       pop     { work }
>       RET
> +#else
> +     /* Implementation of aeabi_idiv for ARMv6m.  This version is only
> +        used in ARMv6-M when we need an efficient implementation.  */
> +LSYM(divsi3_skip_div0_test):
> +     cpy     curbit, dividend
> +     orr     curbit, divisor
> +     bmi     LSYM(Lthumb1_div_negative)
> +
> +LSYM(Lthumb1_div_positive):
> +     THUMB1_Div_Positive
> +
> +LSYM(Lthumb1_div_negative):
> +     THUMB1_Div_Negative
> +
> +#endif /* __OPTIMIZE_SIZE__ */
>  
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  
> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>       RET
>  
>  #else /* ARM/Thumb-2 version.  */
> -     
> -     ARM_FUNC_START divsi3   
> +
> +     ARM_FUNC_START divsi3
>       ARM_FUNC_ALIAS aeabi_idiv divsi3
>  
>       cmp     r1, #0
> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>  FUNC_START aeabi_idivmod
>       cmp     r1, #0
>       beq     LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
>       push    {r0, r1, lr}
>       bl      LSYM(divsi3_skip_div0_test)
>       POP     {r1, r2, r3}
>       mul     r2, r0
>       sub     r1, r1, r2
>       bx      r3
> +# else
> +     /* Both the quotient and remainder are calculated simultaneously
> +        in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
> +        need to calculate the remainder again here.  */
> +     b       LSYM(divsi3_skip_div0_test)
> +     RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  ARM_FUNC_START aeabi_idivmod
>       cmp     r1, #0
> -- 1.9.1
> 

Otherwise OK if no regressions and the following request passes.

Can you ensure that libgcc built for one ARM state and one Thumb-2 state 
(non-v6m) configuration gives identical binaries with and without your patch?

regards
Ramana

Reply via email to