On 11/07/16 18:09, Andre Vieira (lists) wrote:
> On 06/07/16 11:52, Andre Vieira (lists) wrote:
>> On 01/07/16 14:40, Ramana Radhakrishnan wrote:
>>>
>>>
>>> On 13/10/15 18:01, Andre Vieira wrote:
>>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings 
>>>> (https://git.linaro.org/toolchain/cortex-strings.git), which was 
>>>> contributed by ARM under the FreeBSD license.
>>>>
>>>> The new aeabi_idiv routine is used to replace the one in 
>>>> libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 
>>>> wrapper. The new routine is under LGPLv3 license.
>>>
>>> This is not under LGPLv3. It is under GPLv3 with the runtime library
>>> exception; there's a difference. Assuming your licensing
>>> expectation is OK... read on for more of a review.
>>>
>>>>
>>>> The main advantage of this version is that it improves the performance
>>>> of the aeabi_idiv function for Thumb1. It also increases the code size,
>>>> so it is only used if __OPTIMIZE_SIZE__ is not defined.
>>>>
>>>> Make check passed for armv6-m.
>>>>
>>>> libgcc/ChangeLog:
>>>> 2015-08-10  Hale Wang  <hale.w...@arm.com>
>>>>             Andre Vieira  <andre.simoesdiasvie...@arm.com>
>>>>
>>>>   * config/arm/lib1funcs.S: Add new wrapper.
>>>>
>>>> 0001-integer-division.patch
>>>>
>>>>
>>>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>>>> From: Andre Simoes Dias Vieira <andsi...@arm.com>
>>>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>>>> Subject: [PATCH] new wrapper idivmod
>>>>
>>>> ---
>>>>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>>>>  1 file changed, 217 insertions(+), 33 deletions(-)
>>>>
>>>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>>>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>>>> --- a/libgcc/config/arm/lib1funcs.S
>>>> +++ b/libgcc/config/arm/lib1funcs.S
>>>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>>>>  #ifdef __ARM_EABI__
>>>>  .macro THUMB_LDIV0 name signed
>>>>  #if defined(__ARM_ARCH_6M__)
>>>> -  .ifc \signed, unsigned
>>>> -  cmp     r0, #0
>>>> -  beq     1f
>>>> -  mov     r0, #0
>>>> -  mvn     r0, r0          @ 0xffffffff
>>>> -1:
>>>> -  .else
>>>> -  cmp     r0, #0
>>>> -  beq     2f
>>>> -  blt     3f
>>>> +
>>>> +  push    {r0, lr}
>>>>    mov     r0, #0
>>>> -  mvn     r0, r0
>>>> -  lsr     r0, r0, #1      @ 0x7fffffff
>>>> -  b       2f
>>>> -3:        mov     r0, #0x80
>>>> -  lsl     r0, r0, #24     @ 0x80000000
>>>> -2:
>>>> -  .endif
>>>> -  push    {r0, r1, r2}
>>>> -  ldr     r0, 4f
>>>> -  adr     r1, 4f
>>>> -  add     r0, r1
>>>> -  str     r0, [sp, #8]
>>>> -  @ We know we are not on armv4t, so pop pc is safe.
>>>> -  pop     {r0, r1, pc}
>>>> -  .align  2
>>>> -4:
>>>> -  .word   __aeabi_idiv0 - 4b
>>>> +  bl      SYM(__aeabi_idiv0)
>>>> +  pop     {r1, pc}
>>>> +
>>>
>>> I'd still retain the comment about pop pc here, because there's often a
>>> misconception when merging armv4t and armv6m code.
>>>
>>>>  #elif defined(__thumb2__)
>>>>    .syntax unified
>>>>    .ifc \signed, unsigned
>>>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>>>>    add     dividend, work
>>>>    .endif
>>>>  LSYM(Lgot_result):
>>>> -.endm     
>>>> +.endm
>>>> +
>>>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>>>> +/* If performance is preferred, the following functions are provided.  */
>>>> +
>>>
>>> Please put the comment above the #if, and check elsewhere in the patch too.
>>>
>>>> +/* Branch to div(n), and jump to label if curbit is lower than the divisor.  */
>>>> +.macro BranchToDiv n, label
>>>> +  lsr     curbit, dividend, \n
>>>> +  cmp     curbit, divisor
>>>> +  blo     \label
>>>> +.endm
>>>> +
>>>> +/* Body of div(n).  Shift the divisor left by n bits and compare it with
>>>> +   the dividend.  Update the dividend with the subtraction result.  */
>>>> +.macro DoDiv n
>>>> +  lsr     curbit, dividend, \n
>>>> +  cmp     curbit, divisor
>>>> +  bcc     1f
>>>> +  lsl     curbit, divisor, \n
>>>> +  sub     dividend, dividend, curbit
>>>> +
>>>> +1:        adc     result, result
>>>> +.endm
>>>> +
>>>> +/* The body of division with positive divisor.  Unless the divisor is very
>>>> +   big, shift it up in multiples of four bits, since this is the amount of
>>>> +   unwinding in the main division loop.  Continue shifting until the divisor
>>>> +   is larger than the dividend.  */
>>>> +.macro THUMB1_Div_Positive
>>>> +  mov     result, #0
>>>> +  BranchToDiv #1, LSYM(Lthumb1_div1)
>>>> +  BranchToDiv #4, LSYM(Lthumb1_div4)
>>>> +  BranchToDiv #8, LSYM(Lthumb1_div8)
>>>> +  BranchToDiv #12, LSYM(Lthumb1_div12)
>>>> +  BranchToDiv #16, LSYM(Lthumb1_div16)
>>>> +LSYM(Lthumb1_div_large_positive):
>>>> +  mov     result, #0xff
>>>> +  lsl     divisor, divisor, #8
>>>> +  rev     result, result
>>>> +  lsr     curbit, dividend, #16
>>>> +  cmp     curbit, divisor
>>>> +  blo     1f
>>>> +  asr     result, #8
>>>> +  lsl     divisor, divisor, #8
>>>> +  beq     LSYM(Ldivbyzero_waypoint)
>>>> +
>>>> +1:        lsr     curbit, dividend, #12
>>>> +  cmp     curbit, divisor
>>>> +  blo     LSYM(Lthumb1_div12)
>>>> +  b       LSYM(Lthumb1_div16)
>>>> +LSYM(Lthumb1_div_loop):
>>>> +  lsr     divisor, divisor, #8
>>>> +LSYM(Lthumb1_div16):
>>>> +  DoDiv   #15
>>>> +  DoDiv   #14
>>>> +  DoDiv   #13
>>>> +  DoDiv   #12
>>>> +LSYM(Lthumb1_div12):
>>>> +  DoDiv   #11
>>>> +  DoDiv   #10
>>>> +  DoDiv   #9
>>>> +  DoDiv   #8
>>>> +  bcs     LSYM(Lthumb1_div_loop)
>>>> +LSYM(Lthumb1_div8):
>>>> +  DoDiv   #7
>>>> +  DoDiv   #6
>>>> +  DoDiv   #5
>>>> +LSYM(Lthumb1_div5):
>>>> +  DoDiv   #4
>>>> +LSYM(Lthumb1_div4):
>>>> +  DoDiv   #3
>>>> +LSYM(Lthumb1_div3):
>>>> +  DoDiv   #2
>>>> +LSYM(Lthumb1_div2):
>>>> +  DoDiv   #1
>>>> +LSYM(Lthumb1_div1):
>>>> +  sub     divisor, dividend, divisor
>>>> +  bcs     1f
>>>> +  cpy     divisor, dividend
>>>> +
>>>> +1:        adc     result, result
>>>> +  cpy     dividend, result
>>>> +  RET
>>>> +
>>>> +LSYM(Ldivbyzero_waypoint):
>>>> +  b       LSYM(Ldiv0)
>>>> +.endm
>>>> +
>>>> +/* The body of division with negative divisor.  Similar to
>>>> +   THUMB1_Div_Positive except that the shift steps are in multiples
>>>> +   of six bits.  */
>>>> +.macro THUMB1_Div_Negative
>>>> +  lsr     result, divisor, #31
>>>> +  beq     1f
>>>> +  neg     divisor, divisor
>>>> +
>>>> +1:        asr     curbit, dividend, #32
>>>> +  bcc     2f
>>>> +  neg     dividend, dividend
>>>> +
>>>> +2:        eor     curbit, result
>>>> +  mov     result, #0
>>>> +  cpy     ip, curbit
>>>> +  BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>>>> +  BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>>>> +LSYM(Lthumb1_div_large):
>>>> +  mov     result, #0xfc
>>>> +  lsl     divisor, divisor, #6
>>>> +  rev     result, result
>>>> +  lsr     curbit, dividend, #8
>>>> +  cmp     curbit, divisor
>>>> +  blo     LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +  lsl     divisor, divisor, #6
>>>> +  asr     result, result, #6
>>>> +  cmp     curbit, divisor
>>>> +  blo     LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +  lsl     divisor, divisor, #6
>>>> +  asr     result, result, #6
>>>> +  cmp     curbit, divisor
>>>> +  blo     LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +  lsl     divisor, divisor, #6
>>>> +  beq     LSYM(Ldivbyzero_negative)
>>>> +  asr     result, result, #6
>>>> +  b       LSYM(Lthumb1_div_negative8)
>>>> +LSYM(Lthumb1_div_negative_loop):
>>>> +  lsr     divisor, divisor, #6
>>>> +LSYM(Lthumb1_div_negative8):
>>>> +  DoDiv   #7
>>>> +  DoDiv   #6
>>>> +  DoDiv   #5
>>>> +  DoDiv   #4
>>>> +LSYM(Lthumb1_div_negative4):
>>>> +  DoDiv   #3
>>>> +  DoDiv   #2
>>>> +  bcs     LSYM(Lthumb1_div_negative_loop)
>>>> +  DoDiv   #1
>>>> +  sub     divisor, dividend, divisor
>>>> +  bcs     1f
>>>> +  cpy     divisor, dividend
>>>> +
>>>> +1:        cpy     curbit, ip
>>>> +  adc     result, result
>>>> +  asr     curbit, curbit, #1
>>>> +  cpy     dividend, result
>>>> +  bcc     2f
>>>> +  neg     dividend, dividend
>>>> +  cmp     curbit, #0
>>>> +
>>>> +2:        bpl     3f
>>>> +  neg     divisor, divisor
>>>> +
>>>> +3:        RET
>>>> +
>>>> +LSYM(Ldivbyzero_negative):
>>>> +  cpy     curbit, ip
>>>> +  asr     curbit, curbit, #1
>>>> +  bcc     LSYM(Ldiv0)
>>>> +  neg     dividend, dividend
>>>> +.endm
>>>> +#endif /* ARM Thumb version.  */
>>>> +
>>>>  /* ------------------------------------------------------------------------ */
>>>>  /*                Start of the Real Functions                               */
>>>>  /* ------------------------------------------------------------------------ */
>>>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>>>  
>>>>    FUNC_START udivsi3
>>>>    FUNC_ALIAS aeabi_uidiv udivsi3
>>>> +#if defined(__OPTIMIZE_SIZE__)
>>>>  
>>>>    cmp     divisor, #0
>>>>    beq     LSYM(Ldiv0)
>>>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>>>>    pop     { work }
>>>>    RET
>>>>  
>>>> +#else
>>>> +  /* Implementation of aeabi_uidiv for ARMv6-M.  This version is only
>>>> +     used in ARMv6-M when we need an efficient implementation.  */
>>>> +LSYM(udivsi3_skip_div0_test):
>>>> +  THUMB1_Div_Positive
>>>> +
>>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  
>>>>    ARM_FUNC_START udivsi3
>>>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>>>>  FUNC_START aeabi_uidivmod
>>>>    cmp     r1, #0
>>>>    beq     LSYM(Ldiv0)
>>>> +# if defined(__OPTIMIZE_SIZE__)
>>>>    push    {r0, r1, lr}
>>>>    bl      LSYM(udivsi3_skip_div0_test)
>>>>    POP     {r1, r2, r3}
>>>>    mul     r2, r0
>>>>    sub     r1, r1, r2
>>>>    bx      r3
>>>> +# else
>>>> +  /* Both the quotient and remainder are calculated simultaneously
>>>> +     in THUMB1_Div_Positive.  There is no need to calculate the
>>>> +     remainder again here.  */
>>>> +  b       LSYM(udivsi3_skip_div0_test)
>>>> +  RET
>>>> +# endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  ARM_FUNC_START aeabi_uidivmod
>>>>    cmp     r1, #0
>>>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>>>>    RET
>>>>    
>>>>  #else  /* ARM version.  */
>>>> -  
>>>> +
>>>>    FUNC_START umodsi3
>>>>  
>>>>    subs    r2, r1, #1                      @ compare divisor with 1
>>>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>>>  
>>>>  #if defined(__prefer_thumb__)
>>>>  
>>>> -  FUNC_START divsi3       
>>>> +  FUNC_START divsi3
>>>>    FUNC_ALIAS aeabi_idiv divsi3
>>>> +#if defined(__OPTIMIZE_SIZE__)
>>>>  
>>>>    cmp     divisor, #0
>>>>    beq     LSYM(Ldiv0)
>>>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>>>>    blo     LSYM(Lgot_result)
>>>>  
>>>>    THUMB_DIV_MOD_BODY 0
>>>> -  
>>>> +
>>>>    mov     r0, result
>>>>    mov     work, ip
>>>>    cmp     work, #0
>>>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>>>>  LSYM(Lover12):
>>>>    pop     { work }
>>>>    RET
>>>> +#else
>>>> +  /* Implementation of aeabi_idiv for ARMv6-M.  This version is only
>>>> +     used in ARMv6-M when we need an efficient implementation.  */
>>>> +LSYM(divsi3_skip_div0_test):
>>>> +  cpy     curbit, dividend
>>>> +  orr     curbit, divisor
>>>> +  bmi     LSYM(Lthumb1_div_negative)
>>>> +
>>>> +LSYM(Lthumb1_div_positive):
>>>> +  THUMB1_Div_Positive
>>>> +
>>>> +LSYM(Lthumb1_div_negative):
>>>> +  THUMB1_Div_Negative
>>>> +
>>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>>  
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  
>>>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>>>>    RET
>>>>  
>>>>  #else /* ARM/Thumb-2 version.  */
>>>> -  
>>>> -  ARM_FUNC_START divsi3   
>>>> +
>>>> +  ARM_FUNC_START divsi3
>>>>    ARM_FUNC_ALIAS aeabi_idiv divsi3
>>>>  
>>>>    cmp     r1, #0
>>>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>>>>  FUNC_START aeabi_idivmod
>>>>    cmp     r1, #0
>>>>    beq     LSYM(Ldiv0)
>>>> +# if defined(__OPTIMIZE_SIZE__)
>>>>    push    {r0, r1, lr}
>>>>    bl      LSYM(divsi3_skip_div0_test)
>>>>    POP     {r1, r2, r3}
>>>>    mul     r2, r0
>>>>    sub     r1, r1, r2
>>>>    bx      r3
>>>> +# else
>>>> +  /* Both the quotient and remainder are calculated simultaneously
>>>> +     in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
>>>> +     need to calculate the remainder again here.  */
>>>> +  b       LSYM(divsi3_skip_div0_test)
>>>> +  RET
>>>> +# endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  ARM_FUNC_START aeabi_idivmod
>>>>    cmp     r1, #0
>>>> --
>>>> 1.9.1
>>>>
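For reference, here is a rough C model of the shift-and-subtract core
implemented by the BranchToDiv/DoDiv/THUMB1_Div_Positive macros above.
This is a sketch only: the function names (do_div_step, udivmod) are
illustrative and do not appear in the patch, and the C loop is not
unrolled the way the assembly is:

    /* Rough C model of the patch's shift-and-subtract division.
       Division by zero is handled separately in the patch via
       __aeabi_idiv0, not here.  */
    #include <stdint.h>
    #include <stdio.h>

    /* One DoDiv(n)-style step: if the divisor shifted left by n still
       fits under the dividend, subtract it and shift a 1 into the
       quotient, otherwise shift in a 0.  */
    static uint32_t
    do_div_step (uint32_t *dividend, uint32_t divisor, int n,
                 uint32_t quotient)
    {
      uint32_t bit = 0;
      if ((*dividend >> n) >= divisor)    /* the lsr + cmp pair */
        {
          *dividend -= divisor << n;
          bit = 1;
        }
      return (quotient << 1) | bit;       /* the adc result, result */
    }

    /* Quotient and remainder fall out of the same loop, which is why
       __aeabi_uidivmod above can simply branch into the division body
       instead of recomputing the remainder with mul and sub.  */
    static void
    udivmod (uint32_t n, uint32_t d, uint32_t *q, uint32_t *r)
    {
      uint32_t quotient = 0;
      for (int i = 31; i >= 0; i--)
        quotient = do_div_step (&n, d, i, quotient);
      *q = quotient;
      *r = n;   /* whatever is left of the dividend is the remainder */
    }

    int
    main (void)
    {
      uint32_t q, r;
      udivmod (1000u, 7u, &q, &r);
      printf ("1000 / 7 = %u remainder %u\n",
              (unsigned) q, (unsigned) r);  /* 142 remainder 6 */
      return 0;
    }

The assembly gets its speed from unrolling this loop four steps at a
time (six for the negative-divisor path) and from skipping the high
shift amounts when the divisor is small.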
>>>
>>> Otherwise OK if no regressions and the following request passes.
>>>
>>> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m
>>> configuration gives identical binaries with and without your patch?
>>>
>>> regards
>>> Ramana
>>>
>> Hi Ramana,
>>
>> Thank you for the comments. Sorry about the license; there must have been
>> a mix-up somewhere.
>>
>> I put back the 'pop pc is safe' assembly comment and I moved some
>> comments before the #if and #else as requested. I left some in place
>> because they did not apply to the whole block but only to the first
>> assembly instruction after the #if/else.
>>
>> I checked that the assembly generated for libgcc was the same with and
>> without the patch for armv7-a in ARM mode and armv7-m in Thumb mode.
>>
>> Is this OK?
>>
>> Cheers,
>> Andre
>>
>> libgcc/ChangeLog:
>> 2016-07-06  Hale Wang  <hale.w...@arm.com>
>>             Andre Vieira  <andre.simoesdiasvie...@arm.com>
>>
>>    * config/arm/lib1funcs.S: Add new wrapper.
>>
> I had to rebase the patch on top of the ARMv8-M patches. This meant
> changing a context line that had changed because the code for ARMv6-M is
> now reused for ARMv8-M Baseline.
> 
> I ran regression tests for both ARMv6-M and ARMv8-M Baseline and
> compared the generated libgcc for ARMv7-A in ARM mode and ARMv7-M in
> Thumb mode, observing no changes.
> 
> Applying patch, as it was previously OK'ed.
> 
> Cheers,
> Andre
> 
Backported to embedded-6-branch at revision r238240.

Cheers,
Andre
