On architectures without __ARM_FEATURE_CLZ, this version combines __clzdi2() with __clzsi2() into a single object with an efficient tail call. Also, this version merges the formerly separate Thumb and ARM code implementations into a unified instruction sequence. This change significantly improves Thumb performance without affecting ARM performance. Finally, this version adds a new __OPTIMIZE_SIZE__ build option (binary search loop).
There is no change to the code for architectures with __ARM_FEATURE_CLZ. gcc/libgcc/ChangeLog: 2021-01-13 Daniel Engel <g...@danielengel.com> * config/arm/bits/clz2.S (__clzsi2, __clzdi2): Reduced code size on architectures without __ARM_FEATURE_CLZ. * config/arm/t-elf (LIB1ASMFUNCS): Moved _clzsi2 to new weak group. --- libgcc/config/arm/clz2.S | 362 +++++++++++++++++++++++++-------------- libgcc/config/arm/t-elf | 7 +- 2 files changed, 236 insertions(+), 133 deletions(-) diff --git a/libgcc/config/arm/clz2.S b/libgcc/config/arm/clz2.S index 2ad9a81892c..dc246708a82 100644 --- a/libgcc/config/arm/clz2.S +++ b/libgcc/config/arm/clz2.S @@ -1,145 +1,243 @@ -/* Copyright (C) 1995-2021 Free Software Foundation, Inc. +/* clz2.S: Cortex M0 optimized 'clz' functions -This file is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 3, or (at your option) any -later version. + Copyright (C) 2018-2021 Free Software Foundation, Inc. + Contributed by Daniel Engel (g...@danielengel.com) -This file is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -General Public License for more details. + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. -Under Section 7 of GPL version 3, you are granted additional -permissions described in the GCC Runtime Library Exception, version -3.1, as published by the Free Software Foundation. + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
-You should have received a copy of the GNU General Public License and -a copy of the GCC Runtime Library Exception along with this program; -see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -<http://www.gnu.org/licenses/>. */ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + + +#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ + +#ifdef L_clzdi2 + +// int __clzdi2(long long) +// Counts leading zero bits in $r1:$r0. +// Returns the result in $r0. +FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2 + CFI_START_FUNCTION + + // Moved here from lib1funcs.S + cmp xxh, #0 + do_it eq, et + clzeq r0, xxl + clzne r0, xxh + addeq r0, #32 + RET + + CFI_END_FUNCTION +FUNC_END clzdi2 + +#endif /* L_clzdi2 */ #ifdef L_clzsi2 -#ifdef NOT_ISA_TARGET_32BIT -FUNC_START clzsi2 - movs r1, #28 - movs r3, #1 - lsls r3, r3, #16 - cmp r0, r3 /* 0x10000 */ - bcc 2f - lsrs r0, r0, #16 - subs r1, r1, #16 -2: lsrs r3, r3, #8 - cmp r0, r3 /* #0x100 */ - bcc 2f - lsrs r0, r0, #8 - subs r1, r1, #8 -2: lsrs r3, r3, #4 - cmp r0, r3 /* #0x10 */ - bcc 2f - lsrs r0, r0, #4 - subs r1, r1, #4 -2: adr r2, 1f - ldrb r0, [r2, r0] - adds r0, r0, r1 - bx lr -.align 2 -1: -.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 - FUNC_END clzsi2 -#else -ARM_FUNC_START clzsi2 -# if defined (__ARM_FEATURE_CLZ) - clz r0, r0 - RET -# else - mov r1, #28 - cmp r0, #0x10000 - do_it cs, t - movcs r0, r0, lsr #16 - subcs r1, r1, #16 - cmp r0, #0x100 - do_it cs, t - movcs r0, r0, lsr #8 - subcs r1, r1, #8 - cmp r0, #0x10 - do_it cs, t - movcs r0, r0, lsr #4 - subcs r1, r1, #4 - adr r2, 1f - ldrb r0, [r2, r0] 
- add r0, r0, r1 - RET -.align 2 -1: -.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 -# endif /* !defined (__ARM_FEATURE_CLZ) */ - FUNC_END clzsi2 -#endif + +// int __clzsi2(int) +// Counts leading zero bits in $r0. +// Returns the result in $r0. +FUNC_START_SECTION clzsi2 .text.sorted.libgcc.clz2.clzsi2 + CFI_START_FUNCTION + + // Moved here from lib1funcs.S + clz r0, r0 + RET + + CFI_END_FUNCTION +FUNC_END clzsi2 + #endif /* L_clzsi2 */ +#else /* !__ARM_FEATURE_CLZ */ + #ifdef L_clzdi2 -#if !defined (__ARM_FEATURE_CLZ) - -# ifdef NOT_ISA_TARGET_32BIT -FUNC_START clzdi2 - push {r4, lr} - cmp xxh, #0 - bne 1f -# ifdef __ARMEB__ - movs r0, xxl - bl __clzsi2 - adds r0, r0, #32 - b 2f -1: - bl __clzsi2 -# else - bl __clzsi2 - adds r0, r0, #32 - b 2f -1: - movs r0, xxh - bl __clzsi2 -# endif -2: - pop {r4, pc} -# else /* NOT_ISA_TARGET_32BIT */ -ARM_FUNC_START clzdi2 - do_push {r4, lr} - cmp xxh, #0 - bne 1f -# ifdef __ARMEB__ - mov r0, xxl - bl __clzsi2 - add r0, r0, #32 - b 2f -1: - bl __clzsi2 -# else - bl __clzsi2 - add r0, r0, #32 - b 2f -1: - mov r0, xxh - bl __clzsi2 -# endif -2: - RETLDM r4 - FUNC_END clzdi2 -# endif /* NOT_ISA_TARGET_32BIT */ - -#else /* defined (__ARM_FEATURE_CLZ) */ - -ARM_FUNC_START clzdi2 - cmp xxh, #0 - do_it eq, et - clzeq r0, xxl - clzne r0, xxh - addeq r0, r0, #32 - RET - FUNC_END clzdi2 -#endif +// int __clzdi2(long long) +// Counts leading zero bits in $r1:$r0. +// Returns the result in $r0. +// Uses $r2 and possibly $r3 as scratch space. +FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2 + CFI_START_FUNCTION + + #if defined(__ARMEB__) && __ARMEB__ + // Check if the upper word is zero. + cmp r0, #0 + + // The upper word is non-zero, so calculate __clzsi2(upper). + bne SYM(__clzsi2) + + // The upper word is zero, so calculate 32 + __clzsi2(lower). + movs r2, #64 + movs r0, r1 + b SYM(__internal_clzsi2) + + #else /* !__ARMEB__ */ + // Assume all the bits in the argument are zero. 
+ movs r2, #64 + + // Check if the upper word is zero. + cmp r1, #0 + + // The upper word is zero, so calculate 32 + __clzsi2(lower). + beq SYM(__internal_clzsi2) + + // The upper word is non-zero, so set up __clzsi2(upper). + // Then fall through. + movs r0, r1 + + #endif /* !__ARMEB__ */ + #endif /* L_clzdi2 */ + +// The bitwise implementation of __clzdi2() tightly couples with __clzsi2(), +// such that instructions must appear consecutively in the same memory +// section for proper flow control. However, this construction inhibits +// the ability to discard __clzdi2() when only using __clzsi2(). +// Therefore, this block configures __clzsi2() for compilation twice. +// The first version is a minimal standalone implementation, and the second +// version is the continuation of __clzdi2(). The standalone version must +// be declared WEAK, so that the combined version can supersede it and +// provide both symbols when required. +// '_clzsi2' should appear before '_clzdi2' in LIB1ASMFUNCS. +#if defined(L_clzsi2) || defined(L_clzdi2) + +#ifdef L_clzsi2 +// int __clzsi2(int) +// Counts leading zero bits in $r0. +// Returns the result in $r0. +// Uses $r2 and possibly $r3 as scratch space. +WEAK_START_SECTION clzsi2 .text.sorted.libgcc.clz2.clzsi2 + CFI_START_FUNCTION + +#else /* L_clzdi2 */ +FUNC_ENTRY clzsi2 + +#endif + + // Assume all the bits in the argument are zero + movs r2, #32 + +#ifdef L_clzsi2 + WEAK_ENTRY internal_clzsi2 +#else /* L_clzdi2 */ + FUNC_ENTRY internal_clzsi2 +#endif + + // Size optimized: 22 bytes, 51 cycles + // Speed optimized: 50 bytes, 20 cycles + + #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__ + + // Binary search starts at half the word width. + movs r3, #16 + + LLSYM(__clz_loop): + // Test the upper 'n' bits of the operand for ZERO. + movs r1, r0 + lsrs r1, r3 + + // When the test fails, discard the lower bits of the register, + // and deduct the count of discarded bits from the result. 
+ #ifdef __HAVE_FEATURE_IT + do_it ne, t + #else + beq LLSYM(__clz_skip) + #endif + + IT(mov,ne) r0, r1 + IT(sub,ne) r2, r3 + + LLSYM(__clz_skip): + // Decrease the shift distance for the next test. + lsrs r3, #1 + bne LLSYM(__clz_loop) + + #else /* __OPTIMIZE_SIZE__ */ + + // Unrolled binary search. + lsrs r1, r0, #16 + + #ifdef __HAVE_FEATURE_IT + do_it ne,t + #else + beq LLSYM(__clz8) + #endif + + // Out of 32 bits, the first '1' is somewhere in the highest 16, + // so the lower 16 bits are no longer interesting. + IT(mov,ne) r0, r1 + IT(sub,ne) r2, #16 + + LLSYM(__clz8): + lsrs r1, r0, #8 + + // Out of 16 bits, the first '1' is somewhere in the highest 8, + // so the lower 8 bits are no longer interesting. + #ifdef __HAVE_FEATURE_IT + do_it ne,t + #else + beq LLSYM(__clz4) + #endif + + // Out of 8 bits, the first '1' is somewhere in the highest 4, + // so the lower 4 bits are no longer interesting. + IT(mov,ne) r0, r1 + IT(sub,ne) r2, #8 + + LLSYM(__clz4): + lsrs r1, r0, #4 + + #ifdef __HAVE_FEATURE_IT + do_it ne,t + #else + beq LLSYM(__clz2) + #endif + + IT(mov,ne) r0, r1 + IT(sub,ne) r2, #4 + + LLSYM(__clz2): + // Load the remainder by index + adr r1, LLSYM(__clz_remainder) + ldrb r0, [r1, r0] + + #endif /* !__OPTIMIZE_SIZE__ */ + + // Account for the remainder. + subs r0, r2, r0 + RET + + #if !defined(__OPTIMIZE_SIZE__) || !__OPTIMIZE_SIZE__ + .align 2 + LLSYM(__clz_remainder): + .byte 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 + #endif + + CFI_END_FUNCTION +FUNC_END clzsi2 + +#ifdef L_clzdi2 +FUNC_END clzdi2 +#endif + +#endif /* L_clzsi2 || L_clzdi2 */ + +#endif /* !__ARM_FEATURE_CLZ */ + diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf index 93ea1cd8f76..af779afa0a9 100644 --- a/libgcc/config/arm/t-elf +++ b/libgcc/config/arm/t-elf @@ -19,13 +19,18 @@ endif # !__symbian__ # assembly implementation here for double-precision values. +# Group 0: WEAK overridable function objects. +# See respective sources for rationale. 
+LIB1ASMFUNCS += \ + _clzsi2 \ + + # Group 1: Integer function objects. LIB1ASMFUNCS += \ _ashldi3 \ _ashrdi3 \ _lshrdi3 \ _clzdi2 \ - _clzsi2 \ _ctzsi2 \ _dvmd_tls \ _divsi3 \ -- 2.25.1