This version combines __ctzdi2() and __ctzsi2() into a single object with
an efficient tail call.  The previous implementation of __ctzdi2() was in C.
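
For reference, the intended semantics (including the new zero-argument
results described below) can be modeled in C roughly as follows.  This is
an illustrative sketch only, not part of the patch; ctz32() and ctz64()
are placeholder names standing in for __ctzsi2() and __ctzdi2():

    /* Illustrative model only; placeholder names, not the patch code.  */
    int ctz32 (unsigned int x)
    {
      int count = 0;
      if (x == 0)
        return 32;      /* formerly -1 when __ARM_FEATURE_CLZ was set */
      while ((x & 1) == 0)
        {
          x >>= 1;
          count++;
        }
      return count;
    }

    int ctz64 (unsigned long long x)
    {
      unsigned int lo = (unsigned int) x;
      if (lo != 0)
        return ctz32 (lo);             /* low word resolves directly */
      return 32 + ctz32 ((unsigned int) (x >> 32));  /* ctz64(0) == 64 */
    }

The assembly below realizes the second case by loading the 64-bit bias into
r2 and branching into the shared __ctzsi2 body rather than duplicating it.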
On architectures without __ARM_FEATURE_CLZ, this version merges the formerly
separate Thumb and ARM code sequences into a unified instruction sequence.
This change significantly improves Thumb performance without affecting ARM
performance.  Finally, this version adds a new __OPTIMIZE_SIZE__ build option
that selects a compact binary-search loop.

On architectures with __ARM_FEATURE_CLZ, __ctzsi2(0) now returns 32, where
it formerly returned -1.  Architectures without __ARM_FEATURE_CLZ have
always returned 32, so this change makes the return value consistent.  It
costs 2 extra instructions (branchless).  Likewise, on architectures with
__ARM_FEATURE_CLZ, __ctzdi2(0) now returns 64 instead of 31.

libgcc/ChangeLog:
2021-01-13  Daniel Engel  <g...@danielengel.com>

	* config/arm/ctz2.S (__ctzdi2): Added a new function.
	(__ctzsi2): Reduced size on architectures without __ARM_FEATURE_CLZ;
	changed so that __ctzsi2(0)=32 on architectures with __ARM_FEATURE_CLZ.
	* config/arm/t-elf (LIB1ASMFUNCS): Added _ctzdi2;
	moved _ctzsi2 to the weak function objects group.
---
 libgcc/config/arm/ctz2.S | 307 +++++++++++++++++++++++++++++----------
 libgcc/config/arm/t-elf  |   3 +-
 2 files changed, 232 insertions(+), 78 deletions(-)

diff --git a/libgcc/config/arm/ctz2.S b/libgcc/config/arm/ctz2.S
index 8702c9afb94..ee6df6d6d01 100644
--- a/libgcc/config/arm/ctz2.S
+++ b/libgcc/config/arm/ctz2.S
@@ -1,86 +1,239 @@
-/* Copyright (C) 1995-2021 Free Software Foundation, Inc.
+/* ctz2.S: ARM optimized 'ctz' functions
 
-This file is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 3, or (at your option) any
-later version.
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
 
-This file is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-General Public License for more details.
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
 
-Under Section 7 of GPL version 3, you are granted additional
-permissions described in the GCC Runtime Library Exception, version
-3.1, as published by the Free Software Foundation.
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
 
-You should have received a copy of the GNU General Public License and
-a copy of the GCC Runtime Library Exception along with this program;
-see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-<http://www.gnu.org/licenses/>.  */
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
 
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
 
-#ifdef L_ctzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START ctzsi2
-        negs    r1, r0
-        ands    r0, r0, r1
-        movs    r1, #28
-        movs    r3, #1
-        lsls    r3, r3, #16
-        cmp     r0, r3 /* 0x10000 */
-        bcc     2f
-        lsrs    r0, r0, #16
-        subs    r1, r1, #16
-2:      lsrs    r3, r3, #8
-        cmp     r0, r3 /* #0x100 */
-        bcc     2f
-        lsrs    r0, r0, #8
-        subs    r1, r1, #8
-2:      lsrs    r3, r3, #4
-        cmp     r0, r3 /* #0x10 */
-        bcc     2f
-        lsrs    r0, r0, #4
-        subs    r1, r1, #4
-2:      adr     r2, 1f
-        ldrb    r0, [r2, r0]
-        subs    r0, r0, r1
-        bx      lr
-.align 2
-1:
-.byte   27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-        FUNC_END ctzsi2
+
+// When the hardware 'clz' instruction is available, an efficient version
+//  of __ctzsi2(x) can be created by calculating '31 - __clzsi2(lsb(x))',
+//  where lsb(x) is 'x' with only the least-significant '1' bit set.
+// The following offset applies to all of the functions in this file.
+#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+  #define CTZ_RESULT_OFFSET 1
 #else
-ARM_FUNC_START ctzsi2
-        rsb     r1, r0, #0
-        and     r0, r0, r1
-# if defined (__ARM_FEATURE_CLZ)
-        clz     r0, r0
-        rsb     r0, r0, #31
-        RET
-# else
-        mov     r1, #28
-        cmp     r0, #0x10000
-        do_it   cs, t
-        movcs   r0, r0, lsr #16
-        subcs   r1, r1, #16
-        cmp     r0, #0x100
-        do_it   cs, t
-        movcs   r0, r0, lsr #8
-        subcs   r1, r1, #8
-        cmp     r0, #0x10
-        do_it   cs, t
-        movcs   r0, r0, lsr #4
-        subcs   r1, r1, #4
-        adr     r2, 1f
-        ldrb    r0, [r2, r0]
-        sub     r0, r0, r1
-        RET
-.align 2
-1:
-.byte   27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-# endif /* !defined (__ARM_FEATURE_CLZ) */
-        FUNC_END ctzsi2
+  #define CTZ_RESULT_OFFSET 0
+#endif
+
+
+#ifdef L_ctzdi2
+
+// int __ctzdi2(long long)
+// Counts trailing zeros in a 64 bit double word.
+// Expects the argument in $r1:$r0.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+FUNC_START_SECTION ctzdi2 .text.sorted.libgcc.ctz2.ctzdi2
+        CFI_START_FUNCTION
+
+  #if defined(__ARMEB__) && __ARMEB__
+        // Assume all the bits in the argument are zero.
+        movs    r2,     #(64 - CTZ_RESULT_OFFSET)
+
+        // Check if the lower word is zero.
+        cmp     r1,     #0
+
+        // The lower word is zero, so calculate 32 + __ctzsi2(upper).
+        beq     SYM(__internal_ctzsi2)
+
+        // The lower word is non-zero, so set up __ctzsi2(lower).
+        // Then fall through.
+        movs    r0,     r1
+
+  #else /* !__ARMEB__ */
+        // Check if the lower word is zero.
+        cmp     r0,     #0
+
+        // If the lower word is non-zero, the result is just __ctzsi2(lower).
+        bne     SYM(__ctzsi2)
+
+        // The lower word is zero, so calculate 32 + __ctzsi2(upper).
+        movs    r2,     #(64 - CTZ_RESULT_OFFSET)
+        movs    r0,     r1
+        b       SYM(__internal_ctzsi2)
+
+  #endif /* !__ARMEB__ */
+
+#endif /* L_ctzdi2 */
+
+
+// The bitwise implementation of __ctzdi2() tightly couples with __ctzsi2(),
+//  such that instructions must appear consecutively in the same memory
+//  section for proper flow control.  However, this construction inhibits
+//  the ability to discard __ctzdi2() when only using __ctzsi2().
+// Therefore, this block configures __ctzsi2() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version is the continuation of __ctzdi2().  The standalone version must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols when required.
+// '_ctzsi2' should appear before '_ctzdi2' in LIB1ASMFUNCS.
+#if defined(L_ctzsi2) || defined(L_ctzdi2)
+
+#ifdef L_ctzsi2
+// int __ctzsi2(int)
+// Counts trailing zeros in a 32 bit word.
+// Expects the argument in $r0.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+WEAK_START_SECTION ctzsi2 .text.sorted.libgcc.ctz2.ctzdi2
+        CFI_START_FUNCTION
+
+#else /* L_ctzdi2 */
+FUNC_ENTRY ctzsi2
+#endif
 
-#endif /* L_clzsi2 */
+
+        // Assume all the bits in the argument are zero.
+        movs    r2,     #(32 - CTZ_RESULT_OFFSET)
+
+#ifdef L_ctzsi2
+WEAK_ENTRY internal_ctzsi2
+#else /* L_ctzdi2 */
+FUNC_ENTRY internal_ctzsi2
+#endif
+
+  #if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+
+        // Find the least-significant '1' bit of the argument.
+        rsbs    r1,     r0,     #0
+        ands    r1,     r0
+
+        // Maintain result compatibility with the software implementation.
+        // Technically, __ctzsi2(0) is undefined, but 32 seems better than -1
+        //  (or possibly 31 if this is an intermediate result for __ctzdi2(0)).
+        // The carry flag from 'rsbs' gives '-1' iff the argument was 'zero'.
+        //  (NOTE: 'ands' with 0 shift bits does not change the carry flag.)
+        // After the jump, the final result will be '31 - (-1)'.
+        sbcs    r0,     r0
+
+  #ifdef __HAVE_FEATURE_IT
+        do_it   ne
+  #else
+        beq     LLSYM(__ctz_zero)
+  #endif
+
+        // Gives the number of '0' bits left of the least-significant '1'.
+        IT(clz,ne)      r0,     r1
+
+  #elif defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+        // Size optimized: 24 bytes, 52 cycles
+        // Speed optimized: 52 bytes, 21 cycles
+
+        // Binary search starts at half the word width.
+        movs    r3,     #16
+
+    LLSYM(__ctz_loop):
+        // Test the upper 'n' bits of the operand for ZERO.
+        movs    r1,     r0
+        lsls    r1,     r3
+
+        // When the test fails, discard the lower bits of the register,
+        //  and deduct the count of discarded bits from the result.
+  #ifdef __HAVE_FEATURE_IT
+        do_it   ne, t
+  #else
+        beq     LLSYM(__ctz_skip)
+  #endif
+
+        IT(mov,ne)      r0,     r1
+        IT(sub,ne)      r2,     r3
+
+    LLSYM(__ctz_skip):
+        // Decrease the shift distance for the next test.
+        lsrs    r3,     #1
+        bne     LLSYM(__ctz_loop)
+
+        // Prepare the remainder.
+        lsrs    r0,     #31
+
+  #else /* !__OPTIMIZE_SIZE__ */
+
+        // Unrolled binary search.
+        lsls    r1,     r0,     #16
+
+  #ifdef __HAVE_FEATURE_IT
+        do_it   ne, t
+  #else
+        beq     LLSYM(__ctz8)
+  #endif
+
+        // Out of 32 bits, the first '1' is somewhere in the lowest 16,
+        //  so the higher 16 bits are no longer interesting.
+        IT(mov,ne)      r0,     r1
+        IT(sub,ne)      r2,     #16
+
+    LLSYM(__ctz8):
+        lsls    r1,     r0,     #8
+
+  #ifdef __HAVE_FEATURE_IT
+        do_it   ne, t
+  #else
+        beq     LLSYM(__ctz4)
+  #endif
+
+        // Out of 16 bits, the first '1' is somewhere in the lowest 8,
+        //  so the higher 8 bits are no longer interesting.
+        IT(mov,ne)      r0,     r1
+        IT(sub,ne)      r2,     #8
+
+    LLSYM(__ctz4):
+        lsls    r1,     r0,     #4
+
+  #ifdef __HAVE_FEATURE_IT
+        do_it   ne, t
+  #else
+        beq     LLSYM(__ctz2)
+  #endif
+
+        // Out of 8 bits, the first '1' is somewhere in the lowest 4,
+        //  so the higher 4 bits are no longer interesting.
+        IT(mov,ne)      r0,     r1
+        IT(sub,ne)      r2,     #4
+
+    LLSYM(__ctz2):
+        // Look up the remainder by index.
+        lsrs    r0,     #28
+        adr     r3,     LLSYM(__ctz_remainder)
+        ldrb    r0,     [r3, r0]
+
+  #endif /* !__OPTIMIZE_SIZE__ */
+
+    LLSYM(__ctz_zero):
+        // Apply the remainder.
+        subs    r0,     r2,     r0
+        RET
+
+  #if (!defined(__ARM_FEATURE_CLZ) || !__ARM_FEATURE_CLZ) && \
+      (!defined(__OPTIMIZE_SIZE__) || !__OPTIMIZE_SIZE__)
+        .align 2
+    LLSYM(__ctz_remainder):
+        .byte 0,4,3,4,2,4,3,4,1,4,3,4,2,4,3,4
+  #endif
+
+        CFI_END_FUNCTION
+FUNC_END ctzsi2
+
+#ifdef L_ctzdi2
+FUNC_END ctzdi2
+#endif
+
+#endif /* L_ctzsi2 || L_ctzdi2 */
diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
index af779afa0a9..33b83ac4adf 100644
--- a/libgcc/config/arm/t-elf
+++ b/libgcc/config/arm/t-elf
@@ -23,6 +23,7 @@ endif # !__symbian__
 # See respective sources for rationale.
 LIB1ASMFUNCS += \
         _clzsi2 \
+        _ctzsi2 \
 
 
 # Group 1: Integer function objects.
@@ -31,7 +32,7 @@ LIB1ASMFUNCS += \
         _ashrdi3 \
         _lshrdi3 \
         _clzdi2 \
-        _ctzsi2 \
+        _ctzdi2 \
         _dvmd_tls \
         _divsi3 \
         _modsi3 \
-- 
2.25.1