On 11/01/2021 11:10, g...@danielengel.com wrote:
> From: Daniel Engel <g...@danielengel.com>
>
> This version combines __ctzdi2() with __ctzsi2() into a single object with
> an efficient tail call.  The former implementation of __ctzdi2() was in C.
>
> On architectures without a clz instruction, this version merges the formerly
> separate Thumb and ARM code sequences into a unified instruction sequence.
> This change significantly improves Thumb performance without affecting ARM
> performance.  Finally, this version adds a new __OPTIMIZE_SIZE__ build option.
>
> On architectures with a clz instruction, __ctzsi2() now returns 32 instead
> of -1 when the argument is 0.  This costs an extra 2 instructions, branchless.
> Although the output of this function is technically undefined when the
> argument is 0, this makes the behavior consistent with __clzsi2().
>
> Likewise, __ctzdi2() now returns '64' on a zero argument instead of '31'.
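(For context, the boundary behaviour described above corresponds to the
following C model.  This is an illustrative sketch only; the names
ctz32_model and ctz64_model are hypothetical and not part of the patch.)

    /* Count trailing zeros in a 32-bit word, returning the full bit
       width for a zero argument so the convention matches __clzsi2().  */
    int
    ctz32_model (unsigned int x)
    {
      int n = 0;
      if (x == 0)
        return 32;      /* The old code path yielded -1 here.  */
      while ((x & 1) == 0)
        {
          x >>= 1;
          n++;
        }
      return n;
    }

    /* 64-bit version: a zero low word adds 32 and defers to the high
       word, so a zero argument gives 32 + 32 = 64.  */
    int
    ctz64_model (unsigned long long x)
    {
      unsigned int lo = (unsigned int) x;
      if (lo != 0)
        return ctz32_model (lo);
      return 32 + ctz32_model ((unsigned int) (x >> 32));
    }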
I think almost all of the comments on the CLZ support apply equally here as
well.

R.

> gcc/libgcc/ChangeLog:
> 2021-01-07 Daniel Engel <g...@danielengel.com>
>
> 	* config/arm/bits/ctz2.S: Size-optimized __ctzsi2(), new function
> 	__ctzdi2(); added logic to return '32' for x=0 when using the
> 	hardware clz instruction.
> 	* config/arm/t-elf: Add _ctzdi2; move _clzsi2 to the weak
> 	LIB1ASMFUNCS group.
> ---
>  libgcc/config/arm/bits/ctz2.S | 287 ++++++++++++++++++++++++++--------
>  libgcc/config/arm/t-elf       |   3 +-
>  2 files changed, 228 insertions(+), 62 deletions(-)
>
> diff --git a/libgcc/config/arm/bits/ctz2.S b/libgcc/config/arm/bits/ctz2.S
> index f0422d1fbba..4241fdad283 100644
> --- a/libgcc/config/arm/bits/ctz2.S
> +++ b/libgcc/config/arm/bits/ctz2.S
> @@ -1,65 +1,230 @@
> +/* ctz2.S: ARM optimized 'ctz' functions
>
> -#ifdef L_ctzsi2
> -#ifdef NOT_ISA_TARGET_32BIT
> -FUNC_START ctzsi2
> -	negs	r1, r0
> -	ands	r0, r0, r1
> -	movs	r1, #28
> -	movs	r3, #1
> -	lsls	r3, r3, #16
> -	cmp	r0, r3 /* 0x10000 */
> -	bcc	2f
> -	lsrs	r0, r0, #16
> -	subs	r1, r1, #16
> -2:	lsrs	r3, r3, #8
> -	cmp	r0, r3 /* #0x100 */
> -	bcc	2f
> -	lsrs	r0, r0, #8
> -	subs	r1, r1, #8
> -2:	lsrs	r3, r3, #4
> -	cmp	r0, r3 /* #0x10 */
> -	bcc	2f
> -	lsrs	r0, r0, #4
> -	subs	r1, r1, #4
> -2:	adr	r2, 1f
> -	ldrb	r0, [r2, r0]
> -	subs	r0, r0, r1
> -	bx	lr
> -.align 2
> -1:
> -.byte	27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
> -	FUNC_END ctzsi2
> +
> +   Copyright (C) 2020-2021 Free Software Foundation, Inc.
> +   Contributed by Daniel Engel (g...@danielengel.com)
> +
> +   This file is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by the
> +   Free Software Foundation; either version 3, or (at your option) any
> +   later version.
> +
> +   This file is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +// When the hardware 'clz' instruction is available, an efficient version
> +// of __ctzsi2(x) can be created by calculating '31 - __clzsi2(lsb(x))',
> +// where lsb(x) is 'x' with only the least-significant '1' bit set.
> +// The following offset applies to all of the functions in this file.
> +#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
> +  #define CTZ_RESULT_OFFSET 1
>  #else
> -ARM_FUNC_START ctzsi2
> -	rsb	r1, r0, #0
> -	and	r0, r0, r1
> -# if defined (__ARM_FEATURE_CLZ)
> -	clz	r0, r0
> -	rsb	r0, r0, #31
> -	RET
> -# else
> -	mov	r1, #28
> -	cmp	r0, #0x10000
> -	do_it	cs, t
> -	movcs	r0, r0, lsr #16
> -	subcs	r1, r1, #16
> -	cmp	r0, #0x100
> -	do_it	cs, t
> -	movcs	r0, r0, lsr #8
> -	subcs	r1, r1, #8
> -	cmp	r0, #0x10
> -	do_it	cs, t
> -	movcs	r0, r0, lsr #4
> -	subcs	r1, r1, #4
> -	adr	r2, 1f
> -	ldrb	r0, [r2, r0]
> -	sub	r0, r0, r1
> -	RET
> -.align 2
> -1:
> -.byte	27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
> -# endif /* !defined (__ARM_FEATURE_CLZ) */
> -	FUNC_END ctzsi2
> +  #define CTZ_RESULT_OFFSET 0
>  #endif
> -#endif /* L_clzsi2 */
> +
> +
> +#ifdef L_ctzdi2
> +
> +// int __ctzdi2(long long)
> +// Counts trailing zeros in a 64-bit double word.
> +// Expects the argument in $r1:$r0.
> +// Returns the result in $r0.
> +// Uses $r2 and possibly $r3 as scratch space.
> +FUNC_START_SECTION ctzdi2 .text.sorted.libgcc.ctz2.ctzdi2
> +    CFI_START_FUNCTION
> +
> +  #if defined(__ARMEB__) && __ARMEB__
> +        // Assume all the bits in the argument are zero.
> +        movs    r2, #(64 - CTZ_RESULT_OFFSET)
> +
> +        // Check if the lower word is zero.
> +        cmp     r1, #0
> +
> +        // The lower word is zero, so calculate 32 + __ctzsi2(upper).
> +        beq     SYM(__internal_ctzsi2)
> +
> +        // The lower word is non-zero, so set up __ctzsi2(lower).
> +        // Then fall through.
> +        movs    r0, r1
> +
> +  #else /* !__ARMEB__ */
> +        // Check if the lower word is zero.
> +        cmp     r0, #0
> +
> +        // If the lower word is non-zero, the result is just __ctzsi2(lower).
> +        bne     SYM(__ctzsi2)
> +
> +        // The lower word is zero, so calculate 32 + __ctzsi2(upper).
> +        movs    r2, #(64 - CTZ_RESULT_OFFSET)
> +        movs    r0, r1
> +        b       SYM(__internal_ctzsi2)
> +
> +  #endif /* !__ARMEB__ */
> +
> +#endif /* L_ctzdi2 */
> +
> +
> +// The bitwise implementation of __ctzdi2() is tightly coupled with
> +// __ctzsi2(), such that the instructions must appear consecutively in
> +// the same memory section for proper flow control.  However, this
> +// construction inhibits the ability to discard __ctzdi2() when only
> +// __ctzsi2() is needed.
> +// Therefore, this block configures __ctzsi2() for compilation twice.
> +// The first version is a minimal standalone implementation, and the
> +// second version is the continuation of __ctzdi2().  The standalone
> +// version must be declared WEAK, so that the combined version can
> +// supersede it and provide both symbols when required.
> +// '_ctzsi2' should appear before '_ctzdi2' in LIB1ASMFUNCS.
> +#if defined(L_ctzsi2) || defined(L_ctzdi2)
> +
> +#ifdef L_ctzsi2
> +// int __ctzsi2(int)
> +// Counts trailing zeros in a 32-bit word.
> +// Expects the argument in $r0.
> +// Returns the result in $r0.
> +// Uses $r2 and possibly $r3 as scratch space.
> +WEAK_START_SECTION ctzsi2 .text.sorted.libgcc.ctz2.ctzdi2
> +    CFI_START_FUNCTION
> +
> +#else /* L_ctzdi2 */
> +FUNC_ENTRY ctzsi2
> +
> +#endif
> +
> +        // Assume all the bits in the argument are zero.
> +        movs    r2, #(32 - CTZ_RESULT_OFFSET)
> +
> +#ifdef L_ctzsi2
> +    WEAK_ENTRY internal_ctzsi2
> +#else /* L_ctzdi2 */
> +    FUNC_ENTRY internal_ctzsi2
> +#endif
> +
> +  #if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
> +
> +        // Find the least-significant '1' bit of the argument.
> +        rsbs    r1, r0, #0
> +        ands    r1, r0
> +
> +        // Maintain result compatibility with the software implementation.
> +        // Technically, __ctzsi2(0) is undefined, but 32 seems better than -1.
> +        //  (or possibly 31 if this is an intermediate result for __ctzdi2(0)).
> +        // The carry flag from 'rsbs' gives '-1' iff the argument was 'zero'.
> +        //  (NOTE: 'ands' with 0 shift bits does not change the carry flag.)
> +        // After the jump, the final result will be '31 - (-1)'.
> +        sbcs    r0, r0
> +
> +  #ifdef __ARM_FEATURE_IT
> +        do_it   ne
> +  #else
> +        beq     LLSYM(__ctz_zero)
> +  #endif
> +
> +        // Gives the number of '0' bits left of the least-significant '1'.
> +        IT(clz,ne) r0, r1
> +
> +  #elif defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
> +        // Size optimized: 24 bytes, 52 cycles
> +        // Speed optimized: 52 bytes, 21 cycles
> +
> +        // Binary search starts at half the word width.
> +        movs    r3, #16
> +
> +    LLSYM(__ctz_loop):
> +        // Test the upper 'n' bits of the operand for ZERO.
> +        movs    r1, r0
> +        lsls    r1, r3
> +
> +        // When the test fails, discard the lower bits of the register,
> +        // and deduct the count of discarded bits from the result.
> +  #ifdef __ARM_FEATURE_IT
> +        do_it   ne, t
> +  #else
> +        beq     LLSYM(__ctz_skip)
> +  #endif
> +
> +        IT(mov,ne) r0, r1
> +        IT(sub,ne) r2, r3
> +
> +    LLSYM(__ctz_skip):
> +        // Decrease the shift distance for the next test.
> +        lsrs    r3, #1
> +        bne     LLSYM(__ctz_loop)
> +
> +        // Prepare the remainder.
> +        lsrs    r0, #31
> +
> +  #else /* !__OPTIMIZE_SIZE__ */
> +
> +        // Unrolled binary search.
> +        lsls    r1, r0, #16
> +  #ifdef __ARM_FEATURE_IT
> +        do_it   ne, t
> +  #else
> +        beq     LLSYM(__ctz8)
> +  #endif
> +
> +        IT(mov,ne) r0, r1
> +        IT(sub,ne) r2, #16
> +
> +    LLSYM(__ctz8):
> +        lsls    r1, r0, #8
> +  #ifdef __ARM_FEATURE_IT
> +        do_it   ne, t
> +  #else
> +        beq     LLSYM(__ctz4)
> +  #endif
> +
> +        IT(mov,ne) r0, r1
> +        IT(sub,ne) r2, #8
> +
> +    LLSYM(__ctz4):
> +        lsls    r1, r0, #4
> +  #ifdef __ARM_FEATURE_IT
> +        do_it   ne, t
> +  #else
> +        beq     LLSYM(__ctz2)
> +  #endif
> +
> +        IT(mov,ne) r0, r1
> +        IT(sub,ne) r2, #4
> +
> +    LLSYM(__ctz2):
> +        // Load the remainder by index.
> +        lsrs    r0, #28
> +        adr     r3, LLSYM(__ctz_remainder)
> +        ldrb    r0, [r3, r0]
> +
> +  #endif /* !__OPTIMIZE_SIZE__ */
> +
> +    LLSYM(__ctz_zero):
> +        // Apply the remainder.
> +        subs    r0, r2, r0
> +        RET
> +
> +  #if (!defined(__ARM_FEATURE_CLZ) || !__ARM_FEATURE_CLZ) && \
> +      (!defined(__OPTIMIZE_SIZE__) || !__OPTIMIZE_SIZE__)
> +        .align 2
> +    LLSYM(__ctz_remainder):
> +        .byte 0,4,3,4,2,4,3,4,1,4,3,4,2,4,3,4
> +  #endif
> +
> +    CFI_END_FUNCTION
> +FUNC_END ctzsi2
> +
> +#ifdef L_ctzdi2
> +FUNC_END ctzdi2
> +#endif
> +
> +#endif /* L_ctzsi2 || L_ctzdi2 */
>
> diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
> index 8f1a4614904..998169e24c8 100644
> --- a/libgcc/config/arm/t-elf
> +++ b/libgcc/config/arm/t-elf
> @@ -24,6 +24,7 @@ endif # !__symbian__
>  # See respective sources for rationale.
>  LIB1ASMFUNCS += \
>  	_clzsi2 \
> +	_ctzsi2 \
>
>
>  # Group 1: Integer functions
> @@ -32,7 +33,7 @@ LIB1ASMFUNCS += \
>  	_ashrdi3 \
>  	_lshrdi3 \
>  	_clzdi2 \
> -	_ctzsi2 \
> +	_ctzdi2 \
>  	_dvmd_tls \
>  	_divsi3 \
>  	_modsi3 \
>
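For reference, the '31 - __clzsi2(lsb(x))' identity in the file header
comment works because 'x & -x' isolates the least-significant set bit, and
counting the leading zeros of that single bit fixes its position.  A
minimal C sketch, using the compiler builtin rather than the libgcc entry
point:

    /* ctz via clz; the identity holds for any non-zero 32-bit x.  */
    int
    ctz_via_clz (unsigned int x)
    {
      unsigned int lsb = x & (0U - x);  /* only the lowest '1' bit */
      return 31 - __builtin_clz (lsb);  /* bit index counted from bit 0 */
    }

For example, x = 0x28 gives lsb = 0x08; __builtin_clz (0x08) is 28, and
31 - 28 = 3, which is indeed the number of trailing zeros in 0x28.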
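Likewise, the __OPTIMIZE_SIZE__ binary search above can be modelled in C.
This sketch only mirrors the structure of the loop -- 'result' plays the
role of r2 and 'shift' the role of r3 -- and is an illustration, not part
of the patch:

    /* Binary-search ctz: halve the shift distance each pass, keeping
       the low half of the window whenever it contains a '1' bit.
       Returns 32 for a zero argument.  */
    int
    ctz_binary_search (unsigned int x)
    {
      int result = 32;                  /* assume every bit is zero */
      for (int shift = 16; shift != 0; shift >>= 1)
        {
          unsigned int test = x << shift;
          if (test != 0)                /* lower half of the window has a '1' */
            {
              x = test;                 /* shift that half up; narrow the window */
              result -= shift;
            }
        }
      /* After the loop the found bit (if any) sits in the MSB.  */
      return result - (x >> 31);
    }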