Wilco Dijkstra via Gcc-patches <gcc-patches@gcc.gnu.org> writes: > Hi Richard, > >> Did you consider handling the case where the movks aren't for >> consecutive bitranges? E.g. the patch handles: > >> but it looks like it would be fairly easy to extend it to: >> >> 0x1234cccc5678cccc > > Yes, with a more general search loop we can get that case too - > it doesn't trigger much though. The code that checks for this is > now refactored into a new function. Given there are now many > more calls to aarch64_bitmask_imm, I added a streamlined internal > entry point that assumes the input is 64-bit.
Sounds good, but could you put it before the mode version, to avoid the forward declaration? OK with that change, thanks. Richard > Cheers, > Wilco > > [PATCH v2][AArch64] Improve immediate expansion [PR106583] > > Improve immediate expansion of immediates which can be created from a > bitmask immediate and 2 MOVKs. Simplify, refactor and improve > efficiency of bitmask checks. This reduces the number of 4-instruction > immediates in SPECINT/FP by 10-15%. > > Passes regress, OK for commit? > > gcc/ChangeLog: > > PR target/106583 > * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate) > Add support for a bitmask immediate with 2 MOVKs. > (aarch64_check_bitmask): New function after refactorization. > (aarch64_replicate_bitmask_imm): Remove function, merge into... > (aarch64_bitmask_imm): Simplify replication of small modes. > Split function into 64-bit only version for efficiency. > > gcc/testsuite: > PR target/106583 > * gcc.target/aarch64/pr106583.c: Add new test. > > --- > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode > (scalar_mode, poly_int64); > static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, > aarch64_addr_query_type); > static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); > +static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT); > > /* The processor for which instructions should be scheduled. */ > enum aarch64_processor aarch64_tune = cortexa53; > @@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char > *operands, rtx x) > factor, nelts_per_vq); > } > > +/* Return true if the immediate VAL can be a bitfield immediate > + by changing the given MASK bits in VAL to zeroes, ones or bits > + from the other half of VAL. Return the new immediate in VAL2. */ > +static inline bool > +aarch64_check_bitmask (unsigned HOST_WIDE_INT val, > + unsigned HOST_WIDE_INT &val2, > + unsigned HOST_WIDE_INT mask) > +{ > + val2 = val & ~mask; > + if (val2 != val && aarch64_bitmask_imm (val2)) > + return true; > + val2 = val | mask; > + if (val2 != val && aarch64_bitmask_imm (val2)) > + return true; > + val = val & ~mask; > + val2 = val | (((val >> 32) | (val << 32)) & mask); > + if (val2 != val && aarch64_bitmask_imm (val2)) > + return true; > + val2 = val | (((val >> 16) | (val << 48)) & mask); > + if (val2 != val && aarch64_bitmask_imm (val2)) > + return true; > + return false; > +} > + > static int > aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate, > scalar_int_mode mode) > @@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, > bool generate, > one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) + > ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0); > > - if (zero_match != 2 && one_match != 2) > + if (zero_match < 2 && one_match < 2) > { > /* Try emitting a bitmask immediate with a movk replacing 16 bits. > For a 64-bit bitmask try whether changing 16 bits to all ones or > zeroes creates a valid bitmask. To check any repeated bitmask, > try using 16 bits from the other 32-bit half of val. */ > > - for (i = 0; i < 64; i += 16, mask <<= 16) > - { > - val2 = val & ~mask; > - if (val2 != val && aarch64_bitmask_imm (val2, mode)) > - break; > - val2 = val | mask; > - if (val2 != val && aarch64_bitmask_imm (val2, mode)) > - break; > - val2 = val2 & ~mask; > - val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask); > - if (val2 != val && aarch64_bitmask_imm (val2, mode)) > - break; > - } > - if (i != 64) > - { > - if (generate) > + for (i = 0; i < 64; i += 16) > + if (aarch64_check_bitmask (val, val2, mask << i)) > + { > + if (generate) > + { > + emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); > + emit_insn (gen_insv_immdi (dest, GEN_INT (i), > + GEN_INT ((val >> i) & 0xffff))); > + } > + return 2; > + } > + } > + > + /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. > */ > + if (zero_match + one_match == 0) > + { > + for (i = 0; i < 48; i += 16) > + for (int j = i + 16; j < 64; j += 16) > + if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j))) > { > - emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); > - emit_insn (gen_insv_immdi (dest, GEN_INT (i), > - GEN_INT ((val >> i) & 0xffff))); > + if (generate) > + { > + emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); > + emit_insn (gen_insv_immdi (dest, GEN_INT (i), > + GEN_INT ((val >> i) & 0xffff))); > + emit_insn (gen_insv_immdi (dest, GEN_INT (j), > + GEN_INT ((val >> j) & 0xffff))); > + } > + return 3; > } > - return 2; > - } > } > > /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which > @@ -10168,22 +10200,6 @@ aarch64_movk_shift (const wide_int_ref &and_val, > return -1; > } > > -/* VAL is a value with the inner mode of MODE. Replicate it to fill a > - 64-bit (DImode) integer. */ > - > -static unsigned HOST_WIDE_INT > -aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode) > -{ > - unsigned int size = GET_MODE_UNIT_PRECISION (mode); > - while (size < 64) > - { > - val &= (HOST_WIDE_INT_1U << size) - 1; > - val |= val << size; > - size *= 2; > - } > - return val; > -} > - > /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */ > > static const unsigned HOST_WIDE_INT bitmask_imm_mul[] = > @@ -10196,26 +10212,42 @@ static const unsigned HOST_WIDE_INT > bitmask_imm_mul[] = > }; > > > -/* Return true if val is a valid bitmask immediate. */ > - > +/* Return true if val is a valid bitmask immediate for any mode. */ > bool > aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode) > { > - unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one; > + if (mode == DImode) > + return aarch64_bitmask_imm (val_in); > + > + unsigned HOST_WIDE_INT val = val_in; > + > + if (mode == SImode) > + return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32)); > + > + /* Replicate small immediates to fit 64 bits. */ > + int size = GET_MODE_UNIT_PRECISION (mode); > + val &= (HOST_WIDE_INT_1U << size) - 1; > + val *= bitmask_imm_mul[__builtin_clz (size) - 26]; > + > + return aarch64_bitmask_imm (val); > +} > + > + > +/* Return true if 64-bit val is a valid bitmask immediate. */ > + > +static bool > +aarch64_bitmask_imm (unsigned HOST_WIDE_INT val) > +{ > + unsigned HOST_WIDE_INT tmp, mask, first_one, next_one; > int bits; > > /* Check for a single sequence of one bits and return quickly if so. > The special cases of all ones and all zeroes returns false. */ > - val = aarch64_replicate_bitmask_imm (val_in, mode); > tmp = val + (val & -val); > > if (tmp == (tmp & -tmp)) > return (val + 1) > 1; > > - /* Replicate 32-bit immediates so we can treat them as 64-bit. */ > - if (mode == SImode) > - val = (val << 32) | (val & 0xffffffff); > - > /* Invert if the immediate doesn't start with a zero bit - this means we > only need to search for sequences of one bits. */ > if (val & 1) > diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c > b/gcc/testsuite/gcc.target/aarch64/pr106583.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..0f931580817d78dc1cc58f03b251bd21bec71f59 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c > @@ -0,0 +1,41 @@ > +/* { dg-do assemble } */ > +/* { dg-options "-O2 --save-temps" } */ > + > +long f1 (void) > +{ > + return 0x7efefefefefefeff; > +} > + > +long f2 (void) > +{ > + return 0x12345678aaaaaaaa; > +} > + > +long f3 (void) > +{ > + return 0x1234cccccccc5678; > +} > + > +long f4 (void) > +{ > + return 0x7777123456787777; > +} > + > +long f5 (void) > +{ > + return 0x5555555512345678; > +} > + > +long f6 (void) > +{ > + return 0x1234bbbb5678bbbb; > +} > + > +long f7 (void) > +{ > + return 0x4444123444445678; > +} > + > + > +/* { dg-final { scan-assembler-times {\tmovk\t} 14 } } */ > +/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */