Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> Improve immediate expansion of immediates which can be created from a
> bitmask immediate and 2 MOVKs.  This reduces the number of 4-instruction
> immediates in SPECINT/FP by 10-15%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>
>         PR target/106583
>         * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
>         Add support for a bitmask immediate with 2 MOVKs.
>
> gcc/testsuite:
>         PR target/106583
>         * gcc.target/aarch64/pr106583.c: Add new test.
Nice.  Did you consider handling the case where the movks aren't for
consecutive bit ranges?  E.g. the patch handles:

  0x12345678aaaaaaaa

and:

  0x1234cccccccc5678

but it looks like it would be fairly easy to extend it to:

  0x1234cccc5678cccc

too.

Also, could you commonise:

  val2 = val & ~mask;
  if (val2 != val && aarch64_bitmask_imm (val2, mode))
    break;
  val2 = val | mask;
  if (val2 != val && aarch64_bitmask_imm (val2, mode))
    break;
  val2 = val2 & ~mask;
  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
  if (val2 != val && aarch64_bitmask_imm (val2, mode))
    break;

?  It's subtle enough that IMO it'd be better not to cut-&-paste it.

Thanks,
Richard

> --- > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 926e81f028c82aac9a5fecc18f921f84399c24ae..1601d11710cb6132c80a77bb4fe2f8429519aa5a > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -5568,7 +5568,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool > generate, > one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) + > ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0); > > - if (zero_match != 2 && one_match != 2) > + if (zero_match < 2 && one_match < 2) > { > /* Try emitting a bitmask immediate with a movk replacing 16 bits. > For a 64-bit bitmask try whether changing 16 bits to all ones or > @@ -5600,6 +5600,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, > bool generate, > } > } > > + /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.
> */ > + if (zero_match + one_match == 0) > + { > + mask = 0xffffffff; > + > + for (i = 0; i < 64; i += 16) > + { > + val2 = val & ~mask; > + if (aarch64_bitmask_imm (val2, mode)) > + break; > + val2 = val | mask; > + if (aarch64_bitmask_imm (val2, mode)) > + break; > + val2 = val2 & ~mask; > + val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask); > + if (aarch64_bitmask_imm (val2, mode)) > + break; > + > + mask = (mask << 16) | (mask >> 48); > + } > + > + if (i != 64) > + { > + if (generate) > + { > + emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); > + emit_insn (gen_insv_immdi (dest, GEN_INT (i), > + GEN_INT ((val >> i) & 0xffff))); > + i = (i + 16) & 63; > + emit_insn (gen_insv_immdi (dest, GEN_INT (i), > + GEN_INT ((val >> i) & 0xffff))); > + } > + > + return 3; > + } > + } > + > /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which > are emitted by the initial mov. If one_match > zero_match, skip set > bits, > otherwise skip zero bits. */ > diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c > b/gcc/testsuite/gcc.target/aarch64/pr106583.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..f0a027a0950e506d4ddaacce5e151f57070948dc > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c > @@ -0,0 +1,30 @@ > +/* { dg-do assemble } */ > +/* { dg-options "-O2 --save-temps" } */ > + > +long f1 (void) > +{ > + return 0x7efefefefefefeff; > +} > + > +long f2 (void) > +{ > + return 0x12345678aaaaaaaa; > +} > + > +long f3 (void) > +{ > + return 0x1234cccccccc5678; > +} > + > +long f4 (void) > +{ > + return 0x7777123456787777; > +} > + > +long f5 (void) > +{ > + return 0x5555555512345678; > +} > + > +/* { dg-final { scan-assembler-times {\tmovk\t} 10 } } */ > +/* { dg-final { scan-assembler-times {\tmov\t} 5 } } */