> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Wednesday, March 8, 2023 9:18 AM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; Marcus Shawcroft <marcus.shawcr...@arm.com>;
> Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: Re: [PATCH 4/4]AArch64 Update div-bitmask to implement new
> optab instead of target hook [PR108583]
>
> Tamar Christina <tamar.christ...@arm.com> writes:
> > Ping,
> >
> > And updating the hook.
> >
> > There are no new tests, as new correctness tests were added to the
> > mid-end and the existing codegen tests for this already exist.
> >
> > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	PR target/108583
> > 	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): Remove.
> > 	(*bitmask_shift_plus<mode>): New.
> > 	* config/aarch64/aarch64-sve2.md (*bitmask_shift_plus<mode>): New.
> > 	(@aarch64_bitmask_udiv<mode>3): Remove.
> > 	* config/aarch64/aarch64.cc
> > 	(aarch64_vectorize_can_special_div_by_constant,
> > 	TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Removed.
> > 	(TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT,
> > 	aarch64_vectorize_preferred_div_as_shifts_over_mult): New.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 7f212bf37cd2c120dceb7efa733c9fa76226f029..e1ecb88634f93d380ef534093ea6599dc7278108 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -4867,60 +4867,27 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
> >   }
> > )
> >
> > -;; div optimizations using narrowings
> > -;; we can do the division e.g. shorts by 255 faster by calculating it as
> > -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> > -;; double the precision of x.
> > -;;
> > -;; If we imagine a short as being composed of two blocks of bytes then
> > -;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
> > -;; adding 1 to each sub component:
> > -;;
> > -;;       short value of 16-bits
> > -;; ┌──────────────┬────────────────┐
> > -;; │              │                │
> > -;; └──────────────┴────────────────┘
> > -;;   8-bit part1 ▲  8-bit part2   ▲
> > -;;               │                │
> > -;;               │                │
> > -;;              +1               +1
> > -;;
> > -;; after the first addition, we have to shift right by 8, and narrow the
> > -;; results back to a byte.  Remember that the addition must be done in
> > -;; double the precision of the input.  Since 8 is half the size of a short
> > -;; we can use a narrowing halfing instruction in AArch64, addhn which also
> > -;; does the addition in a wider precision and narrows back to a byte.  The
> > -;; shift itself is implicit in the operation as it writes back only the top
> > -;; half of the result. i.e. bits 2*esize-1:esize.
> > -;;
> > -;; Since we have narrowed the result of the first part back to a byte, for
> > -;; the second addition we can use a widening addition, uaddw.
> > -;;
> > -;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
> > -;;
> > -;; The shift is later optimized by combine to a uzp2 with movi #0.
> > -(define_expand "@aarch64_bitmask_udiv<mode>3"
> > -  [(match_operand:VQN 0 "register_operand")
> > -   (match_operand:VQN 1 "register_operand")
> > -   (match_operand:VQN 2 "immediate_operand")]
> > +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector
> > +(define_insn_and_split "*bitmask_shift_plus<mode>"
> > +  [(set (match_operand:VQN 0 "register_operand" "=&w")
> > +	(plus:VQN
> > +	  (lshiftrt:VQN
> > +	    (plus:VQN (match_operand:VQN 1 "register_operand" "w")
> > +		      (match_operand:VQN 2 "register_operand" "w"))
> > +	    (match_operand:VQN 3
> > +	       "aarch64_simd_shift_imm_vec_exact_top" "Dr"))
>
> I guess this is personal preference, sorry, but I think we should drop the
> constraint.  The predicate does the real check, and the operand is never
> reloaded, so "Dr" isn't any more helpful than an empty constraint, and IMO
> can be confusing.
>
> > +	  (match_operand:VQN 4 "register_operand" "w")))]
> >    "TARGET_SIMD"
> > +  "#"
> > +  "&& true"
> > +  [(const_int 0)]
> >  {
> > -  unsigned HOST_WIDE_INT size
> > -    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> > -  rtx elt = unwrap_const_vec_duplicate (operands[2]);
> > -  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
> > -    FAIL;
> > -
> > -  rtx addend = gen_reg_rtx (<MODE>mode);
> > -  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
> > -  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
> > -  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> > -  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> > -  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> > -  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> > -  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
> > -  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
> > -  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
> > +  rtx tmp;
> > +  if (can_create_pseudo_p ())
> > +    tmp = gen_reg_rtx (<VNARROWQ>mode);
> > +  else
> > +    tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0]));
> > +  emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2]));
> > +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp));
> >    DONE;
> >  })
>
> In the previous review, I said:
>
>   However, IIUC, this pattern would only be formed from combining
>   three distinct patterns.  Is that right?  If so, we should be able
>   to handle it as a plain define_split, with no define_insn.
>   That should make things simpler, so would be worth trying before
>   the changes I mentioned above.
>
> Did you try that?  I still think it'd be preferable to defining a new insn.
Yes, I did!  Sorry, I forgot to mention that.  When I made it a split, for
some reason it stopped matching.

Regards,
Tamar

> > diff --git a/gcc/config/aarch64/aarch64-sve2.md
> > b/gcc/config/aarch64/aarch64-sve2.md
> > index 40c0728a7e6f00c395c360ce7625bc2e4a018809..bed44d7d6873877386222d56144cc115e3953a61 100644
> > --- a/gcc/config/aarch64/aarch64-sve2.md
> > +++ b/gcc/config/aarch64/aarch64-sve2.md
> > @@ -2317,41 +2317,24 @@ (define_insn "@aarch64_sve_<optab><mode>"
> >  ;; ---- [INT] Misc optab implementations
> >  ;; -------------------------------------------------------------------------
> >  ;; Includes:
> > -;; - aarch64_bitmask_udiv
> > +;; - bitmask_shift_plus
>
> This is no longer an optab.
>
> The original purpose of the "Includes:" comments was to list the ISA
> instructions that are actually being generated, as a short-cut to working
> through all the abstractions.  Just listing define_insn names doesn't
> really add anything over reading the insns themselves.
>
> Since the new pattern is an alternative way of generating ADDHNB, it
> probably belongs in the "Narrowing binary arithmetic" section.
>
> >  ;; -------------------------------------------------------------------------
> >
> > -;; div optimizations using narrowings
> > -;; we can do the division e.g. shorts by 255 faster by calculating it as
> > -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> > -;; double the precision of x.
> > -;;
> > -;; See aarch64-simd.md for bigger explanation.
> > -(define_expand "@aarch64_bitmask_udiv<mode>3"
> > -  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
> > -   (match_operand:SVE_FULL_HSDI 1 "register_operand")
> > -   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
> > +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector
> > +(define_insn "*bitmask_shift_plus<mode>"
> > +  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
> > +	(unspec:SVE_FULL_HSDI
> > +	  [(match_operand:<VPRED> 1)
> > +	   (lshiftrt:SVE_FULL_HSDI
> > +	     (plus:SVE_FULL_HSDI
> > +	       (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> > +	       (match_operand:SVE_FULL_HSDI 3 "register_operand" "w"))
> > +	     (match_operand:SVE_FULL_HSDI 4
> > +		"aarch64_simd_shift_imm_vec_exact_top" "Dr"))]
>
> Same comment about the constraints here.
>
> > +	  UNSPEC_PRED_X))]
> >    "TARGET_SVE2"
> > -{
> > -  unsigned HOST_WIDE_INT size
> > -    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
> > -  rtx elt = unwrap_const_vec_duplicate (operands[2]);
> > -  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
> > -    FAIL;
> > -
> > -  rtx addend = gen_reg_rtx (<MODE>mode);
> > -  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
> > -  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
> > -  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
> > -  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
> > -  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
> > -			      addend));
> > -  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
> > -			      lowpart_subreg (<MODE>mode, tmp1,
> > -					      <VNARROW>mode)));
> > -  emit_move_insn (operands[0],
> > -		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
> > -  DONE;
> > -})
> > +  "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
> > +)
>
> The pattern LGTM otherwise.
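As a scalar illustration of what the new pattern computes (one 16-bit lane,
n = 8): ((a + b) >> n) + c decomposes into a narrowing high-half add followed
by a widening add, which is what the Advanced SIMD split above emits as addhn
+ uaddw; the SVE2 pattern above matches just the inner (a + b) >> n and emits
addhnb directly.  The helper names in this sketch are invented for
illustration only and are not GCC or ACLE functions.

#include <cstdint>

/* addhn-style step: wrapping 16-bit add, keep only the high half,
   narrowed to 8 bits.  The shift by 8 is implicit in taking the
   high half.  */
uint8_t
add_high_narrow (uint16_t a, uint16_t b)
{
  return (uint16_t) (a + b) >> 8;
}

/* uaddw-style step: zero-extend the narrow value and add it to the
   wide operand, wrapping to 16 bits.  */
uint16_t
add_wide (uint16_t c, uint8_t narrow)
{
  return c + narrow;
}

/* ((a + b) >> 8) + c is just the composition of the two steps.  */
uint16_t
bitmask_shift_plus (uint16_t a, uint16_t b, uint16_t c)
{
  return add_wide (c, add_high_narrow (a, b));
}

Both steps wrap modulo 2^16, matching the modular semantics of the vector
plus and lshiftrt in the RTL pattern.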
> >
> >  ;; =========================================================================
> >  ;; == Permutation
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc
> > index e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..eb4f99ee524844ed5b3684c6fe807a4128685423 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3849,6 +3849,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
> >    return default_vectorize_related_mode (vector_mode, element_mode,
> > 					  nunits);
> >  }
> >
> > +/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
> > +
> > +static bool
> > +aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
> > +{
> > +  machine_mode mode = TYPE_MODE (type);
> > +  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
> > +  bool sve_p = (vec_flags & VEC_ANY_SVE);
> > +  bool simd_p = (vec_flags & VEC_ADVSIMD);
> > +
> > +  return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
> > +}
> > +
>
> And the hook LGTM too.
>
> Thanks,
> Richard
>
> >  /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
> >     prefer to use the first arithmetic operand as the else value if
> >     the else value doesn't matter, since that exactly matches the SVE
> > @@ -24363,46 +24376,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >
> >    return ret;
> >  }
> > -
> > -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> > -
> > -bool
> > -aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> > -					       tree vectype, wide_int cst,
> > -					       rtx *output, rtx in0, rtx in1)
> > -{
> > -  if (code != TRUNC_DIV_EXPR
> > -      || !TYPE_UNSIGNED (vectype))
> > -    return false;
> > -
> > -  machine_mode mode = TYPE_MODE (vectype);
> > -  unsigned int flags = aarch64_classify_vector_mode (mode);
> > -  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> > -    return false;
> > -
> > -  int pow = wi::exact_log2 (cst + 1);
> > -  auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
> > -  /* SVE actually has a div operator, we may have gotten here through
> > -     that route.  */
> > -  if (pow != (int) (element_precision (vectype) / 2)
> > -      || insn_code == CODE_FOR_nothing)
> > -    return false;
> > -
> > -  /* We can use the optimized pattern.  */
> > -  if (in0 == NULL_RTX && in1 == NULL_RTX)
> > -    return true;
> > -
> > -  gcc_assert (output);
> > -
> > -  expand_operand ops[3];
> > -  create_output_operand (&ops[0], *output, mode);
> > -  create_input_operand (&ops[1], in0, mode);
> > -  create_fixed_operand (&ops[2], in1);
> > -  expand_insn (insn_code, 3, ops);
> > -  *output = ops[0].value;
> > -  return true;
> > -}
> > -
> >  /* Generate a byte permute mask for a register of mode MODE,
> >     which has NUNITS units.  */
> >
> > @@ -27904,13 +27877,13 @@ aarch64_libgcc_floating_mode_supported_p
> >  #undef TARGET_MAX_ANCHOR_OFFSET
> >  #define TARGET_MAX_ANCHOR_OFFSET 4095
> >
> > +#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
> > +#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
> > +  aarch64_vectorize_preferred_div_as_shifts_over_mult
> > +
> >  #undef TARGET_VECTOR_ALIGNMENT
> >  #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
> >
> > -#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > -#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
> > -  aarch64_vectorize_can_special_div_by_constant
> > -
> >  #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
> >  #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
> >    aarch64_vectorize_preferred_vector_alignment
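The arithmetic identity behind the comment block that the patch removes from
aarch64-simd.md (dividing a 16-bit value by 255 via two additions and shifts
done in double precision) can be checked exhaustively.  A minimal standalone
sketch, assuming 16-bit inputs and 32-bit intermediate arithmetic:

#include <cstdint>
#include <cstdio>

int
main ()
{
  for (uint32_t x = 0; x <= UINT16_MAX; ++x)
    {
      /* All arithmetic in 32 bits, i.e. double the width of x.  */
      uint32_t via_shifts = (x + ((x + 257) >> 8)) >> 8;
      if (via_shifts != x / 255)
	{
	  printf ("mismatch at %u\n", (unsigned) x);
	  return 1;
	}
    }
  printf ("x / 255 == (x + ((x + 257) >> 8)) >> 8 for all 16-bit x\n");
  return 0;
}

The same reasoning generalises to dividing a 2*n-bit element by 2^n - 1,
which is the case the removed @aarch64_bitmask_udiv<mode>3 expanders handled
and the new *bitmask_shift_plus<mode> patterns now pick up in shift form.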