> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Wednesday, March 8, 2023 9:18 AM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; Marcus Shawcroft <marcus.shawcr...@arm.com>;
> Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: Re: [PATCH 4/4]AArch64 Update div-bitmask to implement new
> optab instead of target hook [PR108583]
>
> Tamar Christina <tamar.christ...@arm.com> writes:
> > Ping,
> >
> > And updating the hook.
> >
> > There are no new tests, as new correctness tests were added to the
> > mid-end and the existing codegen tests for this already exist.
> >
> > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	PR target/108583
> > 	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): Remove.
> > 	(*bitmask_shift_plus<mode>): New.
> > 	* config/aarch64/aarch64-sve2.md (*bitmask_shift_plus<mode>): New.
> > 	(@aarch64_bitmask_udiv<mode>3): Remove.
> > 	* config/aarch64/aarch64.cc
> > 	(aarch64_vectorize_can_special_div_by_constant,
> > 	TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Removed.
> > 	(TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT,
> > 	aarch64_vectorize_preferred_div_as_shifts_over_mult): New.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 7f212bf37cd2c120dceb7efa733c9fa76226f029..e1ecb88634f93d380ef534093ea6599dc7278108 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -4867,60 +4867,27 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
> >   }
> > )
> >
> > -;; div optimizations using narrowings
> > -;; we can do the division e.g. shorts by 255 faster by calculating it as
> > -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> > -;; double the precision of x.
> > -;;
> > -;; If we imagine a short as being composed of two blocks of bytes then
> > -;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
> > -;; adding 1 to each sub component:
> > -;;
> > -;;       short value of 16-bits
> > -;; ┌──────────────┬────────────────┐
> > -;; │              │                │
> > -;; └──────────────┴────────────────┘
> > -;;   8-bit part1 ▲  8-bit part2   ▲
> > -;;               │                │
> > -;;               │                │
> > -;;              +1               +1
> > -;;
> > -;; after the first addition, we have to shift right by 8, and narrow the
> > -;; results back to a byte.  Remember that the addition must be done in
> > -;; double the precision of the input.  Since 8 is half the size of a short
> > -;; we can use a narrowing halfing instruction in AArch64, addhn which also
> > -;; does the addition in a wider precision and narrows back to a byte.  The
> > -;; shift itself is implicit in the operation as it writes back only the top
> > -;; half of the result. i.e. bits 2*esize-1:esize.
> > -;;
> > -;; Since we have narrowed the result of the first part back to a byte, for
> > -;; the second addition we can use a widening addition, uaddw.
> > -;;
> > -;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
> > -;;
> > -;; The shift is later optimized by combine to a uzp2 with movi #0.
> > -(define_expand "@aarch64_bitmask_udiv<mode>3"
> > -  [(match_operand:VQN 0 "register_operand")
> > -   (match_operand:VQN 1 "register_operand")
> > -   (match_operand:VQN 2 "immediate_operand")]
> > +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector
> > +(define_insn_and_split "*bitmask_shift_plus<mode>"
> > +  [(set (match_operand:VQN 0 "register_operand" "=&w")
> > +	(plus:VQN
> > +	  (lshiftrt:VQN
> > +	    (plus:VQN (match_operand:VQN 1 "register_operand" "w")
> > +		      (match_operand:VQN 2 "register_operand" "w"))
> > +	    (match_operand:VQN 3
> > +	       "aarch64_simd_shift_imm_vec_exact_top" "Dr"))
>
> I guess this is personal preference, sorry, but I think we should drop the
> constraint.  The predicate does the real check, and the operand is never
> reloaded, so "Dr" isn't any more helpful than an empty constraint, and IMO
> can be confusing.
>
> > +	  (match_operand:VQN 4 "register_operand" "w")))]
> >    "TARGET_SIMD"
> > +  "#"
> > +  "&& true"
> > +  [(const_int 0)]
> >  {
> > -  unsigned HOST_WIDE_INT size
> > -    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> > -  rtx elt = unwrap_const_vec_duplicate (operands[2]);
> > -  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
> > -    FAIL;
> > -
> > -  rtx addend = gen_reg_rtx (<MODE>mode);
> > -  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
> > -  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
> > -  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> > -  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> > -  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> > -  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> > -  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
> > -  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
> > -  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
> > +  rtx tmp;
> > +  if (can_create_pseudo_p ())
> > +    tmp = gen_reg_rtx (<VNARROWQ>mode);
> > +  else
> > +    tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0]));
> > +  emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2]));
> > +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp));
> >    DONE;
> >  })
>
> In the previous review, I said:
>
>   However, IIUC, this pattern would only be formed from combining
>   three distinct patterns.  Is that right?  If so, we should be able
>   to handle it as a plain define_split, with no define_insn.
>   That should make things simpler, so would be worth trying before
>   the changes I mentioned above.
>
> Did you try that?  I still think it'd be preferable to defining a new insn.
Yes, I did!  Sorry, I forgot to mention that.  When I made it a split, for
some reason it stopped matching.

Regards,
Tamar

> > diff --git a/gcc/config/aarch64/aarch64-sve2.md
> > b/gcc/config/aarch64/aarch64-sve2.md
> > index 40c0728a7e6f00c395c360ce7625bc2e4a018809..bed44d7d6873877386222d56144cc115e3953a61 100644
> > --- a/gcc/config/aarch64/aarch64-sve2.md
> > +++ b/gcc/config/aarch64/aarch64-sve2.md
> > @@ -2317,41 +2317,24 @@ (define_insn "@aarch64_sve_<optab><mode>"
> >  ;; ---- [INT] Misc optab implementations
> >  ;; -------------------------------------------------------------------------
> >  ;; Includes:
> > -;; - aarch64_bitmask_udiv
> > +;; - bitmask_shift_plus
>
> This is no longer an optab.
>
> The original purpose of the "Includes:" comments was to list the ISA
> instructions that are actually being generated, as a short-cut to working
> through all the abstractions.  Just listing define_insn names doesn't
> really add anything over reading the insns themselves.
>
> Since the new pattern is an alternative way of generating ADDHNB, it
> probably belongs in the "Narrowing binary arithmetic" section.
>
> >  ;; -------------------------------------------------------------------------
> >
> > -;; div optimizations using narrowings
> > -;; we can do the division e.g. shorts by 255 faster by calculating it as
> > -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> > -;; double the precision of x.
> > -;;
> > -;; See aarch64-simd.md for bigger explanation.
> > -(define_expand "@aarch64_bitmask_udiv<mode>3"
> > -  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
> > -   (match_operand:SVE_FULL_HSDI 1 "register_operand")
> > -   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
> > +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector
> > +(define_insn "*bitmask_shift_plus<mode>"
> > +  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
> > +	(unspec:SVE_FULL_HSDI
> > +	  [(match_operand:<VPRED> 1)
> > +	   (lshiftrt:SVE_FULL_HSDI
> > +	     (plus:SVE_FULL_HSDI
> > +	       (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> > +	       (match_operand:SVE_FULL_HSDI 3 "register_operand" "w"))
> > +	     (match_operand:SVE_FULL_HSDI 4
> > +		"aarch64_simd_shift_imm_vec_exact_top" "Dr"))]
>
> Same comment about the constraints here.
>
> > +	  UNSPEC_PRED_X))]
> >    "TARGET_SVE2"
> > -{
> > -  unsigned HOST_WIDE_INT size
> > -    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
> > -  rtx elt = unwrap_const_vec_duplicate (operands[2]);
> > -  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
> > -    FAIL;
> > -
> > -  rtx addend = gen_reg_rtx (<MODE>mode);
> > -  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
> > -  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
> > -  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
> > -  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
> > -  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
> > -			      addend));
> > -  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
> > -			      lowpart_subreg (<MODE>mode, tmp1,
> > -					      <VNARROW>mode)));
> > -  emit_move_insn (operands[0],
> > -		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
> > -  DONE;
> > -})
> > +  "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
> > +)
>
> The pattern LGTM otherwise.
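As a scalar illustration of what the new pattern computes (one 16-bit lane,
n = 8): ((a + b) >> n) + c decomposes into a narrowing high-half add followed
by a widening add, which is what the Advanced SIMD split above emits as addhn
+ uaddw; the SVE2 pattern above matches just the inner (a + b) >> n and emits
addhnb directly.  The helper names in this sketch are invented for
illustration only and are not GCC or ACLE functions.

#include <cstdint>

/* addhn-style step: wrapping 16-bit add, keep only the high half,
   narrowed to 8 bits.  The shift by 8 is implicit in taking the
   high half.  */
uint8_t
add_high_narrow (uint16_t a, uint16_t b)
{
  return (uint16_t) (a + b) >> 8;
}

/* uaddw-style step: zero-extend the narrow value and add it to the
   wide operand, wrapping to 16 bits.  */
uint16_t
add_wide (uint16_t c, uint8_t narrow)
{
  return c + narrow;
}

/* ((a + b) >> 8) + c is just the composition of the two steps.  */
uint16_t
bitmask_shift_plus (uint16_t a, uint16_t b, uint16_t c)
{
  return add_wide (c, add_high_narrow (a, b));
}

Both steps wrap modulo 2^16, matching the modular semantics of the vector
plus and lshiftrt in the RTL pattern.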
> >
> >  ;; =========================================================================
> >  ;; == Permutation
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc
> > index e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..eb4f99ee524844ed5b3684c6fe807a4128685423 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3849,6 +3849,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
> >    return default_vectorize_related_mode (vector_mode, element_mode,
> > 					  nunits);
> >  }
> >
> > +/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
> > +
> > +static bool
> > +aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
> > +{
> > +  machine_mode mode = TYPE_MODE (type);
> > +  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
> > +  bool sve_p = (vec_flags & VEC_ANY_SVE);
> > +  bool simd_p = (vec_flags & VEC_ADVSIMD);
> > +
> > +  return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
> > +}
> > +
>
> And the hook LGTM too.
>
> Thanks,
> Richard
>
> >  /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
> >     prefer to use the first arithmetic operand as the else value if
> >     the else value doesn't matter, since that exactly matches the SVE
> > @@ -24363,46 +24376,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >
> >    return ret;
> >  }
> > -
> > -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> > -
> > -bool
> > -aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> > -					       tree vectype, wide_int cst,
> > -					       rtx *output, rtx in0, rtx in1)
> > -{
> > -  if (code != TRUNC_DIV_EXPR
> > -      || !TYPE_UNSIGNED (vectype))
> > -    return false;
> > -
> > -  machine_mode mode = TYPE_MODE (vectype);
> > -  unsigned int flags = aarch64_classify_vector_mode (mode);
> > -  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> > -    return false;
> > -
> > -  int pow = wi::exact_log2 (cst + 1);
> > -  auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
> > -  /* SVE actually has a div operator, we may have gotten here through
> > -     that route.  */
> > -  if (pow != (int) (element_precision (vectype) / 2)
> > -      || insn_code == CODE_FOR_nothing)
> > -    return false;
> > -
> > -  /* We can use the optimized pattern.  */
> > -  if (in0 == NULL_RTX && in1 == NULL_RTX)
> > -    return true;
> > -
> > -  gcc_assert (output);
> > -
> > -  expand_operand ops[3];
> > -  create_output_operand (&ops[0], *output, mode);
> > -  create_input_operand (&ops[1], in0, mode);
> > -  create_fixed_operand (&ops[2], in1);
> > -  expand_insn (insn_code, 3, ops);
> > -  *output = ops[0].value;
> > -  return true;
> > -}
> > -
> >  /* Generate a byte permute mask for a register of mode MODE,
> >     which has NUNITS units.  */
> >
> > @@ -27904,13 +27877,13 @@ aarch64_libgcc_floating_mode_supported_p
> >  #undef TARGET_MAX_ANCHOR_OFFSET
> >  #define TARGET_MAX_ANCHOR_OFFSET 4095
> >
> > +#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
> > +#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
> > +  aarch64_vectorize_preferred_div_as_shifts_over_mult
> > +
> >  #undef TARGET_VECTOR_ALIGNMENT
> >  #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
> >
> > -#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > -#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
> > -  aarch64_vectorize_can_special_div_by_constant
> > -
> >  #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
> >  #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
> >    aarch64_vectorize_preferred_vector_alignment
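The arithmetic identity behind the comment block that the patch removes from
aarch64-simd.md (dividing a 16-bit value by 255 via two additions and shifts
done in double precision) can be checked exhaustively.  A minimal standalone
sketch, assuming 16-bit inputs and 32-bit intermediate arithmetic:

#include <cstdint>
#include <cstdio>

int
main ()
{
  for (uint32_t x = 0; x <= UINT16_MAX; ++x)
    {
      /* All arithmetic in 32 bits, i.e. double the width of x.  */
      uint32_t via_shifts = (x + ((x + 257) >> 8)) >> 8;
      if (via_shifts != x / 255)
	{
	  printf ("mismatch at %u\n", (unsigned) x);
	  return 1;
	}
    }
  printf ("x / 255 == (x + ((x + 257) >> 8)) >> 8 for all 16-bit x\n");
  return 0;
}

The same reasoning generalises to dividing a 2*n-bit element by 2^n - 1,
which is the case the removed @aarch64_bitmask_udiv<mode>3 expanders handled
and the new *bitmask_shift_plus<mode> patterns now pick up in shift form.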