Hi Richard, This is the updated patch with your comments. In addition to that, I removed vectype_in from the build_vect_cond_expr call, since it wasn't really necessary.
Alejandro > -----Original Message----- > From: Richard Sandiford <richard.sandif...@arm.com> > Sent: 26 April 2019 14:29 > To: Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> > Cc: GCC Patches <gcc-patches@gcc.gnu.org>; nd <n...@arm.com>; Richard > Biener <richard.guent...@gmail.com> > Subject: Re: [Aarch64][SVE] Dot product support > > Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes: > > Hi, > > > > This patch does two things. For the general vectorizer, it adds > > support to perform fully masked reductions over expressions that don't > support masking. > > This is achieved by using VEC_COND_EXPR where possible. At the moment > > this is implemented for DOT_PROD_EXPR only, but the framework is there > > to extend it to other expressions. > > > > Related to that, this patch adds support to vectorize dot product > > using SVE. It also uses the new functionality to ensure that the resulting > loop is masked. > > > > Given this input code: > > > > uint32_t > > dotprod (uint8_t *restrict x, uint8_t *restrict y, int n) { > > uint32_t sum = 0; > > > > for (int i = 0; i < n; i++) > > { > > sum += x[i] * y[i]; > > } > > > > return sum; > > } > > > > The resulting SVE code is: > > > > 0000000000000000 <dotprod>: > > 0: 7100005f cmp w2, #0x0 > > 4: 5400024d b.le 4c <dotprod+0x4c> > > 8: d2800003 mov x3, #0x0 // #0 > > c: 93407c42 sxtw x2, w2 > > 10: 2538c001 mov z1.b, #0 > > 14: 25221fe0 whilelo p0.b, xzr, x2 > > 18: 2538c003 mov z3.b, #0 > > 1c: d503201f nop > > 20: a4034002 ld1b {z2.b}, p0/z, [x0, x3] > > 24: a4034020 ld1b {z0.b}, p0/z, [x1, x3] > > 28: 0430e3e3 incb x3 > > 2c: 0523c000 sel z0.b, p0, z0.b, z3.b > > 30: 25221c60 whilelo p0.b, x3, x2 > > 34: 44820401 udot z1.s, z0.b, z2.b > > 38: 54ffff41 b.ne 20 <dotprod+0x20> // b.any > > 3c: 2598e3e0 ptrue p0.s > > 40: 04812021 uaddv d1, p0, z1.s > > 44: 1e260020 fmov w0, s1 > > 48: d65f03c0 ret > > 4c: 1e2703e1 fmov s1, wzr > > 50: 1e260020 fmov w0, s1 > > 54: d65f03c0 ret > > > > Notice how udot is used inside a fully masked loop. > > > > I tested this patch in an aarch64 machine bootstrapping the compiler > > and running the checks. > > > > I admit it is too late to merge this into gcc 9, but I'm posting it > > anyway so it can be considered for gcc 10. > > > > Alejandro > > > > > > gcc/Changelog: > > > > 2019-01-31 Alejandro Martinez <alejandro.martinezvice...@arm.com> > > > > * config/aarch64/aarch64-sve.md (<sur>dot_prod<vsi2qi>): Taken > from SVE > > ACLE branch. > > * config/aarch64/iterators.md: Copied Vetype_fourth, VSI2QI and > vsi2qi from > > SVE ACLE branch. > > * tree-vect-loop.c (use_mask_by_cond_expr_p): New function to > check if a > > VEC_COND_EXPR be inserted to emulate a conditional internal > function. > > (build_vect_cond_expr): Emit the VEC_COND_EXPR. > > (vectorizable_reduction): Use the functions above to vectorize in a > > fully masked loop codes that don't have a conditional internal > > function. > > > > gcc/testsuite/Changelog: > > > > 2019-01-31 Alejandro Martinez <alejandro.martinezvice...@arm.com> > > > > * gcc.target/aarch64/sve/dot_1.c: New test for dot product. > > > > diff --git a/gcc/config/aarch64/aarch64-sve.md > > b/gcc/config/aarch64/aarch64-sve.md > > index 5bb3422..2779a21 100644 > > --- a/gcc/config/aarch64/aarch64-sve.md > > +++ b/gcc/config/aarch64/aarch64-sve.md > > @@ -3128,3 +3128,17 @@ > > DONE; > > } > > ) > > + > > +;; Unpredicated DOT product. > > +(define_insn "<sur>dot_prod<vsi2qi>" > > + [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") > > + (plus:SVE_SDI (unspec:SVE_SDI [(match_operand:<VSI2QI> 1 > "register_operand" "w, w") > > + (match_operand:<VSI2QI> 2 > "register_operand" "w, w")] > > + DOTPROD) > > + (match_operand:SVE_SDI 3 "register_operand" "0, w")))] > > + "TARGET_SVE" > > + "@ > > + <sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth> > > + > movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vet > ype_fourth>" > > + [(set_attr "movprfx" "*,yes")] > > +) > > diff --git a/gcc/config/aarch64/iterators.md > > b/gcc/config/aarch64/iterators.md index 85fa161..faf588a 100644 > > --- a/gcc/config/aarch64/iterators.md > > +++ b/gcc/config/aarch64/iterators.md > > @@ -663,6 +663,9 @@ > > (QI "b") (HI "h") > > (SI "s") (DI "d")]) > > > > +;; Mode to one fourth individual element type mapping used in instruction > DOT. > > +(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")]) > > Maybe better as: > > ;; Like Vetype, but map to types that are a quarter of the element size. > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c > > b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c > > new file mode 100644 > > index 0000000..8ff6671 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c > > @@ -0,0 +1,39 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -ftree-vectorize" } */ > > + > > +#include <stdint.h> > > + > > +#define DEF_DOT(TYPE1, TYPE2) > \ > > +TYPE1 __attribute__ ((noinline, noclone)) \ > > +dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) > > \ > > +{ \ > > + TYPE1 sum = 0; \ > > + for (int i = 0; i < n; i++) > > \ > > + { > > \ > > + sum += x[i] * y[i]; \ > > + } > > \ > > + return sum; > \ > > +} > > + > > +DEF_DOT(uint32_t, uint8_t) > > +DEF_DOT(int32_t, int8_t) > > +DEF_DOT(int64_t, int16_t) > > + > > +/* The uint16_t->uint64_t dot product requires a casting to satisfy the C > > + language rules. */ > > +uint64_t __attribute__ ((noinline, noclone)) dot_uint64_t_uint16_t > > +(uint16_t *restrict x, uint16_t *restrict y, int n) { > > + uint64_t sum = 0; > > + for (int i = 0; i < n; i++) > > + { > > + sum += (unsigned int)x[i] * y[i]; > > + } > > + return sum; > > +} > > + > > +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, > > +z[0-9]+\.b\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, > > +z[0-9]+\.b\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, > > +z[0-9]+\.h\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, > > +z[0-9]+\.h\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */ > > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index > > 202cab9..2278e8b 100644 > > --- a/gcc/tree-vect-loop.c > > +++ b/gcc/tree-vect-loop.c > > @@ -5885,6 +5885,52 @@ is_nonwrapping_integer_induction > (stmt_vec_info stmt_vinfo, struct loop *loop) > > <= TYPE_PRECISION (lhs_type)); > > } > > > > +/* If there is no conditional internal function availabe to vectorize > > +CODE, can > > available > > > + a VEC_COND_EXPR be inserted to emulate one? */ static bool > > +use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn, > > + tree vectype_in) > > Need to document the COND_FN and VECTYPE_IN parameters too. > > > +{ > > + if (cond_fn != IFN_LAST > > + && direct_internal_fn_supported_p (cond_fn, vectype_in, > > + OPTIMIZE_FOR_SPEED)) > > + return false; > > + > > + switch (code) > > + { > > + case DOT_PROD_EXPR: > > + return true; > > + > > + default: > > + return false; > > + } > > +} > > + > > +/* Create a vectorized conditional version of code by using a > > +non-conditional > > ...of CODE by... > > > + vector expression plus any VEC_COND_EXPR that may be needed. */ > > +static void build_vect_cond_expr (enum tree_code code, tree vop[3], > > +tree mask, > > + tree vectype_in, gimple_stmt_iterator * gsi) > > Need to document the parameters here too. Should be no space after > pointer "*". (Yeah, the code's not very consistent about this.) > > > +{ > > + switch (code) > > + { > > + case DOT_PROD_EXPR: > > + { > > + tree zero = build_int_cst (TREE_TYPE (vectype_in), 0); > > + zero = build_vector_from_val (vectype_in, zero); > > Should reduce to: > > tree zero = build_zero_cst (vectype_in); > > > + tree new_op1 = make_temp_ssa_name (vectype_in, NULL, > "new_op1"); > > Maybe "masked_" rather than "new_"? This name will persist in debugging > dumps for later passes. > > Thanks, > Richard
dot_v2.patch
Description: dot_v2.patch