Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes: > Thanks for your comments Richard. I think this patch addresses them.
Yeah, this is OK to install, thanks. Richard > > Alejandro > >> -----Original Message----- >> From: Richard Sandiford <richard.sandif...@arm.com> >> Sent: 07 May 2019 15:46 >> To: Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> >> Cc: James Greenhalgh <james.greenha...@arm.com>; GCC Patches <gcc- >> patc...@gcc.gnu.org>; nd <n...@arm.com>; Richard Biener >> <richard.guent...@gmail.com> >> Subject: Re: [Aarch64][SVE] Vectorise sum-of-absolute-differences >> >> Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes: >> > +;; Helper expander for aarch64_<su>abd<mode>_3 to save the callers ;; >> > +the hassle of constructing the other arm of the MINUS. >> > +(define_expand "<su>abd<mode>_3" >> > + [(use (match_operand:SVE_I 0 "register_operand")) >> > + (USMAX:SVE_I (match_operand:SVE_I 1 "register_operand") >> > + (match_operand:SVE_I 2 "register_operand"))] >> > + "TARGET_SVE" >> > + { >> > + rtx other_arm >> > + = simplify_gen_binary (<MAX_OPP>, <MODE>mode, operands[1], >> > +operands[2]); >> >> I realise this is just copied from the Advanced SIMD version, but >> simplify_gen_binary is a bit dangerous here, since we explicitly want an >> unsimplified <MAX_OPP> with the two operands given. Probably better as: >> >> gen_rtx_<MAX_OPP> (<MODE>mode, ...) >> >> > + emit_insn (gen_aarch64_<su>abd<mode>_3 (operands[0], operands[1], >> > + operands[2], other_arm)); >> > + DONE; >> > + } >> > +) >> > + >> > +;; Unpredicated integer absolute difference. >> > +(define_expand "aarch64_<su>abd<mode>_3" >> > + [(set (match_operand:SVE_I 0 "register_operand") >> > + (unspec:SVE_I >> > + [(match_dup 4) >> > + (minus:SVE_I >> > + (USMAX:SVE_I >> > + (match_operand:SVE_I 1 "register_operand" "w") >> > + (match_operand:SVE_I 2 "register_operand" "w")) >> > + (match_operator 3 "aarch64_<max_opp>" >> > + [(match_dup 1) >> > + (match_dup 2)]))] >> > + UNSPEC_MERGE_PTRUE))] >> > + "TARGET_SVE" >> > + { >> > + operands[4] = force_reg (<VPRED>mode, CONSTM1_RTX >> (<VPRED>mode)); >> > + } >> > +) >> >> I think we should go directly from <su>abd<mode>_3 to the final pattern, so >> that <su>abd<mode>_3 does the force_reg too. This would make... >> >> > +;; Predicated integer absolute difference. >> > +(define_insn "*aarch64_<su>abd<mode>_3" >> >> ...this the named pattern, instead of starting with "*". >> >> > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") >> > + (unspec:SVE_I >> > + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") >> > + (minus:SVE_I >> > + (USMAX:SVE_I >> > + (match_operand:SVE_I 2 "register_operand" "w, w") >> >> Should be "0, w", so that the first alternative ties the input to the output. >> >> > + (match_operand:SVE_I 3 "register_operand" "w, w")) >> > + (match_operator 4 "aarch64_<max_opp>" >> > + [(match_dup 2) >> > + (match_dup 3)]))] >> > + UNSPEC_MERGE_PTRUE))] >> > + "TARGET_SVE" >> > + "@ >> > + <su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >> > + >> movprfx\t%0, %2\;<su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype >> >" >> > + [(set_attr "movprfx" "*,yes")] >> > +) >> > + >> > +;; Emit a sequence to produce a sum-of-absolute-differences of the >> > +inputs in ;; operands 1 and 2. The sequence also has to perform a >> > +widening reduction of ;; the difference into a vector and accumulate >> > +that into operand 3 before ;; copying that into the result operand 0. >> > +;; Perform that with a sequence of: >> > +;; MOV ones.b, #1 >> > +;; UABD diff.b, p0/m, op1.b, op2.b >> > +;; UDOT op3.s, diff.b, ones.b >> > +;; MOV op0, op3 // should be eliminated in later passes. >> > +;; The signed version just uses the signed variants of the above >> instructions. >> >> Think it would be clearer if we removed the last line and just used [SU]ABD >> instead of UABD, since that's the only sign-dependent part of the operation. >> Also think we should SVEize it with MOVPRFX, since a separate MOV should >> never be needed: >> >> ;; MOV ones.b, #1 >> ;; [SU]ABD diff.b, ptrue/m, op1.b, op2.b >> ;; MOVPRFX op0, op3 // If necessary >> ;; UDOT op0.s, diff.b, ones.b >> >> > +(define_expand "<sur>sad<vsi2qi>" >> > + [(use (match_operand:SVE_SDI 0 "register_operand")) >> > + (unspec:<VSI2QI> [(use (match_operand:<VSI2QI> 1 "register_operand")) >> > + (use (match_operand:<VSI2QI> 2 "register_operand"))] >> ABAL) >> > + (use (match_operand:SVE_SDI 3 "register_operand"))] >> > + "TARGET_SVE" >> > + { >> > + rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode)); >> > + rtx diff = gen_reg_rtx (<VSI2QI>mode); >> > + emit_insn (gen_<sur>abd<vsi2qi>_3 (diff, operands[1], operands[2])); >> > + emit_insn (gen_udot_prod<vsi2qi> (operands[3], diff, ones, >> operands[3])); >> > + emit_move_insn (operands[0], operands[3]); >> >> It would be better to make operands[0] the destination of the UDOT and >> drop the move. >> >> Thanks, >> Richard >> >> > + DONE; >> > + } >> > +) >> > diff --git a/gcc/config/aarch64/iterators.md >> > b/gcc/config/aarch64/iterators.md index b3b2d6e..20aa0e9 100644 >> > --- a/gcc/config/aarch64/iterators.md >> > +++ b/gcc/config/aarch64/iterators.md >> > @@ -1060,6 +1060,9 @@ >> > ;; Map smax to smin and umax to umin. >> > (define_code_attr max_opp [(smax "smin") (umax "umin")]) >> > >> > +;; Same as above, but louder. >> > +(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")]) >> >> :-) > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index 02d33b7..e94801d 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -3148,3 +3148,64 @@ > movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, > %2.<Vetype_fourth>" > [(set_attr "movprfx" "*,yes")] > ) > + > +;; Helper expander for aarch64_<su>abd<mode>_3 to save the callers > +;; the hassle of constructing the other arm of the MINUS. > +(define_expand "<su>abd<mode>_3" > + [(use (match_operand:SVE_I 0 "register_operand")) > + (USMAX:SVE_I (match_operand:SVE_I 1 "register_operand") > + (match_operand:SVE_I 2 "register_operand"))] > + "TARGET_SVE" > + { > + rtx pred = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode)); > + rtx other_arm = gen_rtx_<MAX_OPP> (<MODE>mode, operands[1], operands[2]); > + emit_insn (gen_aarch64_<su>abd<mode>_3 (operands[0], pred, operands[1], > + operands[2], other_arm)); > + DONE; > + } > +) > + > +;; Predicated integer absolute difference. > +(define_insn "aarch64_<su>abd<mode>_3" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (unspec:SVE_I > + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") > + (minus:SVE_I > + (USMAX:SVE_I > + (match_operand:SVE_I 2 "register_operand" "0, w") > + (match_operand:SVE_I 3 "register_operand" "w, w")) > + (match_operator 4 "aarch64_<max_opp>" > + [(match_dup 2) > + (match_dup 3)]))] > + UNSPEC_MERGE_PTRUE))] > + "TARGET_SVE" > + "@ > + <su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> > + movprfx\t%0, %2\;<su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" > + [(set_attr "movprfx" "*,yes")] > +) > + > +;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in > +;; operands 1 and 2. The sequence also has to perform a widening reduction > of > +;; the difference into a vector and accumulate that into operand 3 before > +;; copying that into the result operand 0. > +;; Perform that with a sequence of: > +;; MOV ones.b, #1 > +;; [SU]ABD diff.b, p0/m, op1.b, op2.b > +;; MOVPRFX op0, op3 // If necessary > +;; UDOT op0.s, diff.b, ones.b > + > +(define_expand "<sur>sad<vsi2qi>" > + [(use (match_operand:SVE_SDI 0 "register_operand")) > + (unspec:<VSI2QI> [(use (match_operand:<VSI2QI> 1 "register_operand")) > + (use (match_operand:<VSI2QI> 2 "register_operand"))] ABAL) > + (use (match_operand:SVE_SDI 3 "register_operand"))] > + "TARGET_SVE" > + { > + rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode)); > + rtx diff = gen_reg_rtx (<VSI2QI>mode); > + emit_insn (gen_<sur>abd<vsi2qi>_3 (diff, operands[1], operands[2])); > + emit_insn (gen_udot_prod<vsi2qi> (operands[0], diff, ones, operands[3])); > + DONE; > + } > +) > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index b3b2d6e..20aa0e9 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -1060,6 +1060,9 @@ > ;; Map smax to smin and umax to umin. > (define_code_attr max_opp [(smax "smin") (umax "umin")]) > > +;; Same as above, but louder. > +(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")]) > + > ;; The number of subvectors in an SVE_STRUCT. > (define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2") > (VNx8SI "2") (VNx4DI "2") > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c > new file mode 100644 > index 0000000..e7bf64a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c > @@ -0,0 +1,28 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize" } */ > + > +#include <stdint.h> > + > +#define DEF_SAD(TYPE1, TYPE2) > \ > +TYPE1 __attribute__ ((noinline, noclone)) \ > +sum_abs_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) > \ > +{ \ > + TYPE1 sum = 0; \ > + for (int i = 0; i < n; i++) > \ > + { > \ > + sum += __builtin_abs (x[i] - y[i]); \ > + } > \ > + return sum; > \ > +} > + > +DEF_SAD(int32_t, uint8_t) > +DEF_SAD(int32_t, int8_t) > +DEF_SAD(int64_t, uint16_t) > +DEF_SAD(int64_t, int16_t) > + > +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m, > z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m, > z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, > z[0-9]+\.b\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ > +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, > z[0-9]+\.h\n} 2 } } */ > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index 493c1ab..057a874 100644 > --- a/gcc/tree-vect-loop.c > +++ b/gcc/tree-vect-loop.c > @@ -5973,6 +5973,7 @@ use_mask_by_cond_expr_p (enum tree_code code, > internal_fn cond_fn, > switch (code) > { > case DOT_PROD_EXPR: > + case SAD_EXPR: > return true; > > default: > @@ -6002,6 +6003,17 @@ build_vect_cond_expr (enum tree_code code, tree > vop[3], tree mask, > break; > } > > + case SAD_EXPR: > + { > + tree vectype = TREE_TYPE (vop[1]); > + tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); > + gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, > + mask, vop[1], vop[0]); > + gsi_insert_before (gsi, select, GSI_SAME_STMT); > + vop[1] = masked_op1; > + break; > + } > + > default: > gcc_unreachable (); > }