Tamar Christina <tamar.christ...@arm.com> writes: > Hi All, > > In GCC 11 we implemented the vectorizer optab for widening left shifts, > however this optab is only supported for uniform shift constants. > > At the moment GCC still has two loop vectorization strategy (classical loop > and > SLP based loop vec) and the optab is implemented as a scalar pattern. > > This means that when we apply it to a non-uniform constant inside a loop we > only > find out during SLP build that the constants aren't uniform. At this point > it's > too late and we lose SLP entirely. > > Over the years I've tried various options but none of it works well: > > 1. Dissolving patterns during SLP built (problematic, also dissolves them for > non-slp). > 2. Optionally ignoring patterns for SLP build (problematic, ends up > interfearing > with relevancy detection). > 3. Relaxing contraint on SLP build to allow non-constant values and dissolving > them after SLP build using an SLP pattern. (problematic, ends up breaking > shift reassociation). > > As a result we've concluded that for now this pattern should just be removed > and formed during RTL. > > The plan is to move this to an SLP only pattern once we remove classical loop > vectorization support from GCC, at which time we can also properly support > SVE's > Top and Bottom variants. > > This removes the optab and reworks the RTL to recognize both the vector > variant > and the intrinsics variant. Also just simplifies all these patterns. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > PR target/106346 > * config/aarch64/aarch64-simd.md (vec_widen_<sur>shiftl_lo_<mode>, > vec_widen_<sur>shiftl_hi_<mode>): Remove. > (aarch64_<sur>shll<mode>_internal): Renamed to... > (aarch64_<su>shll<mode>): .. This. > (aarch64_<sur>shll2<mode>_internal): Renamed to... > (aarch64_<su>shll2<mode>): .. This. > (aarch64_<sur>shll_n<mode>, aarch64_<sur>shll2_n<mode>): Re-use new > optabs. > * config/aarch64/constraints.md (D2, D3): New. > * config/aarch64/predicates.md (aarch64_simd_shift_imm_vec): New. > > gcc/testsuite/ChangeLog: > > PR target/106346 > * gcc.target/aarch64/pr98772.c: Adjust assembly. > * gcc.target/aarch64/vect-widen-shift.c: New test. > > --- inline copy of patch -- > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > d95394101470446e55f25a2397dd112239b6a54d..afd5b8632afbcddf8dad14495c3446c560eb085d > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -6387,105 +6387,66 @@ (define_insn > "aarch64_<sur>q<r>shl<mode><vczle><vczbe>" > [(set_attr "type" "neon_sat_shift_reg<q>")] > ) > > -(define_expand "vec_widen_<sur>shiftl_lo_<mode>" > - [(set (match_operand:<VWIDE> 0 "register_operand" "=w") > - (unspec:<VWIDE> [(match_operand:VQW 1 "register_operand" "w") > - (match_operand:SI 2 > - "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")] > - VSHLL))] > - "TARGET_SIMD" > - { > - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); > - emit_insn (gen_aarch64_<sur>shll<mode>_internal (operands[0], > operands[1], > - p, operands[2])); > - DONE; > - } > -) > - > -(define_expand "vec_widen_<sur>shiftl_hi_<mode>" > - [(set (match_operand:<VWIDE> 0 "register_operand") > - (unspec:<VWIDE> [(match_operand:VQW 1 "register_operand" "w") > - (match_operand:SI 2 > - "immediate_operand" "i")] > - VSHLL))] > - "TARGET_SIMD" > - { > - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); > - emit_insn (gen_aarch64_<sur>shll2<mode>_internal (operands[0], > operands[1], > - p, operands[2])); > - DONE; > - } > -) > - > ;; vshll_n > > -(define_insn "aarch64_<sur>shll<mode>_internal" > - [(set (match_operand:<VWIDE> 0 "register_operand" "=w") > - (unspec:<VWIDE> [(vec_select:<VHALF> > - (match_operand:VQW 1 "register_operand" "w") > - (match_operand:VQW 2 "vect_par_cnst_lo_half" "")) > - (match_operand:SI 3 > - "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")] > - VSHLL))] > +(define_insn "aarch64_<su>shll<mode>" > + [(set (match_operand:<VWIDE> 0 "register_operand") > + (ashift:<VWIDE> (ANY_EXTEND:<VWIDE> > + (match_operand:VD_BHSI 1 "register_operand")) > + (match_operand:<VWIDE> 2 > + "aarch64_simd_shift_imm_vec")))]
The name of this predicate seems more general than its meaning. How about naming it aarch64_simd_shift_imm_vec_half_bitsize, to follow: ;; Predicates used by the various SIMD shift operations. These ;; fall in to 3 categories. ;; Shifts with a range 0-(bit_size - 1) (aarch64_simd_shift_imm) ;; Shifts with a range 1-bit_size (aarch64_simd_shift_imm_offset) ;; Shifts with a range 0-bit_size (aarch64_simd_shift_imm_bitsize) Or aarch64_simd_shll_imm_vec, for something shorter. > "TARGET_SIMD" > - { > - if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<MODE>mode)) > - return "shll\\t%0.<Vwtype>, %1.<Vhalftype>, %3"; > - else > - return "<sur>shll\\t%0.<Vwtype>, %1.<Vhalftype>, %3"; > + {@ [cons: =0, 1, 2] > + [w, w, D2] shll\t%0.<Vwtype>, %1.<Vtype>, %I2 > + [w, w, D3] <su>shll\t%0.<Vwtype>, %1.<Vtype>, %I2 > } > [(set_attr "type" "neon_shift_imm_long")] > ) > > -(define_insn "aarch64_<sur>shll2<mode>_internal" > - [(set (match_operand:<VWIDE> 0 "register_operand" "=w") > - (unspec:<VWIDE> [(vec_select:<VHALF> > - (match_operand:VQW 1 "register_operand" "w") > - (match_operand:VQW 2 "vect_par_cnst_hi_half" "")) > - (match_operand:SI 3 > - "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")] > +(define_expand "aarch64_<sur>shll_n<mode>" > + [(set (match_operand:<VWIDE> 0 "register_operand") > + (unspec:<VWIDE> [(match_operand:VD_BHSI 1 "register_operand") > + (match_operand:SI 2 > + "aarch64_simd_shift_imm_bitsize_<ve_mode>")] > VSHLL))] > "TARGET_SIMD" > { > - if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<MODE>mode)) > - return "shll2\\t%0.<Vwtype>, %1.<Vtype>, %3"; > - else > - return "<sur>shll2\\t%0.<Vwtype>, %1.<Vtype>, %3"; > + rtx shft = gen_const_vec_duplicate (<VWIDE>mode, operands[2]); > + emit_insn (gen_aarch64_<sur>shll<mode> (operands[0], operands[1], shft)); > + DONE; > } > - [(set_attr "type" "neon_shift_imm_long")] > ) > > -(define_insn "aarch64_<sur>shll_n<mode>" > - [(set (match_operand:<VWIDE> 0 "register_operand" "=w") > - (unspec:<VWIDE> [(match_operand:VD_BHSI 1 "register_operand" "w") > - (match_operand:SI 2 > - "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")] > - VSHLL))] > +;; vshll_high_n > + > +(define_insn "aarch64_<su>shll2<mode>" > + [(set (match_operand:<VWIDE> 0 "register_operand") > + (ashift:<VWIDE> (ANY_EXTEND:<VWIDE> > + (vec_select:<VHALF> > + (match_operand:VQW 1 "register_operand") > + (match_operand:VQW 2 "vect_par_cnst_hi_half"))) > + (match_operand:<VWIDE> 3 > + "aarch64_simd_shift_imm_vec")))] > "TARGET_SIMD" > - { > - if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<MODE>mode)) > - return "shll\\t%0.<Vwtype>, %1.<Vtype>, %2"; > - else > - return "<sur>shll\\t%0.<Vwtype>, %1.<Vtype>, %2"; > + {@ [cons: =0, 1, 2, 3] > + [w, w, , D2] shll2\t%0.<Vwtype>, %1.<Vtype>, %I3 > + [w, w, , D3] <su>shll2\t%0.<Vwtype>, %1.<Vtype>, %I3 > } > [(set_attr "type" "neon_shift_imm_long")] > ) > > -;; vshll_high_n > - > -(define_insn "aarch64_<sur>shll2_n<mode>" > +(define_expand "aarch64_<sur>shll2_n<mode>" > [(set (match_operand:<VWIDE> 0 "register_operand" "=w") > (unspec:<VWIDE> [(match_operand:VQW 1 "register_operand" "w") > (match_operand:SI 2 "immediate_operand" "i")] Pre-existing, but I think the predicate on operand 2 should be aarch64_simd_shift_imm_bitsize_<ve_mode>, as above. As it stands we accept: int64x2_t foo(int32x4_t x) { return vshll_high_n_s32(x, 33); } and generate the invalid: sshll2 v0.2d, v0.4s, 33 I think after the patch we'd ICE instead. The pattern shouldn't have any constraints, now that it's an expander. > - VSHLL))] > + VSHLL))] > "TARGET_SIMD" > { > - if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<MODE>mode)) > - return "shll2\\t%0.<Vwtype>, %1.<Vtype>, %2"; > - else > - return "<sur>shll2\\t%0.<Vwtype>, %1.<Vtype>, %2"; > + rtx shft = gen_const_vec_duplicate (<VWIDE>mode, operands[2]); > + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); > + emit_insn (gen_aarch64_<sur>shll2<mode> (operands[0], operands[1], p, > shft)); > + DONE; > } > - [(set_attr "type" "neon_shift_imm_long")] > ) > > ;; vrshr_n > diff --git a/gcc/config/aarch64/constraints.md > b/gcc/config/aarch64/constraints.md > index > 6df1dbec2a8097abe9783ed1670c77a8fad4ca57..07613013622deb8797255cbf10c265080066565b > 100644 > --- a/gcc/config/aarch64/constraints.md > +++ b/gcc/config/aarch64/constraints.md > @@ -468,6 +468,22 @@ (define_constraint "D1" > GET_MODE_UNIT_BITSIZE (mode) - 1, > GET_MODE_UNIT_BITSIZE (mode) - 1)"))) > > +(define_constraint "D2" > + "@internal > + A constraint that matches vector of immediates that is bits(mode)/2." > + (and (match_code "const,const_vector") > + (match_test "aarch64_const_vec_all_same_in_range_p (op, > + GET_MODE_UNIT_BITSIZE (mode) / 2, > + GET_MODE_UNIT_BITSIZE (mode) / 2)"))) Looks like this can reuse aarch64_simd_shift_imm_vec_exact_top. > + > +(define_constraint "D3" > + "@internal > + A constraint that matches vector of immediates that is with 0 to > + (bits(mode)/2)-1." > + (and (match_code "const,const_vector") > + (match_test "aarch64_const_vec_all_same_in_range_p (op, 0, > + (GET_MODE_UNIT_BITSIZE (mode) / 2) - 1)"))) Having this mapping for D2 and D3, with D2 corresponded to prec/2, kind-of makes D3 a false mnemonic. How about DL instead? (L for "left-shift long" or "low-part", take your pick) Looks good otherwise. Thanks, Richard > + > (define_constraint "Dr" > "@internal > A constraint that matches vector of immediates for right shifts." > diff --git a/gcc/config/aarch64/predicates.md > b/gcc/config/aarch64/predicates.md > index > d5a4a1cd9bf8cde8e779de6e0afa531f04892a7b..2b50e39aa8651b415443b5fd90f187c83d5591d5 > 100644 > --- a/gcc/config/aarch64/predicates.md > +++ b/gcc/config/aarch64/predicates.md > @@ -638,6 +638,11 @@ (define_predicate "aarch64_simd_raddsubhn_imm_vec" > HOST_WIDE_INT_1U > << (GET_MODE_UNIT_BITSIZE (mode) / 2 - 1))"))) > > +(define_predicate "aarch64_simd_shift_imm_vec" > + (and (match_code "const_vector") > + (match_test "aarch64_const_vec_all_same_in_range_p (op, 0, > + GET_MODE_UNIT_BITSIZE (mode) / 2)"))) > + > (define_predicate "aarch64_simd_shift_imm_bitsize_qi" > (and (match_code "const_int") > (match_test "IN_RANGE (INTVAL (op), 0, 8)"))) > diff --git a/gcc/testsuite/gcc.target/aarch64/pr98772.c > b/gcc/testsuite/gcc.target/aarch64/pr98772.c > index > 8259251a7c0b64ae8362ea29ec3cf1d2a9d63547..52ad012dcfe72721b8c987bb826c0ffb8ba3f31e > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/pr98772.c > +++ b/gcc/testsuite/gcc.target/aarch64/pr98772.c > @@ -155,4 +155,4 @@ int main () > /* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */ > /* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */ > /* { dg-final { scan-assembler-times "umull\\tv" 2 } } */ > -/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */ > +/* { dg-final { scan-assembler-times "shll\\tv" 2 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-widen-shift.c > b/gcc/testsuite/gcc.target/aarch64/vect-widen-shift.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..6ee41f63ef8a145c0eb7f213950e7501e058b2fa > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vect-widen-shift.c > @@ -0,0 +1,50 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" "" } } */ > +#include <stdint.h> > +#include <string.h> > + > +#pragma GCC target "+nosve" > + > +#define ARR_SIZE 1024 > + > +/* Should produce an shll,shll2 pair*/ > +/* > +** sshll_opt1: > +** ... > +** shll v[0-9]+.4s, v[0-9]+.4h, 16 > +** shll2 v[0-9]+.4s, v[0-9]+.8h, 16 > +** ... > +*/ > +void sshll_opt1 (int32_t *foo, int16_t *a, int16_t *b) > +{ > + for( int i = 0; i < ARR_SIZE - 3;i=i+4) > + { > + foo[i] = a[i] << 16; > + foo[i+1] = a[i+1] << 16; > + foo[i+2] = a[i+2] << 16; > + foo[i+3] = a[i+3] << 16; > + } > +} > + > +/* > +** sshll_opt2: > +** ... > +** sxtl v[0-9]+.4s, v[0-9]+.4h > +** sxtl2 v[0-9]+.4s, v[0-9]+.8h > +** sshl v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** sshl v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** ... > +*/ > +void sshll_opt2 (int32_t *foo, int16_t *a, int16_t *b) > +{ > + for( int i = 0; i < ARR_SIZE - 3;i=i+4) > + { > + foo[i] = a[i] << 16; > + foo[i+1] = a[i+1] << 15; > + foo[i+2] = a[i+2] << 14; > + foo[i+3] = a[i+3] << 17; > + } > +} > + > +