Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> In GCC 11 we implemented the vectorizer optab for widening left shifts,
> however this optab is only supported for uniform shift constants.
>
> At the moment GCC still has two loop vectorization strategies (classical loop
> and SLP-based loop vec) and the optab is implemented as a scalar pattern.
>
> This means that when we apply it to a non-uniform constant inside a loop we
> only find out during SLP build that the constants aren't uniform.  At this
> point it's too late and we lose SLP entirely.
>
> Over the years I've tried various options but none of them works well:
>
> 1. Dissolving patterns during SLP build (problematic, also dissolves them for
> non-SLP).
> 2. Optionally ignoring patterns for SLP build (problematic, ends up
> interfering with relevancy detection).
> 3. Relaxing the constraint on SLP build to allow non-constant values and
> dissolving them after SLP build using an SLP pattern (problematic, ends up
> breaking shift reassociation).
>
> As a result we've concluded that for now this pattern should just be removed
> and formed during RTL.
>
> The plan is to move this to an SLP only pattern once we remove classical loop
> vectorization support from GCC, at which time we can also properly support
> SVE's Top and Bottom variants.
>
> This removes the optab and reworks the RTL to recognize both the vector
> variant and the intrinsics variant.  It also simplifies all these patterns.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>       PR target/106346
>       * config/aarch64/aarch64-simd.md (vec_widen_<sur>shiftl_lo_<mode>,
>       vec_widen_<sur>shiftl_hi_<mode>): Remove.
>       (aarch64_<sur>shll<mode>_internal): Renamed to...
>       (aarch64_<su>shll<mode>): .. This.
>       (aarch64_<sur>shll2<mode>_internal): Renamed to...
>       (aarch64_<su>shll2<mode>): .. This.
>       (aarch64_<sur>shll_n<mode>, aarch64_<sur>shll2_n<mode>): Re-use new
>       optabs.
>       * config/aarch64/constraints.md (D2, D3): New.
>       * config/aarch64/predicates.md (aarch64_simd_shift_imm_vec): New.
>
> gcc/testsuite/ChangeLog:
>
>       PR target/106346
>       * gcc.target/aarch64/pr98772.c: Adjust assembly.
>       * gcc.target/aarch64/vect-widen-shift.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index d95394101470446e55f25a2397dd112239b6a54d..afd5b8632afbcddf8dad14495c3446c560eb085d 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -6387,105 +6387,66 @@ (define_insn "aarch64_<sur>q<r>shl<mode><vczle><vczbe>"
>    [(set_attr "type" "neon_sat_shift_reg<q>")]
>  )
>  
> -(define_expand "vec_widen_<sur>shiftl_lo_<mode>"
> -  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
> -     (unspec:<VWIDE> [(match_operand:VQW 1 "register_operand" "w")
> -                      (match_operand:SI 2
> -                        "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")]
> -                      VSHLL))]
> -  "TARGET_SIMD"
> -  {
> -    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
> -    emit_insn (gen_aarch64_<sur>shll<mode>_internal (operands[0], operands[1],
> -                                                  p, operands[2]));
> -    DONE;
> -  }
> -)
> -
> -(define_expand "vec_widen_<sur>shiftl_hi_<mode>"
> -   [(set (match_operand:<VWIDE> 0 "register_operand")
> -     (unspec:<VWIDE> [(match_operand:VQW 1 "register_operand" "w")
> -                      (match_operand:SI 2
> -                        "immediate_operand" "i")]
> -                       VSHLL))]
> -   "TARGET_SIMD"
> -   {
> -    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
> -    emit_insn (gen_aarch64_<sur>shll2<mode>_internal (operands[0], operands[1],
> -                                                   p, operands[2]));
> -    DONE;
> -   }
> -)
> -
>  ;; vshll_n
>  
> -(define_insn "aarch64_<sur>shll<mode>_internal"
> -  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
> -     (unspec:<VWIDE> [(vec_select:<VHALF>
> -                         (match_operand:VQW 1 "register_operand" "w")
> -                         (match_operand:VQW 2 "vect_par_cnst_lo_half" ""))
> -                      (match_operand:SI 3
> -                        "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")]
> -                      VSHLL))]
> +(define_insn "aarch64_<su>shll<mode>"
> +  [(set (match_operand:<VWIDE> 0 "register_operand")
> +     (ashift:<VWIDE> (ANY_EXTEND:<VWIDE>
> +                         (match_operand:VD_BHSI 1 "register_operand"))
> +                      (match_operand:<VWIDE> 2
> +                        "aarch64_simd_shift_imm_vec")))]

The name of this predicate seems more general than its meaning.
How about naming it aarch64_simd_shift_imm_vec_half_bitsize, to follow:

;; Predicates used by the various SIMD shift operations.  These
;; fall in to 3 categories.
;;   Shifts with a range 0-(bit_size - 1) (aarch64_simd_shift_imm)
;;   Shifts with a range 1-bit_size (aarch64_simd_shift_imm_offset)
;;   Shifts with a range 0-bit_size (aarch64_simd_shift_imm_bitsize)

Or aarch64_simd_shll_imm_vec, for something shorter.
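
Either way it would just be the patch's definition carried over under the
new name, e.g. (sketch only):

;; Same body as the patch's aarch64_simd_shift_imm_vec, just renamed to
;; make the half-bitsize upper bound explicit.
(define_predicate "aarch64_simd_shll_imm_vec"
  (and (match_code "const_vector")
       (match_test "aarch64_const_vec_all_same_in_range_p (op, 0,
			GET_MODE_UNIT_BITSIZE (mode) / 2)")))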

>    "TARGET_SIMD"
> -  {
> -    if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<MODE>mode))
> -      return "shll\\t%0.<Vwtype>, %1.<Vhalftype>, %3";
> -    else
> -      return "<sur>shll\\t%0.<Vwtype>, %1.<Vhalftype>, %3";
> +  {@ [cons: =0, 1, 2]
> +     [w, w, D2] shll\t%0.<Vwtype>, %1.<Vtype>, %I2
> +     [w, w, D3] <su>shll\t%0.<Vwtype>, %1.<Vtype>, %I2
>    }
>    [(set_attr "type" "neon_shift_imm_long")]
>  )
>  
> -(define_insn "aarch64_<sur>shll2<mode>_internal"
> -  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
> -     (unspec:<VWIDE> [(vec_select:<VHALF>
> -                         (match_operand:VQW 1 "register_operand" "w")
> -                         (match_operand:VQW 2 "vect_par_cnst_hi_half" ""))
> -                      (match_operand:SI 3
> -                        "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")]
> +(define_expand "aarch64_<sur>shll_n<mode>"
> +  [(set (match_operand:<VWIDE> 0 "register_operand")
> +     (unspec:<VWIDE> [(match_operand:VD_BHSI 1 "register_operand")
> +                      (match_operand:SI 2
> +                        "aarch64_simd_shift_imm_bitsize_<ve_mode>")]
>                        VSHLL))]
>    "TARGET_SIMD"
>    {
> -    if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<MODE>mode))
> -      return "shll2\\t%0.<Vwtype>, %1.<Vtype>, %3";
> -    else
> -      return "<sur>shll2\\t%0.<Vwtype>, %1.<Vtype>, %3";
> +    rtx shft = gen_const_vec_duplicate (<VWIDE>mode, operands[2]);
> +    emit_insn (gen_aarch64_<sur>shll<mode> (operands[0], operands[1], shft));
> +    DONE;
>    }
> -  [(set_attr "type" "neon_shift_imm_long")]
>  )
>  
> -(define_insn "aarch64_<sur>shll_n<mode>"
> -  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
> -     (unspec:<VWIDE> [(match_operand:VD_BHSI 1 "register_operand" "w")
> -                      (match_operand:SI 2
> -                        "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")]
> -                         VSHLL))]
> +;; vshll_high_n
> +
> +(define_insn "aarch64_<su>shll2<mode>"
> +  [(set (match_operand:<VWIDE> 0 "register_operand")
> +     (ashift:<VWIDE> (ANY_EXTEND:<VWIDE>
> +                       (vec_select:<VHALF>
> +                         (match_operand:VQW 1 "register_operand")
> +                         (match_operand:VQW 2 "vect_par_cnst_hi_half")))
> +                      (match_operand:<VWIDE> 3
> +                        "aarch64_simd_shift_imm_vec")))]
>    "TARGET_SIMD"
> -  {
> -    if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<MODE>mode))
> -      return "shll\\t%0.<Vwtype>, %1.<Vtype>, %2";
> -    else
> -      return "<sur>shll\\t%0.<Vwtype>, %1.<Vtype>, %2";
> +  {@ [cons: =0, 1, 2, 3]
> +     [w, w, , D2] shll2\t%0.<Vwtype>, %1.<Vtype>, %I3
> +     [w, w, , D3] <su>shll2\t%0.<Vwtype>, %1.<Vtype>, %I3
>    }
>    [(set_attr "type" "neon_shift_imm_long")]
>  )
>  
> -;; vshll_high_n
> -
> -(define_insn "aarch64_<sur>shll2_n<mode>"
> +(define_expand "aarch64_<sur>shll2_n<mode>"
>    [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
>       (unspec:<VWIDE> [(match_operand:VQW 1 "register_operand" "w")
>                        (match_operand:SI 2 "immediate_operand" "i")]

Pre-existing, but I think the predicate on operand 2 should be
aarch64_simd_shift_imm_bitsize_<ve_mode>, as above.  As it stands
we accept:

int64x2_t foo(int32x4_t x) { return vshll_high_n_s32(x, 33); }

and generate the invalid:

        sshll2  v0.2d, v0.4s, 33

I think after the patch we'd ICE instead.

The pattern shouldn't have any constraints, now that it's an expander.
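I.e. something like this (untested, combining both of the above):

(define_expand "aarch64_<sur>shll2_n<mode>"
  [(set (match_operand:<VWIDE> 0 "register_operand")
	(unspec:<VWIDE> [(match_operand:VQW 1 "register_operand")
			 ;; Tighter predicate rejects out-of-range shift
			 ;; amounts such as the 33 in the example above.
			 (match_operand:SI 2
			   "aarch64_simd_shift_imm_bitsize_<ve_mode>")]
			VSHLL))]
  "TARGET_SIMD"
  {
    rtx shft = gen_const_vec_duplicate (<VWIDE>mode, operands[2]);
    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
    emit_insn (gen_aarch64_<sur>shll2<mode> (operands[0], operands[1], p, shft));
    DONE;
  }
)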

> -                         VSHLL))]
> +                      VSHLL))]
>    "TARGET_SIMD"
>    {
> -    if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<MODE>mode))
> -      return "shll2\\t%0.<Vwtype>, %1.<Vtype>, %2";
> -    else
> -      return "<sur>shll2\\t%0.<Vwtype>, %1.<Vtype>, %2";
> +    rtx shft = gen_const_vec_duplicate (<VWIDE>mode, operands[2]);
> +    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
> +    emit_insn (gen_aarch64_<sur>shll2<mode> (operands[0], operands[1], p, shft));
> +    DONE;
>    }
> -  [(set_attr "type" "neon_shift_imm_long")]
>  )
>  
>  ;; vrshr_n
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 6df1dbec2a8097abe9783ed1670c77a8fad4ca57..07613013622deb8797255cbf10c265080066565b 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -468,6 +468,22 @@ (define_constraint "D1"
>                       GET_MODE_UNIT_BITSIZE (mode) - 1,
>                       GET_MODE_UNIT_BITSIZE (mode) - 1)")))
>  
> +(define_constraint "D2"
> +  "@internal
> + A constraint that matches vector of immediates that is bits(mode)/2."
> + (and (match_code "const,const_vector")
> +      (match_test "aarch64_const_vec_all_same_in_range_p (op,
> +                     GET_MODE_UNIT_BITSIZE (mode) / 2,
> +                     GET_MODE_UNIT_BITSIZE (mode) / 2)")))

Looks like this can reuse aarch64_simd_shift_imm_vec_exact_top.
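E.g. (untested sketch, assuming the predicate's generated function can be
called from the constraint's match_test):

(define_constraint "D2"
  "@internal
 A constraint that matches a vector of immediates equal to bits(mode)/2."
 (and (match_code "const,const_vector")
      ;; Reuse the existing predicate instead of open-coding the range test.
      (match_test "aarch64_simd_shift_imm_vec_exact_top (op, mode)")))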

> +
> +(define_constraint "D3"
> +  "@internal
> + A constraint that matches vector of immediates that is with 0 to
> + (bits(mode)/2)-1."
> + (and (match_code "const,const_vector")
> +      (match_test "aarch64_const_vec_all_same_in_range_p (op, 0,
> +                     (GET_MODE_UNIT_BITSIZE (mode) / 2) - 1)")))

Having this mapping for D2 and D3, with D2 corresponding to prec/2,
kind of makes D3 a false mnemonic.  How about DL instead?  (L for
"left-shift long" or "low-part", take your pick.)

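With that naming, the alternatives above would then read, e.g.:

     [w, w, D2] shll\t%0.<Vwtype>, %1.<Vtype>, %I2
     [w, w, DL] <su>shll\t%0.<Vwtype>, %1.<Vtype>, %I2

and similarly for the shll2 pattern.
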
Looks good otherwise.

Thanks,
Richard

> +
>  (define_constraint "Dr"
>    "@internal
>   A constraint that matches vector of immediates for right shifts."
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index d5a4a1cd9bf8cde8e779de6e0afa531f04892a7b..2b50e39aa8651b415443b5fd90f187c83d5591d5 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -638,6 +638,11 @@ (define_predicate "aarch64_simd_raddsubhn_imm_vec"
>                               HOST_WIDE_INT_1U
>                               << (GET_MODE_UNIT_BITSIZE  (mode) / 2 - 1))")))
>  
> +(define_predicate "aarch64_simd_shift_imm_vec"
> +  (and (match_code "const_vector")
> +       (match_test "aarch64_const_vec_all_same_in_range_p (op, 0,
> +                             GET_MODE_UNIT_BITSIZE (mode) / 2)")))
> +
>  (define_predicate "aarch64_simd_shift_imm_bitsize_qi"
>    (and (match_code "const_int")
>         (match_test "IN_RANGE (INTVAL (op), 0, 8)")))
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr98772.c b/gcc/testsuite/gcc.target/aarch64/pr98772.c
> index 8259251a7c0b64ae8362ea29ec3cf1d2a9d63547..52ad012dcfe72721b8c987bb826c0ffb8ba3f31e 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr98772.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr98772.c
> @@ -155,4 +155,4 @@ int main ()
>  /* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */
>  /* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */
>  /* { dg-final { scan-assembler-times "umull\\tv" 2 } } */
> -/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */
> +/* { dg-final { scan-assembler-times "shll\\tv" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-widen-shift.c b/gcc/testsuite/gcc.target/aarch64/vect-widen-shift.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6ee41f63ef8a145c0eb7f213950e7501e058b2fa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-widen-shift.c
> @@ -0,0 +1,50 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +#include <stdint.h>
> +#include <string.h>
> +
> +#pragma GCC target "+nosve"
> +
> +#define ARR_SIZE 1024
> +
> +/* Should produce an shll,shll2 pair*/
> +/*
> +** sshll_opt1:
> +**   ...
> +**   shll    v[0-9]+.4s, v[0-9]+.4h, 16
> +**   shll2   v[0-9]+.4s, v[0-9]+.8h, 16
> +**   ...
> +*/
> +void sshll_opt1 (int32_t *foo, int16_t *a, int16_t *b)
> +{
> +    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
> +    {
> +        foo[i]   = a[i]   << 16;
> +        foo[i+1] = a[i+1] << 16;
> +        foo[i+2] = a[i+2] << 16;
> +        foo[i+3] = a[i+3] << 16;
> +    }
> +}
> +
> +/*
> +** sshll_opt2:
> +**   ...
> +**   sxtl    v[0-9]+.4s, v[0-9]+.4h
> +**   sxtl2   v[0-9]+.4s, v[0-9]+.8h
> +**   sshl    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**   sshl    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**   ...
> +*/
> +void sshll_opt2 (int32_t *foo, int16_t *a, int16_t *b)
> +{
> +    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
> +    {
> +        foo[i]   = a[i]   << 16;
> +        foo[i+1] = a[i+1] << 15;
> +        foo[i+2] = a[i+2] << 14;
> +        foo[i+3] = a[i+3] << 17;
> +    }
> +}
> +
> +
