Juzhe-Zhong <juzhe.zh...@rivai.ai> writes:
> This patch fixes this following FAILs in RISC-V regression:
>
> FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects  scan-tree-dump 
> vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP 
> stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects  scan-tree-dump 
> vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP 
> stmts"
>
> The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD.
>
> We have 2 following situations of scalar recognized MASK_LEN_GATHER_LOAD:
>
> 1. conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, 
> condtional mask).
>    
>    This situation we just need to leverage the current MASK_GATHER_LOAD which 
> can achieve SLP MASK_LEN_GATHER_LOAD.
>
> 2. un-conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, 
> zero, -1)
>    
>    Current SLP check will failed on dummy mask -1, so we relax the check in 
> tree-vect-slp.cc and allow it to be materialized.
>     
> Consider this following case:
>
> void __attribute__((noipa))
> f (int *restrict y, int *restrict x, int *restrict indices, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       y[i * 2] = x[indices[i * 2]] + 1;
>       y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
>     }
> }
>
> https://godbolt.org/z/WG3M3n7Mo
>
> GCC unable to SLP using VEC_LOAD_LANES/VEC_STORE_LANES:
>
> f:
>         ble     a3,zero,.L5
> .L3:
>         vsetvli a5,a3,e8,mf4,ta,ma
>         vsetvli zero,a5,e32,m1,ta,ma
>         vlseg2e32.v     v6,(a2)
>         vsetvli a4,zero,e64,m2,ta,ma
>         vsext.vf2       v2,v6
>         vsll.vi v2,v2,2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vluxei64.v      v1,(a1),v2
>         vsetvli a4,zero,e64,m2,ta,ma
>         vsext.vf2       v2,v7
>         vsetvli zero,zero,e32,m1,ta,ma
>         vadd.vi v4,v1,1
>         vsetvli zero,zero,e64,m2,ta,ma
>         vsll.vi v2,v2,2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vluxei64.v      v2,(a1),v2
>         vsetvli a4,zero,e32,m1,ta,ma
>         slli    a6,a5,3
>         vadd.vi v5,v2,2
>         sub     a3,a3,a5
>         vsetvli zero,a5,e32,m1,ta,ma
>         vsseg2e32.v     v4,(a0)
>         add     a2,a2,a6
>         add     a0,a0,a6
>         bne     a3,zero,.L3
> .L5:
>         ret
>
> After this patch:
>
> f:
>       ble     a3,zero,.L5
>       li      a5,1
>       csrr    t1,vlenb
>       slli    a5,a5,33
>       srli    a7,t1,2
>       addi    a5,a5,1
>       slli    a3,a3,1
>       neg     t3,a7
>       vsetvli a4,zero,e64,m1,ta,ma
>       vmv.v.x v4,a5
> .L3:
>       minu    a5,a3,a7
>       vsetvli zero,a5,e32,m1,ta,ma
>       vle32.v v1,0(a2)
>       vsetvli a4,zero,e64,m2,ta,ma
>       vsext.vf2       v2,v1
>       vsll.vi v2,v2,2
>       vsetvli zero,a5,e32,m1,ta,ma
>       vluxei64.v      v2,(a1),v2
>       vsetvli a4,zero,e32,m1,ta,ma
>       mv      a6,a3
>       vadd.vv v2,v2,v4
>       vsetvli zero,a5,e32,m1,ta,ma
>       vse32.v v2,0(a0)
>       add     a2,a2,t1
>       add     a0,a0,t1
>       add     a3,a3,t3
>       bgtu    a6,a7,.L3
> .L5:
>       ret
>
> Note that I found we are missing conditional mask gather_load SLP test, 
> Append a test for it in this patch.

Yeah, we're missing a target-independent test.  I'm afraid I used
aarch64-specific tests for a lot of this stuff, since (a) I wanted
to check the quality of the asm output and (b) it's very hard to write
gcc.dg/vect tests that don't fail on some target or other.  Thanks for
picking this up.

>
> Tested on RISC-V and Bootstrap && Regression on X86 passed.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
>       * tree-vect-slp.cc (vect_get_operand_map): Add MASK_LEN_GATHER_LOAD.
>       (vect_get_and_check_slp_defs): Ditto.
>       (vect_build_slp_tree_1): Ditto.
>       (vect_build_slp_tree_2): Ditto.
>       * tree-vect-stmts.cc (vectorizable_load): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.dg/vect/vect-gather-6.c: New test.
>
> ---
>  gcc/testsuite/gcc.dg/vect/vect-gather-6.c | 15 +++++++++++++++
>  gcc/tree-vect-slp.cc                      | 22 ++++++++++++++++++----
>  gcc/tree-vect-stmts.cc                    | 10 +++++++++-
>  3 files changed, 42 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-6.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-6.c 
> b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
> new file mode 100644
> index 00000000000..ff55f321854
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +
> +void
> +f (int *restrict y, int *restrict x, int *restrict indices, int *restrict 
> cond, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    {
> +      if (cond[i * 2])
> +     y[i * 2] = x[indices[i * 2]] + 1;
> +      if (cond[i * 2 + 1])
> +     y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target 
> vect_gather_load_ifn } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index fa098f9ff4e..38fe6ba6296 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -542,6 +542,7 @@ vect_get_operand_map (const gimple *stmt, unsigned char 
> swap = 0)
>           return arg1_map;
>  
>         case IFN_MASK_GATHER_LOAD:
> +       case IFN_MASK_LEN_GATHER_LOAD:
>           return arg1_arg4_map;
>  
>         case IFN_MASK_STORE:
> @@ -700,8 +701,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned 
> char swap,
>       {
>         tree type = TREE_TYPE (oprnd);
>         dt = dts[i];
> -       if ((dt == vect_constant_def
> -            || dt == vect_external_def)
> +       if (dt == vect_external_def
>             && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
>             && (TREE_CODE (type) == BOOLEAN_TYPE
>                 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
> @@ -713,6 +713,17 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned 
> char swap,
>                                "for variable-length SLP %T\n", oprnd);
>             return -1;
>           }
> +       if (dt == vect_constant_def
> +           && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
> +           && !can_duplicate_and_interleave_p (vinfo, stmts.length (), type))
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "Build SLP failed: invalid type of def "
> +                              "for variable-length SLP %T\n",
> +                              oprnd);
> +           return -1;
> +         }
>  
>         /* For the swapping logic below force vect_reduction_def
>            for the reduction op in a SLP reduction group.  */
> @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>  
>         if (cfn == CFN_MASK_LOAD
>             || cfn == CFN_GATHER_LOAD
> -           || cfn == CFN_MASK_GATHER_LOAD)
> +           || cfn == CFN_MASK_GATHER_LOAD
> +           || cfn == CFN_MASK_LEN_GATHER_LOAD)
>           ldst_p = true;
>         else if (cfn == CFN_MASK_STORE)
>           {
> @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>         if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
>             && rhs_code != CFN_GATHER_LOAD
>             && rhs_code != CFN_MASK_GATHER_LOAD
> +           && rhs_code != CFN_MASK_LEN_GATHER_LOAD
>             /* Not grouped loads are handled as externals for BB
>                vectorization.  For loop vectorization we can handle
>                splats the same we handle single element interleaving.  */
> @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>       gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
>                   || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
> -                 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
> +                 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
> +                 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
>        else
>       {
>         *max_nunits = this_max_nunits;
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index ce925cc1d53..994234eec08 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9738,12 +9738,20 @@ vectorizable_load (vec_info *vinfo,
>       return false;
>  
>        mask_index = internal_fn_mask_index (ifn);
> +      slp_tree slp_op = NULL;
>        if (mask_index >= 0 && slp_node)
>       mask_index = vect_slp_child_index_for_operand (call, mask_index);
>        if (mask_index >= 0
>         && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> -                                   &mask, NULL, &mask_dt, &mask_vectype))
> +                                   &mask, &slp_op, &mask_dt, &mask_vectype))
>       return false;
> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, 
> mask_vectype))
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                          "incompatible vector types for invariants\n");
> +       return false;
> +     }

slp_op and mask_vectype are only initialised when mask_index >= 0.
Shouldn't this code be under mask_index >= 0 too?

Also, when do we encounter mismatched mask_vectypes?  Presumably the SLP
node has a known vectype by this point.  I think a comment would be useful.

Thanks,
Richard

Reply via email to