On Wed, 27 Nov 2024, Tamar Christina wrote:

> Hi All,
> 
> The testcase
> 
> #include <stdint.h>
> #include <string.h>
> 
> #define N 8
> #define L 8
> 
> void f(const uint8_t * restrict seq1,
>        const uint8_t *idx, uint8_t *seq_out) {
>   for (int i = 0; i < L; ++i) {
>     uint8_t h = idx[i];
>     memcpy((void *)&seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2);
>   }
> }
> 
> compiled at -O3 -mcpu=neoverse-n1+sve
> 
> miscompiles to:
> 
>     ld1w    z31.s, p3/z, [x23, z29.s, sxtw]
>     ld1w    z29.s, p7/z, [x23, z30.s, sxtw]
>     st1w    z29.s, p7, [x24, z12.s, sxtw]
>     st1w    z31.s, p7, [x24, z12.s, sxtw]
> 
> rather than
> 
>     ld1w    z31.s, p3/z, [x23, z29.s, sxtw]
>     ld1w    z29.s, p7/z, [x23, z30.s, sxtw]
>     st1w    z29.s, p7, [x24, z12.s, sxtw]
>     addvl   x3, x24, #2
>     st1w    z31.s, p3, [x3, z12.s, sxtw]
> 
> Two things go wrong: the wrong mask is used (both stores use p7, where the
> second should use p3), and the address pointers for the stores are wrong
> (both stores use the same base address, so the second store overwrites the
> first instead of storing to the next vector).
> 
> This happens because the codegen loop in vectorizable_store is a nested
> loop: the outer loop iterates over ncopies and the inner loop iterates
> over vec_num.
> 
> For SLP, ncopies == 1 and vec_num == SLP_NUM_STMTS, but the loop mask is
> determined only by the outer-loop index, and the address pointer is only
> updated in the outer loop.
> 
> As such, for SLP we always use the same predicate and the same memory
> location.  This patch flattens the two loops into a single loop that
> iterates over ncopies * vec_num and simplifies the indexing.
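> 
> A minimal pseudocode sketch of the restructuring (illustrative only, not
> the exact GCC sources; the names follow the patch below):
> 
>   /* Before: the loop mask (and the data pointer bump) were keyed off the
>      outer index j only, so with SLP (ncopies == 1, hence j == 0) every
>      inner iteration reused the same predicate and base address.  */
>   for (j = 0; j < ncopies; j++)
>     for (i = 0; i < vec_num; ++i)
>       {
>         final_mask = vect_get_loop_mask (..., ncopies * vec_num, vectype, j);
>         vec_oprnd = (*gvec_oprnds[0])[vec_num * j + i];
>         /* ...emit one store using final_mask and dataref_ptr...  */
>       }
> 
>   /* After: one flat loop; the mask lookup and all indexing use the
>      flattened counter, so each store gets its own predicate and the
>      address is advanced per store.  */
>   int num_stmts = ncopies * vec_num;
>   for (j = 0; j < num_stmts; j++)
>     {
>       final_mask = vect_get_loop_mask (..., num_stmts, vectype, j);
>       vec_oprnd = (*gvec_oprnds[0])[j];
>       /* ...emit one store...  */
>     }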
> 
> This does not fully fix the gcc_r miscompile in SPEC CPU 2017, as the
> error moves somewhere else.  I will look at that next, but this patch
> fixes some other libraries that also started failing.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32 and -m64) with no
> issues.
> 
> Ok for master?

OK.

Thanks,
Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/117557
>       * tree-vect-stmts.cc (vectorizable_store): Flatten the ncopies and
>       vec_num loops.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR tree-optimization/117557
>       * gcc.target/aarch64/pr117557.c: New test.
> 
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr117557.c b/gcc/testsuite/gcc.target/aarch64/pr117557.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..80b3fde41109988db70eafd715224df0b0029cd1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr117557.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mcpu=neoverse-n1+sve -fdump-tree-vect" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <stdint.h>
> +#include <string.h>
> +
> +#define N 8
> +#define L 8
> +
> +/*
> +**f:
> +**   ...
> +**   ld1w    z[0-9]+.s, p([0-9]+)/z, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +**   ld1w    z[0-9]+.s, p([0-9]+)/z, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +**   st1w    z[0-9]+.s, p\1, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +**   incb    x([0-9]+), all, mul #2
> +**   st1w    z[0-9]+.s, p\2, \[x\3, z[0-9]+.s, sxtw\]
> +**   ret
> +**   ...
> +*/
> +void f(const uint8_t * restrict seq1,
> +       const uint8_t *idx, uint8_t *seq_out) {
> +  for (int i = 0; i < L; ++i) {
> +    uint8_t h = idx[i];
> +    memcpy((void *)&seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2);
> +  }
> +}
> +
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c2d5818b2786123fac7afe290d85c7dd2bda4308..4759c274f3ccbb111a907576539b2a8efb7726a3 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9228,7 +9228,8 @@ vectorizable_store (vec_info *vinfo,
>        gcc_assert (!grouped_store);
>        auto_vec<tree> vec_offsets;
>        unsigned int inside_cost = 0, prologue_cost = 0;
> -      for (j = 0; j < ncopies; j++)
> +      int num_stmts = ncopies * vec_num;
> +      for (j = 0; j < num_stmts; j++)
>       {
>         gimple *new_stmt;
>         if (j == 0)
> @@ -9246,14 +9247,14 @@ vectorizable_store (vec_info *vinfo,
>                   vect_get_slp_defs (op_node, gvec_oprnds[0]);
>                 else
>                   vect_get_vec_defs_for_operand (vinfo, first_stmt_info,
> -                                                ncopies, op, gvec_oprnds[0]);
> +                                                num_stmts, op, gvec_oprnds[0]);
>                 if (mask)
>                   {
>                     if (slp_node)
>                       vect_get_slp_defs (mask_node, &vec_masks);
>                     else
>                       vect_get_vec_defs_for_operand (vinfo, stmt_info,
> -                                                    ncopies,
> +                                                    num_stmts,
>                                                      mask, &vec_masks,
>                                                      mask_vectype);
>                   }
> @@ -9279,281 +9280,280 @@ vectorizable_store (vec_info *vinfo,
>           }
>  
>         new_stmt = NULL;
> -       for (i = 0; i < vec_num; ++i)
> +       if (!costing_p)
>           {
> -           if (!costing_p)
> -             {
> -               vec_oprnd = (*gvec_oprnds[0])[vec_num * j + i];
> -               if (mask)
> -                 vec_mask = vec_masks[vec_num * j + i];
> -               /* We should have catched mismatched types earlier.  */
> -               gcc_assert (useless_type_conversion_p (vectype,
> -                                                      TREE_TYPE (vec_oprnd)));
> -             }
> -           unsigned HOST_WIDE_INT align;
> -           tree final_mask = NULL_TREE;
> -           tree final_len = NULL_TREE;
> -           tree bias = NULL_TREE;
> -           if (!costing_p)
> +           vec_oprnd = (*gvec_oprnds[0])[j];
> +           if (mask)
> +             vec_mask = vec_masks[j];
> +           /* We should have catched mismatched types earlier.  */
> +           gcc_assert (useless_type_conversion_p (vectype,
> +                                                  TREE_TYPE (vec_oprnd)));
> +         }
> +       unsigned HOST_WIDE_INT align;
> +       tree final_mask = NULL_TREE;
> +       tree final_len = NULL_TREE;
> +       tree bias = NULL_TREE;
> +       if (!costing_p)
> +         {
> +           if (loop_masks)
> +             final_mask = vect_get_loop_mask (loop_vinfo, gsi,
> +                                              loop_masks, num_stmts,
> +                                              vectype, j);
> +           if (vec_mask)
> +             final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> +                                            final_mask, vec_mask, gsi);
> +         }
> +
> +       if (gs_info.ifn != IFN_LAST)
> +         {
> +           if (costing_p)
>               {
> -               if (loop_masks)
> -                 final_mask = vect_get_loop_mask (loop_vinfo, gsi,
> -                                                  loop_masks,
> -                                                  ncopies * vec_num,
> -                                                  vectype, j);
> -               if (vec_mask)
> -                 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> -                                                final_mask, vec_mask, gsi);
> +               unsigned int cnunits = vect_nunits_for_cost (vectype);
> +               inside_cost
> +                 += record_stmt_cost (cost_vec, cnunits, scalar_store,
> +                                      stmt_info, slp_node, 0,
> +                                      vect_body);
> +               continue;
>               }
>  
> -           if (gs_info.ifn != IFN_LAST)
> -             {
> -               if (costing_p)
> -                 {
> -                   unsigned int cnunits = vect_nunits_for_cost (vectype);
> -                   inside_cost
> -                       += record_stmt_cost (cost_vec, cnunits, scalar_store,
> -                                            stmt_info, slp_node, 0,
> -                                            vect_body);
> -                   continue;
> -                 }
> +           if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +             vec_offset = vec_offsets[j];
>  
> -               if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -                 vec_offset = vec_offsets[vec_num * j + i];
> -               tree scale = size_int (gs_info.scale);
> +           tree scale = size_int (gs_info.scale);
>  
> -               if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
> -                 {
> -                   if (loop_lens)
> -                     final_len = vect_get_loop_len (loop_vinfo, gsi,
> -                                                    loop_lens,
> -                                                    ncopies * vec_num,
> -                                                    vectype, j, 1);
> -                   else
> -                     final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> -                   signed char biasval
> -                     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -                   bias = build_int_cst (intQI_type_node, biasval);
> -                   if (!final_mask)
> -                     {
> -                       mask_vectype = truth_type_for (vectype);
> -                       final_mask = build_minus_one_cst (mask_vectype);
> -                     }
> -                 }
> +           if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
> +             {
> +               if (loop_lens)
> +                 final_len = vect_get_loop_len (loop_vinfo, gsi,
> +                                                loop_lens, num_stmts,
> +                                                vectype, j, 1);
> +               else
> +                 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>  
> -               gcall *call;
> -               if (final_len && final_mask)
> +               signed char biasval
> +                 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +               bias = build_int_cst (intQI_type_node, biasval);
> +               if (!final_mask)
>                   {
> -                   if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> -                     call = gimple_build_call_internal (
> -                       IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> -                       vec_offset, scale, vec_oprnd, final_mask, final_len,
> -                       bias);
> -                   else
> -                     /* Non-vector offset indicates that prefer to take
> -                        MASK_LEN_STRIDED_STORE instead of the
> -                        IFN_MASK_SCATTER_STORE with direct stride arg.  */
> -                     call = gimple_build_call_internal (
> -                       IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
> -                       vec_offset, vec_oprnd, final_mask, final_len, bias);
> +                   mask_vectype = truth_type_for (vectype);
> +                   final_mask = build_minus_one_cst (mask_vectype);
>                   }
> -               else if (final_mask)
> -                 call = gimple_build_call_internal
> -                          (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
> -                           vec_offset, scale, vec_oprnd, final_mask);
> +             }
> +
> +           gcall *call;
> +           if (final_len && final_mask)
> +             {
> +               if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> +                 call = gimple_build_call_internal (
> +                         IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> +                         vec_offset, scale, vec_oprnd, final_mask, final_len,
> +                         bias);
>                 else
> -                 call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
> -                                                    dataref_ptr, vec_offset,
> -                                                    scale, vec_oprnd);
> -               gimple_call_set_nothrow (call, true);
> -               vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> -               new_stmt = call;
> +                 /* Non-vector offset indicates that prefer to take
> +                    MASK_LEN_STRIDED_STORE instead of the
> +                    IFN_MASK_SCATTER_STORE with direct stride arg.  */
> +                 call = gimple_build_call_internal (
> +                         IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
> +                         vec_offset, vec_oprnd, final_mask, final_len, bias);
>               }
> -           else if (gs_info.decl)
> +           else if (final_mask)
> +             call = gimple_build_call_internal
> +                          (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
> +                           vec_offset, scale, vec_oprnd, final_mask);
> +           else
> +             call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
> +                                                dataref_ptr, vec_offset,
> +                                                scale, vec_oprnd);
> +           gimple_call_set_nothrow (call, true);
> +           vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> +           new_stmt = call;
> +         }
> +       else if (gs_info.decl)
> +         {
> +           /* The builtin decls path for scatter is legacy, x86 only.  */
> +           gcc_assert (nunits.is_constant ()
> +                       && (!final_mask
> +                           || SCALAR_INT_MODE_P
> +                                (TYPE_MODE (TREE_TYPE (final_mask)))));
> +           if (costing_p)
>               {
> -               /* The builtin decls path for scatter is legacy, x86 only.  */
> -               gcc_assert (nunits.is_constant ()
> -                           && (!final_mask
> -                               || SCALAR_INT_MODE_P
> -                                    (TYPE_MODE (TREE_TYPE (final_mask)))));
> -               if (costing_p)
> -                 {
> -                   unsigned int cnunits = vect_nunits_for_cost (vectype);
> -                   inside_cost
> -                     += record_stmt_cost (cost_vec, cnunits, scalar_store,
> -                                          stmt_info, slp_node, 0, vect_body);
> -                   continue;
> -                 }
> -               poly_uint64 offset_nunits
> -                 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
> -               if (known_eq (nunits, offset_nunits))
> -                 {
> -                   new_stmt = vect_build_one_scatter_store_call
> +               unsigned int cnunits = vect_nunits_for_cost (vectype);
> +               inside_cost
> +                 += record_stmt_cost (cost_vec, cnunits, scalar_store,
> +                                      stmt_info, slp_node, 0, vect_body);
> +               continue;
> +             }
> +
> +             poly_uint64 offset_nunits
> +               = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
> +             if (known_eq (nunits, offset_nunits))
> +               {
> +                 new_stmt = vect_build_one_scatter_store_call
>                                  (vinfo, stmt_info, gsi, &gs_info,
> -                                 dataref_ptr, vec_offsets[vec_num * j + i],
> +                                 dataref_ptr, vec_offsets[j],
>                                   vec_oprnd, final_mask);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                 }
> -               else if (known_eq (nunits, offset_nunits * 2))
> -                 {
> -                   /* We have a offset vector with half the number of
> -                      lanes but the builtins will store full vectype
> -                      data from the lower lanes.  */
> -                   new_stmt = vect_build_one_scatter_store_call
> +                 vect_finish_stmt_generation (vinfo, stmt_info,
> +                                              new_stmt, gsi);
> +               }
> +             else if (known_eq (nunits, offset_nunits * 2))
> +               {
> +                 /* We have a offset vector with half the number of
> +                    lanes but the builtins will store full vectype
> +                    data from the lower lanes.  */
> +                 new_stmt = vect_build_one_scatter_store_call
>                                  (vinfo, stmt_info, gsi, &gs_info,
> -                                 dataref_ptr,
> -                                 vec_offsets[2 * vec_num * j + 2 * i],
> +                                 dataref_ptr, vec_offsets[2 * j],
>                                   vec_oprnd, final_mask);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> +                 vect_finish_stmt_generation (vinfo, stmt_info,
>                                                  new_stmt, gsi);
> -                   int count = nunits.to_constant ();
> -                   vec_perm_builder sel (count, count, 1);
> -                   sel.quick_grow (count);
> -                   for (int i = 0; i < count; ++i)
> -                     sel[i] = i | (count / 2);
> -                   vec_perm_indices indices (sel, 2, count);
> -                   tree perm_mask
> -                     = vect_gen_perm_mask_checked (vectype, indices);
> -                   new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
> -                                                   vec_oprnd, vec_oprnd,
> -                                                   perm_mask);
> -                   vec_oprnd = make_ssa_name (vectype);
> -                   gimple_set_lhs (new_stmt, vec_oprnd);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                   if (final_mask)
> -                     {
> -                       new_stmt = gimple_build_assign (NULL_TREE,
> -                                                       VEC_UNPACK_HI_EXPR,
> -                                                       final_mask);
> -                       final_mask = make_ssa_name
> +                 int count = nunits.to_constant ();
> +                 vec_perm_builder sel (count, count, 1);
> +                 sel.quick_grow (count);
> +                 for (int i = 0; i < count; ++i)
> +                   sel[i] = i | (count / 2);
> +                 vec_perm_indices indices (sel, 2, count);
> +                 tree perm_mask
> +                   = vect_gen_perm_mask_checked (vectype, indices);
> +                 new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
> +                                                 vec_oprnd, vec_oprnd,
> +                                                 perm_mask);
> +                 vec_oprnd = make_ssa_name (vectype);
> +                 gimple_set_lhs (new_stmt, vec_oprnd);
> +                 vect_finish_stmt_generation (vinfo, stmt_info,
> +                                              new_stmt, gsi);
> +                 if (final_mask)
> +                   {
> +                     new_stmt = gimple_build_assign (NULL_TREE,
> +                                                     VEC_UNPACK_HI_EXPR,
> +                                                     final_mask);
> +                     final_mask = make_ssa_name
>                                     (truth_type_for (gs_info.offset_vectype));
> -                       gimple_set_lhs (new_stmt, final_mask);
> -                       vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                    new_stmt, gsi);
> +                     gimple_set_lhs (new_stmt, final_mask);
> +                     vect_finish_stmt_generation (vinfo, stmt_info,
> +                                                  new_stmt, gsi);
>                       }
> -                   new_stmt = vect_build_one_scatter_store_call
> -                                (vinfo, stmt_info, gsi, &gs_info,
> -                                 dataref_ptr,
> -                                 vec_offsets[2 * vec_num * j + 2 * i + 1],
> -                                 vec_oprnd, final_mask);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                 }
> -               else if (known_eq (nunits * 2, offset_nunits))
> -                 {
> -                   /* We have a offset vector with double the number of
> -                      lanes.  Select the low/high part accordingly.  */
> -                   vec_offset = vec_offsets[(vec_num * j + i) / 2];
> -                   if ((vec_num * j + i) & 1)
> -                     {
> -                       int count = offset_nunits.to_constant ();
> -                       vec_perm_builder sel (count, count, 1);
> -                       sel.quick_grow (count);
> -                       for (int i = 0; i < count; ++i)
> -                         sel[i] = i | (count / 2);
> -                       vec_perm_indices indices (sel, 2, count);
> -                       tree perm_mask = vect_gen_perm_mask_checked
> -                                          (TREE_TYPE (vec_offset), indices);
> -                       new_stmt = gimple_build_assign (NULL_TREE,
> -                                                       VEC_PERM_EXPR,
> -                                                       vec_offset,
> -                                                       vec_offset,
> -                                                       perm_mask);
> -                       vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
> -                       gimple_set_lhs (new_stmt, vec_offset);
> -                       vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                    new_stmt, gsi);
> -                     }
> -                   new_stmt = vect_build_one_scatter_store_call
> +
> +                 new_stmt = vect_build_one_scatter_store_call
> +                               (vinfo, stmt_info, gsi, &gs_info,
> +                                dataref_ptr, vec_offsets[2 * j + 1],
> +                                vec_oprnd, final_mask);
> +                 vect_finish_stmt_generation (vinfo, stmt_info,
> +                                              new_stmt, gsi);
> +               }
> +             else if (known_eq (nunits * 2, offset_nunits))
> +               {
> +                 /* We have a offset vector with double the number of
> +                    lanes.  Select the low/high part accordingly.  */
> +                 vec_offset = vec_offsets[j / 2];
> +                 if (j & 1)
> +                   {
> +                     int count = offset_nunits.to_constant ();
> +                     vec_perm_builder sel (count, count, 1);
> +                     sel.quick_grow (count);
> +                     for (int i = 0; i < count; ++i)
> +                       sel[i] = i | (count / 2);
> +                     vec_perm_indices indices (sel, 2, count);
> +                     tree perm_mask = vect_gen_perm_mask_checked
> +                                        (TREE_TYPE (vec_offset), indices);
> +                     new_stmt = gimple_build_assign (NULL_TREE,
> +                                                     VEC_PERM_EXPR,
> +                                                     vec_offset,
> +                                                     vec_offset,
> +                                                     perm_mask);
> +                     vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
> +                     gimple_set_lhs (new_stmt, vec_offset);
> +                     vect_finish_stmt_generation (vinfo, stmt_info,
> +                                                  new_stmt, gsi);
> +                   }
> +
> +                 new_stmt = vect_build_one_scatter_store_call
>                                  (vinfo, stmt_info, gsi, &gs_info,
>                                   dataref_ptr, vec_offset,
>                                   vec_oprnd, final_mask);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                 }
> -               else
> -                 gcc_unreachable ();
> -             }
> -           else
> +                 vect_finish_stmt_generation (vinfo, stmt_info,
> +                                              new_stmt, gsi);
> +               }
> +             else
> +               gcc_unreachable ();
> +         }
> +       else
> +         {
> +           /* Emulated scatter.  */
> +           gcc_assert (!final_mask);
> +           if (costing_p)
>               {
> -               /* Emulated scatter.  */
> -               gcc_assert (!final_mask);
> -               if (costing_p)
> -                 {
> -                   unsigned int cnunits = vect_nunits_for_cost (vectype);
> -                   /* For emulated scatter N offset vector element extracts
> -                      (we assume the scalar scaling and ptr + offset add is
> -                      consumed by the load).  */
> -                   inside_cost
> -                     += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> -                                          stmt_info, slp_node, 0, vect_body);
> -                   /* N scalar stores plus extracting the elements.  */
> -                   inside_cost
> -                     += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> -                                          stmt_info, slp_node, 0, vect_body);
> -                   inside_cost
> -                     += record_stmt_cost (cost_vec, cnunits, scalar_store,
> -                                          stmt_info, slp_node, 0, vect_body);
> -                   continue;
> -                 }
> +               unsigned int cnunits = vect_nunits_for_cost (vectype);
> +               /* For emulated scatter N offset vector element extracts
> +                  (we assume the scalar scaling and ptr + offset add is
> +                  consumed by the load).  */
> +               inside_cost
> +                 += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> +                                      stmt_info, slp_node, 0, vect_body);
> +               /* N scalar stores plus extracting the elements.  */
> +               inside_cost
> +                 += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> +                                      stmt_info, slp_node, 0, vect_body);
> +               inside_cost
> +                 += record_stmt_cost (cost_vec, cnunits, scalar_store,
> +                                      stmt_info, slp_node, 0, vect_body);
> +               continue;
> +             }
>  
> -               unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> -               unsigned HOST_WIDE_INT const_offset_nunits
> -                 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
> -               vec<constructor_elt, va_gc> *ctor_elts;
> -               vec_alloc (ctor_elts, const_nunits);
> -               gimple_seq stmts = NULL;
> -               tree elt_type = TREE_TYPE (vectype);
> -               unsigned HOST_WIDE_INT elt_size
> -                 = tree_to_uhwi (TYPE_SIZE (elt_type));
> -               /* We support offset vectors with more elements
> -                  than the data vector for now.  */
> -               unsigned HOST_WIDE_INT factor
> -                 = const_offset_nunits / const_nunits;
> -               vec_offset = vec_offsets[(vec_num * j + i) / factor];
> -               unsigned elt_offset
> -                 = ((vec_num * j + i) % factor) * const_nunits;
> -               tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> -               tree scale = size_int (gs_info.scale);
> -               align = get_object_alignment (DR_REF (first_dr_info->dr));
> -               tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> -               for (unsigned k = 0; k < const_nunits; ++k)
> -                 {
> -                   /* Compute the offsetted pointer.  */
> -                   tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> -                                           bitsize_int (k + elt_offset));
> -                   tree idx
> -                     = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> -                                     vec_offset, TYPE_SIZE (idx_type), boff);
> -                   idx = gimple_convert (&stmts, sizetype, idx);
> -                   idx = gimple_build (&stmts, MULT_EXPR, sizetype,
> -                                       idx, scale);
> -                   tree ptr
> -                     = gimple_build (&stmts, PLUS_EXPR,
> -                                     TREE_TYPE (dataref_ptr),
> -                                     dataref_ptr, idx);
> -                   ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> -                   /* Extract the element to be stored.  */
> -                   tree elt
> -                     = gimple_build (&stmts, BIT_FIELD_REF,
> -                                     TREE_TYPE (vectype),
> -                                     vec_oprnd, TYPE_SIZE (elt_type),
> -                                     bitsize_int (k * elt_size));
> -                   gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> -                   stmts = NULL;
> -                   tree ref
> -                     = build2 (MEM_REF, ltype, ptr,
> -                               build_int_cst (ref_type, 0));
> -                   new_stmt = gimple_build_assign (ref, elt);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                 }
> -               if (slp)
> -                 slp_node->push_vec_def (new_stmt);
> +           unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> +           unsigned HOST_WIDE_INT const_offset_nunits
> +             = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
> +           vec<constructor_elt, va_gc> *ctor_elts;
> +           vec_alloc (ctor_elts, const_nunits);
> +           gimple_seq stmts = NULL;
> +           tree elt_type = TREE_TYPE (vectype);
> +           unsigned HOST_WIDE_INT elt_size
> +             = tree_to_uhwi (TYPE_SIZE (elt_type));
> +           /* We support offset vectors with more elements
> +              than the data vector for now.  */
> +           unsigned HOST_WIDE_INT factor
> +             = const_offset_nunits / const_nunits;
> +           vec_offset = vec_offsets[j / factor];
> +           unsigned elt_offset
> +             = (j % factor) * const_nunits;
> +           tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> +           tree scale = size_int (gs_info.scale);
> +           align = get_object_alignment (DR_REF (first_dr_info->dr));
> +           tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> +           for (unsigned k = 0; k < const_nunits; ++k)
> +             {
> +               /* Compute the offsetted pointer.  */
> +               tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> +                                       bitsize_int (k + elt_offset));
> +               tree idx
> +                 = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> +                                 vec_offset, TYPE_SIZE (idx_type), boff);
> +               idx = gimple_convert (&stmts, sizetype, idx);
> +               idx = gimple_build (&stmts, MULT_EXPR, sizetype,
> +                                   idx, scale);
> +               tree ptr
> +                 = gimple_build (&stmts, PLUS_EXPR,
> +                                 TREE_TYPE (dataref_ptr),
> +                                 dataref_ptr, idx);
> +               ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> +               /* Extract the element to be stored.  */
> +               tree elt
> +                 = gimple_build (&stmts, BIT_FIELD_REF,
> +                                 TREE_TYPE (vectype),
> +                                 vec_oprnd, TYPE_SIZE (elt_type),
> +                                 bitsize_int (k * elt_size));
> +               gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +               stmts = NULL;
> +               tree ref
> +                 = build2 (MEM_REF, ltype, ptr,
> +                           build_int_cst (ref_type, 0));
> +               new_stmt = gimple_build_assign (ref, elt);
> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>               }
> +
> +           if (slp)
> +             slp_node->push_vec_def (new_stmt);
>           }
> +
>         if (!slp && !costing_p)
>           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>       }
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
