On 07/11/2025 13:57, Richard Biener wrote:
On Wed, 5 Nov 2025, Christopher Bazley wrote:

On 28/10/2025 13:29, Richard Biener wrote:
On Tue, 28 Oct 2025, Christopher Bazley wrote:

+/* Materialize length number INDEX for a group of scalar stmts in SLP_NODE
that
+   operate on NVECTORS vectors of type VECTYPE, where 0 <= INDEX <
NVECTORS.  A
+   length limit is only required for the tail, therefore NULL_TREE is
returned
+   for every value of INDEX except the last; otherwise, return a value
that
+   contains FACTOR multiplied by the number of elements that should be
+   processed.  */
+
+tree
+vect_slp_get_bb_len (slp_tree slp_node, unsigned int nvectors, tree
vectype,
+                  unsigned int index, unsigned int factor)
+{
+  gcc_checking_assert (SLP_TREE_CAN_USE_LEN_P (slp_node));
+
+  /* Only the last vector can be a partial vector.  */
+  if (index < nvectors - 1)
+    return NULL_TREE;
+
+  /* vect_get_num_copies only allows a partial vector if it is the only
+     vector.  */
+  if (nvectors > 1)
+    return NULL_TREE;
+
+  gcc_checking_assert (nvectors == 1);
+
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  unsigned int group_size = SLP_TREE_LANES (slp_node);
+
+  /* A single vector can be a full vector, in which case no length limit is
+   * needed.  */
+  if (known_eq (nunits, group_size))
+    return NULL_TREE;
+
+  /* Return the scaled length of a single partial vector.  */
+  gcc_checking_assert (known_lt (group_size, nunits));
+  return size_int (group_size * factor);
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 3115c610736..5ec65b2b2de 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1408,7 +1408,9 @@ vectorizable_internal_function (combined_fn cfn, tree
fndecl,
   /* Record that a complete set of masks associated with VINFO would need to
      contain a sequence of NVECTORS masks that each control a vector of type
      VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
-   these vector masks with the vector version of SCALAR_MASK.  */
+   these vector masks with the vector version of SCALAR_MASK.
Alternatively,
+   if doing basic block vectorization, record that an equivalent mask would
be
+   required to vectorize SLP_NODE.  */
   static void
   vect_record_mask (vec_info *vinfo, slp_tree slp_node, unsigned int
   nvectors,
                 tree vectype, tree scalar_mask)
@@ -1418,7 +1420,10 @@ vect_record_mask (vec_info *vinfo, slp_tree slp_node,
unsigned int nvectors,
       vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
       nvectors,
                          vectype, scalar_mask);
     else
-    (void) slp_node; // FORNOW
+    {
+      gcc_checking_assert (!SLP_TREE_CAN_USE_LEN_P (slp_node));
+      SLP_TREE_CAN_USE_MASK_P (slp_node) = true;
+    }
   }

   /* Given a complete set of masks associated with VINFO, extract mask
number
@@ -1436,16 +1441,15 @@ vect_get_mask (vec_info *vinfo, slp_tree slp_node,
gimple_stmt_iterator *gsi,
       return vect_get_loop_mask (loop_vinfo, gsi, &LOOP_VINFO_MASKS
       (loop_vinfo),
                              nvectors, vectype, index);
     else
-    {
-      (void) slp_node; // FORNOW
-      return NULL_TREE;
-    }
+    return vect_slp_get_bb_mask (slp_node, gsi, nvectors, vectype, index);
   }

   /* Record that a complete set of lengths associated with VINFO would need
   to
      contain a sequence of NVECTORS lengths for controlling an operation on
      VECTYPE.  The operation splits each element of VECTYPE into FACTOR
      separate
-   subelements, measuring the length as a number of these subelements.  */
+   subelements, measuring the length as a number of these subelements.
+   Alternatively, if doing basic block vectorization, record that an
equivalent
+   length would be required to vectorize SLP_NODE.  */
   static void
   vect_record_len (vec_info *vinfo, slp_tree slp_node, unsigned int
   nvectors,
                tree vectype, unsigned int factor)
@@ -1455,7 +1459,10 @@ vect_record_len (vec_info *vinfo, slp_tree slp_node,
unsigned int nvectors,
       vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
       nvectors,
                         vectype, factor);
     else
-    (void) slp_node; // FORNOW
+    {
+      gcc_checking_assert (!SLP_TREE_CAN_USE_MASK_P (slp_node));
+      SLP_TREE_CAN_USE_LEN_P (slp_node) = true;
+    }
   }

   /* Given a complete set of lengths associated with VINFO, extract length
number
@@ -1476,10 +1483,7 @@ vect_get_len (vec_info *vinfo, slp_tree slp_node,
gimple_stmt_iterator *gsi,
       return vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS
       (loop_vinfo),
                             nvectors, vectype, index, factor);
     else
-    {
-      (void) slp_node; // FORNOW
-      return NULL_TREE;
-    }
+    return vect_slp_get_bb_len (slp_node, nvectors, vectype, index,
factor);
   }

   static tree permute_vec_elements (vec_info *, tree, tree, tree,
stmt_vec_info,
@@ -14252,24 +14256,35 @@ supportable_indirect_convert_operation
(code_helper code,
      mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
      Add the statements to SEQ.  */

+void
+vect_gen_while_ssa_name (gimple_seq *seq, tree mask_type, tree start_index,
+                      tree end_index, tree ssa_name)
+{
+  tree cmp_type = TREE_TYPE (start_index);
+  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
cmp_type,
+                                                    mask_type,
+                                                    OPTIMIZE_FOR_SPEED));
+  gcall *call
+    = gimple_build_call_internal (IFN_WHILE_ULT, 3, start_index, end_index,
+                               build_zero_cst (mask_type));
That's quite restrictive, for constant_p nunits you should be able to
create a VECTOR_CST.  How do you ensure that the actual vector length
is big enough btw?
CCing the list on my reply.

I think that an existing function named fold_while_ult already optimises
all IFN_WHILE_ULT usage that can be optimised safely, so I'm reluctant
to duplicate a version of that logic here.
x86 does not implement IFN_WHILE_ULT, that's what I wanted to say.
I do not remember any check that ensures this is only called for
targets that do?

The function vect_gen_while_ssa_name is called by

{vectorizable_call|vectorizable_operation|vectorizable_load|vectorizable_store|vectorizable_simd_clone_call} -> vect_get_mask (if and only if vect_can_use_mask_p) -> vect_slp_get_bb_mask

or

{vect_do_peeling|vect_transform_loop} -> vect_set_loop_condition -> vect_set_loop_condition_partial_vectors -> vect_set_loop_controls_directly (if and only if LOOP_VINFO_FULLY_MASKED_P) -> vect_gen_while.

For loop vectorisation, vect_can_use_mask_p is equivalent to LOOP_VINFO_FULLY_MASKED_P; for BB SLP, it is instead equivalent to SLP_TREE_CAN_USE_MASK_P.

SLP_TREE_CAN_USE_MASK_P is initialised to false and set to true in vect_record_mask. I intended it to be a BB-SLP-specific alternative to the !LOOP_VINFO_MASKS (L).is_empty () precondition in LOOP_VINFO_FULLY_MASKED_P; however, LOOP_VINFO_FULLY_MASKED_P also has another precondition that has no equivalent in the definition of SLP_TREE_CAN_USE_MASK_P: LOOP_VINFO_USING_PARTIAL_VECTORS_P.

LOOP_VINFO_USING_PARTIAL_VECTORS_P is true if both LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P and LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P are true. LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P is true if param_vect_partial_vector_usage was true (which does not depend on the target AFAIK) and it has not subsequently been set to false; LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P is true for VLA modes or if peeling is insufficient.

I created a BB SLP equivalent of LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P, named SLP_TREE_CAN_USE_PARTIAL_VECTORS_P, which is set to true during construction of an SLP node (similar to how LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P is initialised to true if param_vect_partial_vector_usage).

Target support for the appropriate load/store-lanes instructions is checked in check_load_store_for_partial_vectors (iff vect_can_use_partial_vectors_p). This function can call vect_cannot_use_partial_vectors (often as an alternative to vect_record_len or vect_record_mask). For loop vectorisation, calling vect_cannot_use_partial_vectors is equivalent to LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P = false; for BB SLP, it is equivalent to SLP_TREE_CAN_USE_PARTIAL_VECTORS_P = false. Consequently, the circumstances in which vect_can_use_mask_p returns false in vect_get_mask are similar to those in which LOOP_VINFO_FULLY_MASKED_P returns false in vect_set_loop_controls_directly, although LOOP_VINFO_FULLY_MASKED_P has additional preconditions.

A robust definition of vect_can_use_mask_p for BB SLP should probably have SLP_TREE_CAN_USE_PARTIAL_VECTORS_P as a precondition for returning true, in conjunction with SLP_TREE_CAN_USE_MASK_P (and similarly for vect_can_use_len_p). At the moment, there is a maintenance hazard where vect_record_mask or vect_record_len could be called despite SLP_TREE_CAN_USE_PARTIAL_VECTORS_P == false (although that never happens in my patches), or either of the vect_record_* functions could be called before vect_cannot_use_partial_vectors (e.g., because vect_load_lanes_supported succeeded for one value of group_size but not another).

Thanks for drawing my attention to this.

--
Christopher Bazley
Staff Software Engineer, GNU Tools Team.
Arm Ltd, 110 Fulbourn Road, Cambridge, CB1 9NJ, UK.
http://www.arm.com/

Reply via email to