On 11/11/2025 13:37, Richard Biener wrote:
On Mon, 10 Nov 2025, Christopher Bazley wrote:

On 10/11/2025 16:13, Christopher Bazley wrote:
On 10/11/2025 14:59, Christopher Bazley wrote:
On 07/11/2025 13:57, Richard Biener wrote:
On Wed, 5 Nov 2025, Christopher Bazley wrote:

On 28/10/2025 13:29, Richard Biener wrote:
On Tue, 28 Oct 2025, Christopher Bazley wrote:

+/* Materialize length number INDEX for a group of scalar stmts in SLP_NODE
+   that operate on NVECTORS vectors of type VECTYPE, where
+   0 <= INDEX < NVECTORS.  A length limit is only required for the tail,
+   therefore NULL_TREE is returned for every value of INDEX except the last;
+   for the last vector, return NULL_TREE if no limit is needed, otherwise
+   return FACTOR multiplied by the number of elements that should be
+   processed.  */
+
+tree
+vect_slp_get_bb_len (slp_tree slp_node, unsigned int nvectors, tree
vectype,
+                  unsigned int index, unsigned int factor)
+{
+  gcc_checking_assert (SLP_TREE_CAN_USE_LEN_P (slp_node));
+
+  /* Only the last of the NVECTORS vectors can be a partial vector.  */
+  if (index < nvectors - 1)
+    return NULL_TREE;
+
+  /* vect_get_num_copies only allows a partial vector if it is the only
+     vector, so with multiple vectors every vector is full.  */
+  if (nvectors > 1)
+    return NULL_TREE;
+
+  gcc_checking_assert (nvectors == 1);
+
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  unsigned int group_size = SLP_TREE_LANES (slp_node);
+
+  /* A single vector can be a full vector, in which case no length limit is
+     needed either.  */
+  if (known_eq (nunits, group_size))
+    return NULL_TREE;
+
+  /* Return the scaled length of a single partial vector.  */
+  gcc_checking_assert (known_lt (group_size, nunits));
+  return size_int (group_size * factor);
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 3115c610736..5ec65b2b2de 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1408,7 +1408,9 @@ vectorizable_internal_function (combined_fn cfn,
tree
fndecl,
    /* Record that a complete set of masks associated with VINFO would
need to
       contain a sequence of NVECTORS masks that each control a vector of
type
       VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would
AND
-   these vector masks with the vector version of SCALAR_MASK.  */
+   these vector masks with the vector version of SCALAR_MASK.
Alternatively,
+   if doing basic block vectorization, record that an equivalent mask
would
be
+   required to vectorize SLP_NODE.  */
    static void
    vect_record_mask (vec_info *vinfo, slp_tree slp_node, unsigned int
    nvectors,
                  tree vectype, tree scalar_mask)
@@ -1418,7 +1420,10 @@ vect_record_mask (vec_info *vinfo, slp_tree
slp_node,
unsigned int nvectors,
        vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS
(loop_vinfo),
        nvectors,
                           vectype, scalar_mask);
      else
-    (void) slp_node; // FORNOW
+    {
+      gcc_checking_assert (!SLP_TREE_CAN_USE_LEN_P (slp_node));
+      SLP_TREE_CAN_USE_MASK_P (slp_node) = true;
+    }
    }

    /* Given a complete set of masks associated with VINFO, extract mask
number
@@ -1436,16 +1441,15 @@ vect_get_mask (vec_info *vinfo, slp_tree
slp_node,
gimple_stmt_iterator *gsi,
        return vect_get_loop_mask (loop_vinfo, gsi, &LOOP_VINFO_MASKS
        (loop_vinfo),
                               nvectors, vectype, index);
      else
-    {
-      (void) slp_node; // FORNOW
-      return NULL_TREE;
-    }
+    return vect_slp_get_bb_mask (slp_node, gsi, nvectors, vectype,
index);
    }

    /* Record that a complete set of lengths associated with VINFO would
need
    to
       contain a sequence of NVECTORS lengths for controlling an
operation on
       VECTYPE.  The operation splits each element of VECTYPE into FACTOR
       separate
-   subelements, measuring the length as a number of these subelements.
*/
+   subelements, measuring the length as a number of these subelements.
+   Alternatively, if doing basic block vectorization, record that an
equivalent
+   length would be required to vectorize SLP_NODE.  */
    static void
    vect_record_len (vec_info *vinfo, slp_tree slp_node, unsigned int
    nvectors,
                 tree vectype, unsigned int factor)
@@ -1455,7 +1459,10 @@ vect_record_len (vec_info *vinfo, slp_tree
slp_node,
unsigned int nvectors,
        vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
        nvectors,
                          vectype, factor);
      else
-    (void) slp_node; // FORNOW
+    {
+      gcc_checking_assert (!SLP_TREE_CAN_USE_MASK_P (slp_node));
+      SLP_TREE_CAN_USE_LEN_P (slp_node) = true;
+    }
    }

    /* Given a complete set of lengths associated with VINFO, extract
length
number
@@ -1476,10 +1483,7 @@ vect_get_len (vec_info *vinfo, slp_tree slp_node,
gimple_stmt_iterator *gsi,
        return vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS
        (loop_vinfo),
                              nvectors, vectype, index, factor);
      else
-    {
-      (void) slp_node; // FORNOW
-      return NULL_TREE;
-    }
+    return vect_slp_get_bb_len (slp_node, nvectors, vectype, index,
factor);
    }

    static tree permute_vec_elements (vec_info *, tree, tree, tree,
stmt_vec_info,
@@ -14252,24 +14256,35 @@ supportable_indirect_convert_operation
(code_helper code,
       mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
       Add the statements to SEQ.  */

+void
+vect_gen_while_ssa_name (gimple_seq *seq, tree mask_type, tree
start_index,
+                      tree end_index, tree ssa_name)
+{
+  tree cmp_type = TREE_TYPE (start_index);
+  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
cmp_type,
+ mask_type,
+ OPTIMIZE_FOR_SPEED));
+  gcall *call
+    = gimple_build_call_internal (IFN_WHILE_ULT, 3, start_index,
end_index,
+                               build_zero_cst (mask_type));
That's quite restrictive, for constant_p nunits you should be able to
create a VECTOR_CST.  How do you ensure that the actual vector length
is big enough btw?
CCing the list on my reply.

I think that an existing function named fold_while_ult already optimises
all IFN_WHILE_ULT usage that can be optimised safely, so I'm reluctant
to duplicate a version of that logic here.
x86 does not implement IFN_WHILE_ULT; that's what I wanted to say.
I do not remember any check that ensures this is only called for
targets that do?
The function vect_gen_while_ssa_name is called by

{vectorizable_call|vectorizable_operation|vectorizable_load|vectorizable_store|vectorizable_simd_clone_call}
-> vect_get_mask (if and only if vect_can_use_mask_p) ->
vect_slp_get_bb_mask

or

{vect_do_peeling|vect_transform_loop} -> vect_set_loop_condition ->
vect_set_loop_condition_partial_vectors -> vect_set_loop_controls_directly
(if and only if LOOP_VINFO_FULLY_MASKED_P) -> vect_gen_while.

For loop vectorisation, vect_can_use_mask_p is equivalent to
LOOP_VINFO_FULLY_MASKED_P; for BB SLP, it is instead equivalent to
SLP_TREE_CAN_USE_MASK_P.

SLP_TREE_CAN_USE_MASK_P is initialised to false and set to true in
vect_record_mask. I intended it to be a BB-SLP-specific alternative to the
!LOOP_VINFO_MASKS (L).is_empty () precondition in
LOOP_VINFO_FULLY_MASKED_P; however, LOOP_VINFO_FULLY_MASKED_P also has
another precondition that has no equivalent in the definition of
SLP_TREE_CAN_USE_MASK_P: LOOP_VINFO_USING_PARTIAL_VECTORS_P.

LOOP_VINFO_USING_PARTIAL_VECTORS_P is true if both
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P and
LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P are true.
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P is true if
param_vect_partial_vector_usage was true (which does not depend on the
target AFAIK) and it has not subsequently been set to false;
LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P is true for VLA modes or if peeling
is insufficient.

I created a BB SLP equivalent
of LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P, named
SLP_TREE_CAN_USE_PARTIAL_VECTORS_P, which is set to true during
construction of an SLP node (similar to how
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P is initialised to true if
param_vect_partial_vector_usage is set).

Target support for the appropriate load/store-lanes instructions is checked
in check_load_store_for_partial_vectors (iff
vect_can_use_partial_vectors_p). This function can
call vect_cannot_use_partial_vectors (often as an alternative to
vect_record_len or vect_record_mask). For loop vectorisation, calling
vect_cannot_use_partial_vectors is equivalent to
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P = false; for BB SLP, it is equivalent
to SLP_TREE_CAN_USE_PARTIAL_VECTORS_P = false. Consequently, the
circumstances in which vect_can_use_mask_p returns false in vect_get_mask
are similar to those in which LOOP_VINFO_FULLY_MASKED_P returns false in
vect_set_loop_controls_directly, although LOOP_VINFO_FULLY_MASKED_P has
additional preconditions.

A robust definition of vect_can_use_mask_p for BB SLP should probably have
SLP_TREE_CAN_USE_PARTIAL_VECTORS_P as a precondition for returning true, in
conjunction with
Actually, I think vect_get_mask and vect_get_len are only called during the
transform phase. Analysis of the SLP tree should fail earlier than that,
during the analysis phase, if the target does not support the required
instructions (e.g., reflected by SLP_TREE_CAN_USE_PARTIAL_VECTORS_P ==
false). I think that condition is already checked in vect_analyze_stmt,
although vect_can_use_mask_p and vect_can_use_len_p are not part of the
check.
SLP_TREE_CAN_USE_MASK_P (and similar for vect_can_use_len_p). At the
moment, there is a maintenance hazard where vect_record_mask or
vect_record_len could be called despite SLP_TREE_CAN_USE_PARTIAL_VECTORS_P
== false (although that never happens in my patches) or either of the
vect_record_* functions could be called before
vect_cannot_use_partial_vectors (e.g., because vect_load_lanes_supported
succeeded for one value of group_size but not another).

It seems as though GCC is currently at least somewhat robust against
different calls to vect_load_lanes_supported returning different answers, as
evidenced by the following code in vect_analyze_loop_2:
Oh, but it looks as though that cannot happen for a single SLP node:
check_load_store_for_partial_vectors, vectorizable_operation and
vectorizable_call only call either vect_record_len or vect_record_mask, never
both. That means my existing assertions are correct...
Yes.  Which is why I said - given no technical reason we cannot mix
mask- and len-masking - we decide whether to use mask- or len-masking
when we analyze each individual SLP node operation.  So we don't need
the _CAN_USE_... vs. USING_... duality but we can record a decision
in the SLP node itself and in particular fail analysis if we cannot
mask an operation in a SLP node but we'd need to.

The first version of my alternative patch to store a partial vectors style per SLP node instead of the SLP_TREE_CAN_USE_PARTIAL_VECTORS_P, SLP_TREE_CAN_USE_MASK_P and SLP_TREE_CAN_USE_LEN_P flags does not work. I have made vect_cannot_use_partial_vectors a no-op (for BB SLP), made vect_can_use_partial_vectors_p always return true (for BB SLP), and I rely on vect_record_len or vect_record_mask to set the partial vectors style.

It does not work because "no style set" and known_lt (group_size, nunits) are not a sufficient condition on which to reject a node in vect_analyze_stmt ("not vectorized: SLP node needs but cannot use partial vectors"). For example, with the published version of my RFC, an add operation with group_size < nunits is not rejected because SLP_TREE_CAN_USE_PARTIAL_VECTORS_P is still true; with the alternative that I am testing, the same add operation is rejected because no partial vectors style is ever recorded for that node.

To make this alternative patch work, I would need a way of easily identifying which nodes might need partial vectors. I can try adding SLP_TREE_MEMORY_ACCESS_TYPE != VMAT_UNINITIALIZED to the conjunction but I am not convinced that is sufficient because vect_cannot_use_partial_vectors or vect_record_mask are also called by vectorizable_operation, vectorizable_simd_clone_call and vectorizable_call. (vect_cannot_use_partial_vectors is also called by vectorizable_conversion. Are these invocations desirable/necessary for BB SLP? I suspect not but I'd appreciate guidance on this point.)

Having to maintain a list of functions that might call vect_cannot_use_partial_vectors feels undesirable and weakens the abstraction. I can switch to storing the partial vectors style as an enum value but think I still need to set a flag on the SLP node in the BB SLP implementation of vect_cannot_use_partial_vectors, as the published version of my RFC already does.

--
Christopher Bazley
Staff Software Engineer, GNU Tools Team.
Arm Ltd, 110 Fulbourn Road, Cambridge, CB1 9NJ, UK.
http://www.arm.com/

Reply via email to