> I think this is better, but I'd make this partial_store_elidable_p only
> and for loads replace with 'else'? Alternatively rename it to
> partial_load_store_all_lanes_masked_p or so? Btw, I see we're oddly
> rejecting any mask != -1 even when len == 0? Likewise we don't seem to
> treat mask == 0 the same as len == 0?
Yes, I also noticed the inconsistencies. At first I wanted to change as
little as possible, but maybe another approach is still OK:
What we're still missing to treat things more uniformly is an else operand
for len_load. With that in place we can use internal_fn_else_index for all
partial loads and pick up the else value from there.
So I went ahead and did that in a separate preparation patch.
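With the else operand available everywhere, the folding code can fetch the
else value uniformly for every partial load, roughly along these lines (a
sketch of what the hunk below does, not a separate API):

  int else_index = internal_fn_else_index (ifn);
  tree else_value = gimple_call_arg (call, else_index);
  gassign *new_stmt = gimple_build_assign (lhs, else_value);

instead of special-casing the MASK_*, LEN_*, and MASK_LEN_* variants.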
The attached v3 of the elision patch goes a bit further than v1/v2 in that
it tries to classify an access as "all active", "all inactive", or "mixed"
and also handles all kinds of partial loads and stores (gathers/scatters,
lanes). It depends on the preparation patch, though, because it calls
internal_fn_else_index unconditionally.
They have been bootstrapped and regtested individually as well as together on
x86 and power10. Regtested on riscv64, aarch64, and s390 (via qemu).
Regards
Robin
[PATCH v3] fold: Elide MASK_LEN_LOAD/STORE with zero length [PR122635].
This patch adds zero-length handling to gimple_fold_partial_store and
gimple_fold_partial_load and unifies them into
gimple_fold_partial_load_store.
It introduces a new function partial_load_store_mask_state that returns
one of
  MASK_ALL_INACTIVE,
  MASK_ALL_ACTIVE, or
  MASK_UNKNOWN.
This result is used to either replace a load with its else value and
elide a store (all inactive), turn the load/store into a regular mem ref
(all active), or do nothing (mixed/unknown).
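
As a schematic illustration (argument lists abbreviated; not the exact
internal-fn signatures), a zero-length access

  vect_x = .MASK_LEN_LOAD (ptr, ..., mask, else, len=0, bias=0);
  .MASK_LEN_STORE (ptr, ..., mask, val, len=0, bias=0);

now becomes

  vect_x = else;
  <store removed>

while a full-length, all-ones-mask access is still folded into a plain
MEM_REF assignment, as before.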
PR tree-optimization/122635
gcc/ChangeLog:
* gimple-fold.cc (enum mask_load_store_state): New enum.
(gimple_fold_partial_load_store_mem_ref): Only fold
"all active" loads/stores.
(partial_load_store_mask_state): New function to compute mask
state.
(gimple_fold_partial_load): Remove.
(gimple_fold_partial_load_store): New function.
(gimple_fold_partial_store): Remove.
(gimple_fold_call): Use new function.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/pfalse-store.c: Expect more elided
stores.
* gcc.target/riscv/rvv/autovec/pr122635-1.c: New test.
* gcc.target/riscv/rvv/autovec/pr122635-2.c: New test.
* gcc.target/powerpc/p9-vec-length-epil-8.c: Expect two fewer lxvl.
---
gcc/gimple-fold.cc | 225 ++++++++++++------
.../gcc.target/aarch64/sve/pfalse-store.c | 5 +-
.../gcc.target/powerpc/p9-vec-length-epil-8.c | 2 +-
.../gcc.target/riscv/rvv/autovec/pr122635-1.c | 20 ++
.../gcc.target/riscv/rvv/autovec/pr122635-2.c | 18 ++
5 files changed, 198 insertions(+), 72 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c
diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
index 3fc76313622..a7b5d8352e7 100644
--- a/gcc/gimple-fold.cc
+++ b/gcc/gimple-fold.cc
@@ -5757,50 +5757,112 @@ arith_overflowed_p (enum tree_code code, const_tree type,
return wi::min_precision (wres, sign) > TYPE_PRECISION (type);
}
-/* If IFN_{MASK,LEN,MASK_LEN}_LOAD/STORE call CALL is unconditional,
- return a MEM_REF for the memory it references, otherwise return null.
- VECTYPE is the type of the memory vector. MASK_P indicates it's for
- MASK if true, otherwise it's for LEN. */
+/* Mask state for partial load/store operations (mask and length). */
+enum mask_load_store_state {
+ MASK_ALL_INACTIVE, /* All lanes/elements are inactive (can be elided). */
+ MASK_ALL_ACTIVE, /* All lanes/elements are active (unconditional). */
+ MASK_UNKNOWN
+};
-static tree
-gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
+/* Check the mask/length state of IFN_{MASK,LEN,MASK_LEN}_LOAD/STORE call CALL.
+ Returns whether all elements are active, all inactive, or mixed.
+ VECTYPE is the vector type of the operation. */
+
+static enum mask_load_store_state
+partial_load_store_mask_state (gcall *call, tree vectype)
{
- tree ptr = gimple_call_arg (call, 0);
- tree alias_align = gimple_call_arg (call, 1);
- if (!tree_fits_uhwi_p (alias_align))
- return NULL_TREE;
+ internal_fn ifn = gimple_call_internal_fn (call);
+ int mask_index = internal_fn_mask_index (ifn);
+ int len_index = internal_fn_len_index (ifn);
+
+ /* Extract length and mask arguments up front. */
+ tree len = len_index != -1 ? gimple_call_arg (call, len_index) : NULL_TREE;
+ tree bias = len ? gimple_call_arg (call, len_index + 1) : NULL_TREE;
+  tree mask = mask_index != -1 ? gimple_call_arg (call, mask_index) : NULL_TREE;
+
+ poly_int64 nelts = GET_MODE_NUNITS (TYPE_MODE (vectype));
- if (mask_p)
+ poly_widest_int wlen = -1;
+ bool full_length_p = !len; /* No length means full length. */
+
+ /* Compute effective length. */
+ if (len && poly_int_tree_p (len))
{
- tree mask = gimple_call_arg (call, 2);
- if (!integer_all_onesp (mask))
- return NULL_TREE;
+ gcc_assert (TREE_CODE (bias) == INTEGER_CST);
+ wlen = wi::to_poly_widest (len) + wi::to_widest (bias);
+
+ if (known_eq (wlen, 0))
+ return MASK_ALL_INACTIVE;
+
+ if (known_eq (wlen, nelts))
+ full_length_p = true;
+ else
+ full_length_p = false;
}
- else
+
+ /* Check mask for early return cases. */
+ if (mask)
{
- internal_fn ifn = gimple_call_internal_fn (call);
- int len_index = internal_fn_len_index (ifn);
- tree basic_len = gimple_call_arg (call, len_index);
- if (!poly_int_tree_p (basic_len))
- return NULL_TREE;
- tree bias = gimple_call_arg (call, len_index + 1);
- gcc_assert (TREE_CODE (bias) == INTEGER_CST);
- /* For LEN_LOAD/LEN_STORE/MASK_LEN_LOAD/MASK_LEN_STORE,
- we don't fold when (bias + len) != VF. */
- if (maybe_ne (wi::to_poly_widest (basic_len) + wi::to_widest (bias),
- GET_MODE_NUNITS (TYPE_MODE (vectype))))
- return NULL_TREE;
+ if (integer_zerop (mask))
+ return MASK_ALL_INACTIVE;
+
+ if (full_length_p && integer_all_onesp (mask))
+ return MASK_ALL_ACTIVE;
+ }
+ else if (full_length_p)
+ /* No mask and full length means all active. */
+ return MASK_ALL_ACTIVE;
+
+ /* For VLA vectors, we can't do much more. */
+ if (!nelts.is_constant ())
+ return MASK_UNKNOWN;
+
+ /* Same for VLS vectors with non-constant mask. */
+ if (mask && TREE_CODE (mask) != VECTOR_CST)
+ return MASK_UNKNOWN;
- /* For MASK_LEN_{LOAD,STORE}, we should also check whether
- the mask is all ones mask. */
- if (ifn == IFN_MASK_LEN_LOAD || ifn == IFN_MASK_LEN_STORE)
+ /* Check VLS vector elements. */
+ gcc_assert (wlen.is_constant ());
+
+ HOST_WIDE_INT active_len = wlen.to_constant ().to_shwi ();
+ if (active_len == -1)
+ active_len = nelts.to_constant ();
+
+ /* Check if all elements in the active range match the mask. */
+ for (HOST_WIDE_INT i = 0; i < active_len; i++)
+ {
+ bool elt_active = !mask || !integer_zerop (vector_cst_elt (mask, i));
+ if (!elt_active)
{
- tree mask = gimple_call_arg (call, internal_fn_mask_index (ifn));
- if (!integer_all_onesp (mask))
- return NULL_TREE;
+ /* Found an inactive element. Check if all are inactive. */
+ for (HOST_WIDE_INT j = 0; j < active_len; j++)
+ if (!mask || !integer_zerop (vector_cst_elt (mask, j)))
+ return MASK_UNKNOWN; /* Mixed state. */
+ return MASK_ALL_INACTIVE;
}
}
+ /* All elements in active range are active. */
+ return full_length_p ? MASK_ALL_ACTIVE : MASK_UNKNOWN;
+}
+
+
+/* If IFN_{MASK,LEN,MASK_LEN}_LOAD/STORE call CALL is unconditional
+ (all lanes active), return a MEM_REF for the memory it references.
+ Otherwise return NULL_TREE. VECTYPE is the type of the memory vector. */
+
+static tree
+gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype)
+{
+ /* Only fold if all lanes are active (unconditional). */
+ if (partial_load_store_mask_state (call, vectype) != MASK_ALL_ACTIVE)
+ return NULL_TREE;
+
+ tree ptr = gimple_call_arg (call, 0);
+ tree alias_align = gimple_call_arg (call, 1);
+ if (!tree_fits_uhwi_p (alias_align))
+ return NULL_TREE;
+
unsigned HOST_WIDE_INT align = tree_to_uhwi (alias_align);
if (TYPE_ALIGN (vectype) != align)
vectype = build_aligned_type (vectype, align);
@@ -5808,41 +5870,68 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
return fold_build2 (MEM_REF, vectype, ptr, offset);
}
-/* Try to fold IFN_{MASK,LEN}_LOAD call CALL. Return true on success.
- MASK_P indicates it's for MASK if true, otherwise it's for LEN. */
+/* Try to fold IFN_{MASK,LEN}_LOAD/STORE call CALL. Return true on success. */
static bool
-gimple_fold_partial_load (gimple_stmt_iterator *gsi, gcall *call, bool mask_p)
+gimple_fold_partial_load_store (gimple_stmt_iterator *gsi, gcall *call)
{
+ internal_fn ifn = gimple_call_internal_fn (call);
tree lhs = gimple_call_lhs (call);
- if (!lhs)
- return false;
+ bool is_load = (lhs != NULL_TREE);
+ tree vectype;
- if (tree rhs
- = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (lhs), mask_p))
+ if (is_load)
+ vectype = TREE_TYPE (lhs);
+ else
{
- gassign *new_stmt = gimple_build_assign (lhs, rhs);
- gimple_set_location (new_stmt, gimple_location (call));
- gimple_move_vops (new_stmt, call);
- gsi_replace (gsi, new_stmt, false);
- return true;
+ tree rhs = gimple_call_arg (call, internal_fn_stored_value_index (ifn));
+ vectype = TREE_TYPE (rhs);
}
- return false;
-}
-/* Try to fold IFN_{MASK,LEN}_STORE call CALL. Return true on success.
- MASK_P indicates it's for MASK if true, otherwise it's for LEN. */
+ enum mask_load_store_state state
+ = partial_load_store_mask_state (call, vectype);
-static bool
-gimple_fold_partial_store (gimple_stmt_iterator *gsi, gcall *call,
- bool mask_p)
-{
- internal_fn ifn = gimple_call_internal_fn (call);
- tree rhs = gimple_call_arg (call, internal_fn_stored_value_index (ifn));
- if (tree lhs
- = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (rhs), mask_p))
+ /* Handle all-inactive case. */
+ if (state == MASK_ALL_INACTIVE)
{
- gassign *new_stmt = gimple_build_assign (lhs, rhs);
+ if (is_load)
+ {
+ /* Replace load with else value. */
+ int else_index = internal_fn_else_index (ifn);
+ tree else_value = gimple_call_arg (call, else_index);
+ gassign *new_stmt = gimple_build_assign (lhs, else_value);
+ gimple_set_location (new_stmt, gimple_location (call));
+ gsi_replace (gsi, new_stmt, false);
+ return true;
+ }
+ else
+ {
+ /* Remove inactive store. */
+ unlink_stmt_vdef (call);
+ release_defs (call);
+ gsi_replace (gsi, gimple_build_nop (), true);
+ return true;
+ }
+ }
+
+ /* We cannot simplify a gather/scatter or load/store lanes further. */
+ if (internal_gather_scatter_fn_p (ifn)
+ || TREE_CODE (vectype) == ARRAY_TYPE)
+ return false;
+
+ /* Handle all-active case - fold to regular memory operation. */
+ if (tree mem_ref = gimple_fold_partial_load_store_mem_ref (call, vectype))
+ {
+ gassign *new_stmt;
+ if (is_load)
+ new_stmt = gimple_build_assign (lhs, mem_ref);
+ else
+ {
+ tree rhs
+ = gimple_call_arg (call, internal_fn_stored_value_index (ifn));
+ new_stmt = gimple_build_assign (mem_ref, rhs);
+ }
+
gimple_set_location (new_stmt, gimple_location (call));
gimple_move_vops (new_stmt, call);
gsi_replace (gsi, new_stmt, false);
@@ -6075,19 +6164,21 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace)
cplx_result = true;
uaddc_usubc = true;
break;
- case IFN_MASK_LOAD:
- changed |= gimple_fold_partial_load (gsi, stmt, true);
- break;
- case IFN_MASK_STORE:
- changed |= gimple_fold_partial_store (gsi, stmt, true);
- break;
case IFN_LEN_LOAD:
+ case IFN_MASK_LOAD:
case IFN_MASK_LEN_LOAD:
- changed |= gimple_fold_partial_load (gsi, stmt, false);
- break;
+ case IFN_MASK_GATHER_LOAD:
+ case IFN_MASK_LEN_GATHER_LOAD:
+ case IFN_MASK_LOAD_LANES:
+ case IFN_MASK_LEN_LOAD_LANES:
case IFN_LEN_STORE:
+ case IFN_MASK_STORE:
case IFN_MASK_LEN_STORE:
- changed |= gimple_fold_partial_store (gsi, stmt, false);
+ case IFN_MASK_SCATTER_STORE:
+ case IFN_MASK_LEN_SCATTER_STORE:
+ case IFN_MASK_STORE_LANES:
+ case IFN_MASK_LEN_STORE_LANES:
+ changed |= gimple_fold_partial_load_store (gsi, stmt);
break;
default:
break;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c b/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c
index 1539f58c824..39db13bbcd6 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c
@@ -46,8 +46,5 @@ ALL_DATA (st2, x2_t)
ALL_DATA (st3, x3_t)
ALL_DATA (st4, x4_t)
-/* FIXME: Currently, st1/2/3/4 are not folded with a pfalse
- predicate, which is the reason for the 48 missing cases below. Once
- folding is implemented for these intrinsics, the sum should be 60. */
-/* { dg-final { scan-assembler-times {\t.cfi_startproc\n\tret\n} 12 } } */
+/* { dg-final { scan-assembler-times {\t.cfi_startproc\n\tret\n} 60 } } */
/* { dg-final { scan-assembler-times {\t.cfi_startproc\n} 60 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c b/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c
index 34a2c8eb11b..5dff0d0ceb9 100644
--- a/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c
+++ b/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c
@@ -13,5 +13,5 @@
#include "p9-vec-length-8.h"
-/* { dg-final { scan-assembler-times {\mlxvl\M} 16 } } */
+/* { dg-final { scan-assembler-times {\mlxvl\M} 14 } } */
/* { dg-final { scan-assembler-times {\mstxvl\M} 7 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c
new file mode 100644
index 00000000000..0beb3d70866
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -mno-autovec-segment" } */
+
+typedef struct {
+ int a[6];
+ float b[3];
+} c;
+
+int d(c *e) {
+ int f =0;
+ for (; f < 3; f++) {
+ e->a[2 * f] = e->b[f];
+ e->a[2 * f + 1] = -e->a[2 * f];
+ e->a[2 * f] = f + 3 * e->a[2 * f];
+ e->a[2 * f + 1] = f + 3 * e->a[2 * f + 1];
+ }
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "vsetivli.*zero,0" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c
new file mode 100644
index 00000000000..0de69b52cb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -mno-autovec-segment" } */
+
+typedef struct {
+ int A[6];
+ float b[];
+} a;
+
+int b(a *a) {
+ int b = 0;
+ for (; b < 3; b++) {
+ a->A[2 * b] = a->b[b] - b + a->A[2 * b];
+ a->A[2 * b + 1] = b * a->A[2 * b + 1];
+ }
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "vsetivli.*zero,0" } } */
--
2.51.1