On Tue, 9 Nov 2021, Tamar Christina wrote:
> > > + bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> > > + bitmap_set_bit (exit_bbs, loop->latch->index);
> >
> > treating the latch as exit is probably premature optimization (yes, it's
> > empty).
> >
> > > +
> > > + do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs);
> > > +
> > > + BITMAP_FREE (exit_bbs);
> >
> > ... deallocation can go. Note I wonder whether, if we are already spinning
> > up
> > VN, we should include the preheader in the operation?
> > We regularly end up emitting redundant vector initializers that could be
> > cleaned up earlier this way.
>
> I've change it to include the preheader but it looks like this breaks
> bootstrap on both
> x86 and AArch64.
>
> On x86 the following testcase
>
> double matmul_c8_vanilla_bbase_0;
> double *matmul_c8_vanilla_dest;
> matmul_c8_vanilla_x;
> matmul_c8_vanilla() {
> for (; matmul_c8_vanilla_x; matmul_c8_vanilla_x++)
> matmul_c8_vanilla_dest[matmul_c8_vanilla_x] += matmul_c8_vanilla_bbase_0;
> }
>
> ICEs with -std=gnu11 -ffast-math -ftree-vectorize -O2 with:
>
> internal compiler error: tree check: expected ssa_name, have var_decl in
> SSA_VAL, at tree-ssa-sccvn.c:535
> 0x80731c tree_check_failed(tree_node const*, char const*, int, char const*,
> ...)
> ../gcc-dsg/gcc/tree.c:8689
> 0x7ebda2 tree_check(tree_node*, char const*, int, char const*, tree_code)
> ../gcc-dsg/gcc/tree.h:3433
> 0x7ebda2 SSA_VAL(tree_node*, bool*)
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:535
> 0x7ebda2 vuse_ssa_val
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:553
> 0x7ebda2 vn_reference_lookup(tree_node*, tree_node*, vn_lookup_kind,
> vn_reference_s**, bool, tree_node**, tree_node*)
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:3664
> 0x10d8ca5 visit_reference_op_load
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:5166
> 0x10d8ca5 visit_stmt
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:5615
> 0x10d976c process_bb
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:7344
> 0x10dafe5 do_rpo_vn
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:7942
> 0x10dc828 do_rpo_vn(function*, edge_def*, bitmap_head*)
> ../gcc-dsg/gcc/tree-ssa-sccvn.c:8039
> 0x119c39c vectorize_loops()
> ../gcc-dsg/gcc/tree-vectorizer.c:1304
OK, as I thought this is .MEMs not in SSA form. We're later
fixing that up so maybe try the attached which re-orders the
postprocessing after vectorization to do that earlier.
Richard.
> on AArch64 this one ICEs with -ffast-math -ftree-vectorize -O2
>
> _Complex *a;
> _Complex b;
> c, d;
> fn1() {
> _Complex e;
> for (; c; ++c)
> e = d * a[c];
> b = e;
> }
>
> With the message
>
> internal compiler error: tree check: expected ssa_name, have var_decl in
> VN_INFO, at tree-ssa-sccvn.c:451
> 0x734073 tree_check_failed(tree_node const*, char const*, int, char const*,
> ...)
> ../../gcc-fsf/gcc/tree.c:8691
> 0x10e2e2f tree_check(tree_node*, char const*, int, char const*, tree_code)
> ../../gcc-fsf/gcc/tree.h:3433
> 0x10e2e2f VN_INFO(tree_node*)
> ../../gcc-fsf/gcc/tree-ssa-sccvn.c:451
> 0x10ed223 process_bb
> ../../gcc-fsf/gcc/tree-ssa-sccvn.c:7331
> 0x10eea43 do_rpo_vn
> ../../gcc-fsf/gcc/tree-ssa-sccvn.c:7944
> 0x10efe2b do_rpo_vn(function*, edge_def*, bitmap_head*)
> ../../gcc-fsf/gcc/tree-ssa-sccvn.c:8039
> 0x11c436b vectorize_loops()
> ../../gcc-fsf/gcc/tree-vectorizer.c:1304
>
> Any ideas?
>
> Thanks,
> Tamar
>
> >
> > Otherwise the change looks OK.
> >
>
> --- inline copy of patch ---
>
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index
> edb7538a67f00cd80a608ee82510cf437fe88083..029d59016c9652f87d80fc5500f89532c79a66d0
> 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -81,7 +81,7 @@ along with GCC; see the file COPYING3. If not see
> #include "gimple-pretty-print.h"
> #include "opt-problem.h"
> #include "internal-fn.h"
> -
> +#include "tree-ssa-sccvn.h"
>
> /* Loop or bb location, with hotness information. */
> dump_user_location_t vect_location;
> @@ -1298,6 +1298,17 @@ vectorize_loops (void)
> if (has_mask_store
> && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
> optimize_mask_stores (loop);
> +
> + auto_bitmap exit_bbs;
> + /* Perform local CSE, this esp. helps because we emit code for
> + predicates that need to be shared for optimal predicate usage.
> + However reassoc will re-order them and prevent CSE from working
> + as it should. CSE only the loop body, not the entry. */
> + bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> +
> + edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
> + do_rpo_vn (cfun, entry, exit_bbs);
> +
> loop->aux = NULL;
> }
>
>
--
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imend
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index a2e13acb6d2..78883b053b1 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -81,7 +81,7 @@ along with GCC; see the file COPYING3. If not see
#include "gimple-pretty-print.h"
#include "opt-problem.h"
#include "internal-fn.h"
-
+#include "tree-ssa-sccvn.h"
/* Loop or bb location, with hotness information. */
dump_user_location_t vect_location;
@@ -1278,23 +1278,6 @@ vectorize_loops (void)
}
}
- for (i = 1; i < number_of_loops (cfun); i++)
- {
- loop_vec_info loop_vinfo;
- bool has_mask_store;
-
- loop = get_loop (cfun, i);
- if (!loop || !loop->aux)
- continue;
- loop_vinfo = (loop_vec_info) loop->aux;
- has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
- delete loop_vinfo;
- if (has_mask_store
- && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
- optimize_mask_stores (loop);
- loop->aux = NULL;
- }
-
/* Fold IFN_GOMP_SIMD_{VF,LANE,LAST_LANE,ORDERED_{START,END}} builtins. */
if (cfun->has_simduid_loops)
{
@@ -1302,14 +1285,12 @@ vectorize_loops (void)
/* Avoid stale SCEV cache entries for the SIMD_LANE defs. */
scev_reset ();
}
-
/* Shrink any "omp array simd" temporary arrays to the
actual vectorization factors. */
if (simd_array_to_simduid_htab)
shrink_simd_arrays (simd_array_to_simduid_htab, simduid_to_vf_htab);
delete simduid_to_vf_htab;
cfun->has_simduid_loops = false;
- vect_slp_fini ();
if (num_vectorized_loops > 0)
{
@@ -1317,9 +1298,39 @@ vectorize_loops (void)
??? Also while we try hard to update loop-closed SSA form we fail
to properly do this in some corner-cases (see PR56286). */
rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
- return TODO_cleanup_cfg;
+ ret |= TODO_cleanup_cfg;
}
+ for (i = 1; i < number_of_loops (cfun); i++)
+ {
+ loop_vec_info loop_vinfo;
+ bool has_mask_store;
+
+ loop = get_loop (cfun, i);
+ if (!loop || !loop->aux)
+ continue;
+ loop_vinfo = (loop_vec_info) loop->aux;
+ has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
+ delete loop_vinfo;
+ if (has_mask_store
+ && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
+ optimize_mask_stores (loop);
+
+ auto_bitmap exit_bbs;
+ /* Perform local CSE, this esp. helps because we emit code for
+ predicates that need to be shared for optimal predicate usage.
+ However reassoc will re-order them and prevent CSE from working
+ as it should. CSE only the loop body, not the entry. */
+ bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+
+ edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
+ do_rpo_vn (cfun, entry, exit_bbs);
+
+ loop->aux = NULL;
+ }
+
+ vect_slp_fini ();
+
return ret;
}