On Mon, Jun 9, 2025 at 2:42 PM Hongtao Liu <[email protected]> wrote:
>
> On Tue, Jun 3, 2025 at 2:59 PM H.J. Lu <[email protected]> wrote:
> >
> > Extend the remove_redundant_vector pass to handle vector broadcasts from
> > constant and variable scalars. When broadcasting from constants and
> > function arguments, we can place a single widest vector broadcast at
> > entry of the nearest common dominator for basic blocks with all uses
> > since constants and function arguments aren't changed. For broadcast
> > from variables with a single definition, the single definition is
> > replaced with the widest broadcast.
> >
> > gcc/
> >
> > PR target/92080
> > * config/i386/i386-expand.cc (ix86_expand_call): Set
> > recursive_function to true for recursive call.
> > * config/i386/i386-features.cc (ix86_place_single_vector_set):
> > Add an argument for inner scalar, default to nullptr. Set the
> > source from inner scalar if not nullptr.
> > (ix86_get_vector_load_mode): Add an argument for scalar mode and
> > handle integer and float scalar modes.
> > (replace_vector_const): Add an argument for scalar mode and pass
> > it to ix86_get_vector_load_mode.
> > (redundant_load_kind): New.
> > (redundant_load): Likewise.
> > (ix86_broadcast_inner): Likewise.
> > (remove_redundant_vector_load): Also support const0_rtx and
> > constm1_rtx broadcasts. Handle vector broadcasts from constant
> > and variable scalars.
> > * config/i386/i386.h (machine_function): Add recursive_function.
> >
> > gcc/testsuite/
> >
> > * gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect
> > movdqa instead of pxor.
> > * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
> > * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
> > * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
> > * gcc.target/i386/pr92080-4.c: New test.
> > * gcc.target/i386/pr92080-5.c: Likewise.
> > * gcc.target/i386/pr92080-6.c: Likewise.
> > * gcc.target/i386/pr92080-7.c: Likewise.
> > * gcc.target/i386/pr92080-8.c: Likewise.
> > * gcc.target/i386/pr92080-9.c: Likewise.
> > * gcc.target/i386/pr92080-10.c: Likewise.
> > * gcc.target/i386/pr92080-11.c: Likewise.
> > * gcc.target/i386/pr92080-12.c: Likewise.
> > * gcc.target/i386/pr92080-13.c: Likewise.
> > * gcc.target/i386/pr92080-14.c: Likewise.
> > * gcc.target/i386/pr92080-15.c: Likewise.
> > * gcc.target/i386/pr92080-16.c: Likewise.
>
> >+ machine_mode mode = VOIDmode;
> >+ fixed_size_mode candidate;
> >+ FOR_EACH_MODE_IN_CLASS (mode, vklass)
> >+ if (is_a<fixed_size_mode> (mode, &candidate)
> >+ && GET_MODE_INNER (candidate) == scalar_mode
> >+ && GET_MODE_SIZE (candidate) == size)
> >+ return mode;
> >+
> >+ gcc_unreachable ();
>
> Can we just use default_vectorize_related_mode to get the wanted mode,
> or reuse the code in it?
Changed to
static machine_mode
ix86_get_vector_load_mode (unsigned int size, machine_mode smode)
{
scalar_mode s_mode = as_a <scalar_mode> (smode);
poly_uint64 nunits = size / GET_MODE_SIZE (smode);
machine_mode mode = mode_for_vector (s_mode, nunits).require ();
return mode;
}
> 1591/* The default implementation of TARGET_VECTORIZE_RELATED_MODE.
> */
> 1592
> 1593opt_machine_mode
> 1594default_vectorize_related_mode (machine_mode vector_mode,
> 1595 scalar_mode element_mode,
> 1596 poly_uint64 nunits)
> 1597{
> 1598 machine_mode result_mode;
> 1599 if ((maybe_ne (nunits, 0U)
> 1600 || multiple_p (GET_MODE_SIZE (vector_mode),
> 1601 GET_MODE_SIZE (element_mode), &nunits))
> 1602 && mode_for_vector (element_mode, nunits).exists
> (&result_mode)
> 1603 && VECTOR_MODE_P (result_mode)
> 1604 && targetm.vector_mode_supported_p (result_mode))
> 1605 return result_mode;
> 1606
> 1607 return opt_machine_mode ();
> 1608}
>
> > + else if (CONST_VECTOR_P (op))
> >+ {
> >+ rtx first = XVECEXP (op, 0, 0);
> >+ for (int i = 1; i < nunits; ++i)
> >+ {
> >+ rtx tmp = XVECEXP (op, 0, i);
> >+ /* Vector duplicate value. */
> >+ if (!rtx_equal_p (tmp, first))
> >+ return nullptr;
> >+ }
> >+ if (!CONSTANT_P (first))
> >+ return nullptr;
> Is it really needed? Do we have a case where CONST_VECTOR_P has a
> non-constant component?
Dropped.
> Also I assume that allsame && CONST_VECTOR_P is already handled in
> ix86_expand_vector_init, so do we really need to handle it here?
Yes. This pass handles broadcasts of different vector sizes together, so
that a single widest (e.g. ZMM) register can be shared by XMM, YMM and ZMM uses.
> 17725 /* If all values are identical, broadcast the value. */
> 17726 if (all_same
> 17727 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
> 17728 XVECEXP (vals, 0,
> 0)))
> 17729 return;
>
>
> >+ /* Check if there is a matching redundant vector load. */
> >+ bool matched = false;
> >+ FOR_EACH_VEC_ELT (loads, i, load)
> It's expensive. Can we try with a hash like cse_insn?
Each broadcast has different base element values and modes. It may
belong to a different basic block. I don't think hash works here.
> >+ if (load->val
> >+ && load->kind == kind
> >+ && load->mode == scalar_mode
> >+ /* Since CONST_INT load doesn't need memory, it must
> >+ be in the same basic block if it is in a recursive
> >+ call. */
> This part is a bit tricky. It's used to avoid some regression which I
> guess just exposes some latent issue?
> And I didn't see any reason why recursive_call_p needs to be excluded
> for CONST_INT load and same bb.
It has been changed to
FOR_EACH_VEC_ELT (loads, i, load)
if (load->val
&& load->kind == kind
&& load->mode == scalar_mode
&& (load->bb == bb
|| kind < LOAD_VECTOR
/* Non all 0s/1s vector load must be in the same
basic block if it is in a recursive call. */
|| !recursive_call_p)
&& rtx_equal_p (load->val, val))
> >+ && (!recursive_call_p
> >+ || load->bb == bb
> >+ || !(CONST_INT_P (load->val)
> >+ && load->kind == LOAD_VECTOR))
> >+ && rtx_equal_p (load->val, val))
> >+ {
>
>
Here is the v2 patch. OK for master?
--
H.J.
From 21e8711027293073a99fa2a7f3de2ae838dddace Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <[email protected]>
Date: Fri, 9 May 2025 07:17:07 +0800
Subject: [PATCH v2] x86: Extend the remove_redundant_vector pass
Extend the remove_redundant_vector pass to handle vector broadcasts from
constant and variable scalars. When broadcasting from constants and
function arguments, we can place a single widest vector broadcast at
entry of the nearest common dominator for basic blocks with all uses
since constants and function arguments aren't changed. For broadcast
from variables with a single definition, the single definition is
replaced with the widest broadcast.
gcc/
PR target/92080
* config/i386/i386-expand.cc (ix86_expand_call): Set
recursive_function to true for recursive call.
* config/i386/i386-features.cc (ix86_place_single_vector_set):
Add an argument for inner scalar, default to nullptr. Set the
source from inner scalar if not nullptr.
(ix86_get_vector_load_mode): Add an argument for scalar mode and
handle integer and float scalar modes.
(replace_vector_const): Add an argument for scalar mode and pass
it to ix86_get_vector_load_mode.
(redundant_load_kind): New.
(redundant_load): Likewise.
(ix86_broadcast_inner): Likewise.
(remove_redundant_vector_load): Also support const0_rtx and
constm1_rtx broadcasts. Handle vector broadcasts from constant
and variable scalars.
* config/i386/i386.h (machine_function): Add recursive_function.
gcc/testsuite/
* gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect
movdqa instead of pxor.
* gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
* gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
* gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
* gcc.target/i386/pr92080-4.c: New test.
* gcc.target/i386/pr92080-5.c: Likewise.
* gcc.target/i386/pr92080-6.c: Likewise.
* gcc.target/i386/pr92080-7.c: Likewise.
* gcc.target/i386/pr92080-8.c: Likewise.
* gcc.target/i386/pr92080-9.c: Likewise.
* gcc.target/i386/pr92080-10.c: Likewise.
* gcc.target/i386/pr92080-11.c: Likewise.
* gcc.target/i386/pr92080-12.c: Likewise.
* gcc.target/i386/pr92080-13.c: Likewise.
* gcc.target/i386/pr92080-14.c: Likewise.
* gcc.target/i386/pr92080-15.c: Likewise.
* gcc.target/i386/pr92080-16.c: Likewise.
Signed-off-by: H.J. Lu <[email protected]>
---
gcc/config/i386/i386-expand.cc | 3 +
gcc/config/i386/i386-features.cc | 413 ++++++++++++++----
gcc/config/i386/i386.h | 3 +
.../i386/keylocker-aesdecwide128kl.c | 14 +-
.../i386/keylocker-aesdecwide256kl.c | 14 +-
.../i386/keylocker-aesencwide128kl.c | 14 +-
.../i386/keylocker-aesencwide256kl.c | 14 +-
gcc/testsuite/gcc.target/i386/pr92080-10.c | 13 +
gcc/testsuite/gcc.target/i386/pr92080-11.c | 33 ++
gcc/testsuite/gcc.target/i386/pr92080-12.c | 16 +
gcc/testsuite/gcc.target/i386/pr92080-13.c | 32 ++
gcc/testsuite/gcc.target/i386/pr92080-14.c | 31 ++
gcc/testsuite/gcc.target/i386/pr92080-15.c | 25 ++
gcc/testsuite/gcc.target/i386/pr92080-16.c | 26 ++
gcc/testsuite/gcc.target/i386/pr92080-4.c | 50 +++
gcc/testsuite/gcc.target/i386/pr92080-5.c | 109 +++++
gcc/testsuite/gcc.target/i386/pr92080-6.c | 19 +
gcc/testsuite/gcc.target/i386/pr92080-7.c | 20 +
gcc/testsuite/gcc.target/i386/pr92080-8.c | 16 +
gcc/testsuite/gcc.target/i386/pr92080-9.c | 81 ++++
20 files changed, 826 insertions(+), 120 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-15.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 181e64a86bf..6dda0c93fc2 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10122,6 +10122,9 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
else if (lookup_attribute ("no_callee_saved_registers",
TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
call_no_callee_saved_registers = true;
+ if (fndecl == current_function_decl
+ && decl_binds_to_current_def_p (fndecl))
+ cfun->machine->recursive_function = true;
}
}
else
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index b1682c2fad4..8eff1eb68f5 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3088,10 +3088,12 @@ ix86_rpad_gate ()
/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
for basic block map BBS, which is in the fake loop that contains the
whole function, so that there is only a single vector set in the
- whole function. */
+ whole function. If not nullptr, INNER_SCALAR is the inner scalar of
+ SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */
static void
-ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
+ rtx inner_scalar = nullptr)
{
basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
while (bb->loop_father->latch
@@ -3112,10 +3114,23 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
insn = NEXT_INSN (insn);
}
+ rtx_insn *set_insn;
if (insn == BB_HEAD (bb))
- emit_insn_before (set, insn);
+ set_insn = emit_insn_before (set, insn);
else
- emit_insn_after (set, insn ? PREV_INSN (insn) : BB_END (bb));
+ set_insn = emit_insn_after (set,
+ insn ? PREV_INSN (insn) : BB_END (bb));
+
+ if (inner_scalar)
+ {
+ /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
+ rtx reg = XEXP (src, 0);
+ if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
+ && GET_MODE (reg) != GET_MODE (inner_scalar))
+ inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0);
+ rtx set = gen_rtx_SET (reg, inner_scalar);
+ emit_insn_before (set, set_insn);
+ }
}
/* At entry of the nearest common dominator for basic blocks with
@@ -3346,26 +3361,15 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
return new pass_remove_partial_avx_dependency (ctxt);
}
-/* Return a machine mode suitable for vector SIZE. */
+/* Return a machine mode suitable for vector SIZE with SMODE inner
+ mode. */
static machine_mode
-ix86_get_vector_load_mode (unsigned int size)
+ix86_get_vector_load_mode (unsigned int size, machine_mode smode)
{
- machine_mode mode;
- if (size == 64)
- mode = V64QImode;
- else if (size == 32)
- mode = V32QImode;
- else if (size == 16)
- mode = V16QImode;
- else if (size == 8)
- mode = V8QImode;
- else if (size == 4)
- mode = V4QImode;
- else if (size == 2)
- mode = V2QImode;
- else
- gcc_unreachable ();
+ scalar_mode s_mode = as_a <scalar_mode> (smode);
+ poly_uint64 nunits = size / GET_MODE_SIZE (smode);
+ machine_mode mode = mode_for_vector (s_mode, nunits).require ();
return mode;
}
@@ -3374,7 +3378,8 @@ ix86_get_vector_load_mode (unsigned int size)
static void
replace_vector_const (machine_mode vector_mode, rtx vector_const,
- auto_bitmap &vector_insns)
+ auto_bitmap &vector_insns,
+ machine_mode scalar_mode)
{
bitmap_iterator bi;
unsigned int id;
@@ -3386,7 +3391,8 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
/* Get the single SET instruction. */
rtx set = single_set (insn);
rtx src = SET_SRC (set);
- machine_mode mode = GET_MODE (src);
+ rtx dest = SET_DEST (set);
+ machine_mode mode = GET_MODE (dest);
rtx replace;
/* Replace the source operand with VECTOR_CONST. */
@@ -3400,7 +3406,8 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
/* If the mode size is smaller than its natural size,
first insert an extra move with a QI vector SUBREG
of the same size to avoid validate_subreg failure. */
- machine_mode vmode = ix86_get_vector_load_mode (size);
+ machine_mode vmode
+ = ix86_get_vector_load_mode (size, scalar_mode);
rtx vreg;
if (mode == vmode)
vreg = vector_const;
@@ -3426,6 +3433,172 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
}
}
+enum redundant_load_kind
+{
+ LOAD_CONST0_VECTOR,
+ LOAD_CONSTM1_VECTOR,
+ LOAD_VECTOR
+};
+
+struct redundant_load
+{
+ /* Bitmap of basic blocks with broadcast instructions. */
+ auto_bitmap bbs;
+ /* Bitmap of broadcast instructions. */
+ auto_bitmap insns;
+ /* The broadcast inner scalar. */
+ rtx val;
+ /* The inner scalar mode. */
+ machine_mode mode;
+ /* The instruction which sets the inner scalar. Nullptr if the inner
+ scalar is applied to the whole function, instead of within the same
+ block. */
+ rtx_insn *def_insn;
+ /* The widest broadcast source. */
+ rtx broadcast_source;
+ /* The widest broadcast register. */
+ rtx broadcast_reg;
+ /* The basic block of the broadcast instruction. */
+ basic_block bb;
+ /* The number of broadcast instructions with the same inner scalar. */
+ unsigned HOST_WIDE_INT count;
+ /* The threshold of broadcast instructions with the same inner
+ scalar. */
+ unsigned int threshold;
+ /* The widest broadcast size in bytes. */
+ unsigned int size;
+ /* Load kind. */
+ redundant_load_kind kind;
+};
+
+/* Return the inner scalar if OP is a broadcast, else return nullptr. */
+
+static rtx
+ix86_broadcast_inner (rtx op, machine_mode mode,
+ machine_mode *scalar_mode_p,
+ redundant_load_kind *kind_p, rtx_insn **insn_p)
+{
+ if (op == const0_rtx || op == CONST0_RTX (mode))
+ {
+ *scalar_mode_p = QImode;
+ *kind_p = LOAD_CONST0_VECTOR;
+ *insn_p = nullptr;
+ return const0_rtx;
+ }
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && (op == constm1_rtx || op == CONSTM1_RTX (mode)))
+ {
+ *scalar_mode_p = QImode;
+ *kind_p = LOAD_CONSTM1_VECTOR;
+ *insn_p = nullptr;
+ return constm1_rtx;
+ }
+
+ mode = GET_MODE (op);
+ int nunits = GET_MODE_NUNITS (mode);
+ if (nunits < 2)
+ return nullptr;
+
+ *kind_p = LOAD_VECTOR;
+
+ rtx reg;
+ if (GET_CODE (op) == VEC_DUPLICATE)
+ {
+ /* Only
+ (vec_duplicate:V4SI (reg:SI 99))
+ (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0 S8 A64]))
+ are supported. */
+ op = XEXP (op, 0);
+ reg = op;
+ if (SUBREG_P (op))
+ reg = SUBREG_REG (op);
+ if (!REG_P (reg))
+ {
+ if (MEM_P (op)
+ && SYMBOL_REF_P (XEXP (op, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
+ {
+ /* Handle constant broadcast from memory. */
+ *scalar_mode_p = GET_MODE_INNER (mode);
+ *insn_p = nullptr;
+ return op;
+ }
+ return nullptr;
+ }
+ }
+ else if (CONST_VECTOR_P (op))
+ {
+ rtx first = XVECEXP (op, 0, 0);
+ for (int i = 1; i < nunits; ++i)
+ {
+ rtx tmp = XVECEXP (op, 0, i);
+ /* Vector duplicate value. */
+ if (!rtx_equal_p (tmp, first))
+ return nullptr;
+ }
+ *scalar_mode_p = GET_MODE (first);
+ *insn_p = nullptr;
+ return first;
+ }
+ else
+ return nullptr;
+
+ mode = GET_MODE (op);
+
+ /* Only single def chain is supported. */
+ df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
+ if (!ref || DF_REF_NEXT_REG (ref) != nullptr)
+ return nullptr;
+
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ {
+ *scalar_mode_p = mode;
+ *insn_p = nullptr;
+ return op;
+ }
+
+ rtx_insn *insn = DF_REF_INSN (ref);
+ rtx set = single_set (insn);
+ if (!set)
+ return nullptr;
+
+ rtx dest = SET_DEST (set);
+
+ op = SET_SRC (set);
+ /* Set *INSN_P if the scalar source is neither a constant nor an
+ incoming argument. */
+ if (CONST_INT_P (op) || CONST_DOUBLE_P (op))
+ *insn_p = nullptr;
+ else if (REG_P (op)
+ && REG_EXPR (op)
+ && TREE_CODE (REG_EXPR (op)) == PARM_DECL)
+ *insn_p = nullptr;
+ else if (MEM_P (op)
+ && MEM_EXPR (op)
+ && TREE_CODE (get_base_address (MEM_EXPR (op))) == PARM_DECL)
+ *insn_p = nullptr;
+ else
+ {
+ while (SUBREG_P (dest))
+ dest = SUBREG_REG (dest);
+
+ /* Skip if the SET destination mode doesn't match. */
+ if (GET_MODE (dest) != mode)
+ return nullptr;
+
+ /* Set the inner scalar to the SET destination. */
+ op = dest;
+ *insn_p = insn;
+ }
+
+ *scalar_mode_p = mode;
+ if (CONSTANT_P (op))
+ *insn_p = nullptr;
+ else
+ *insn_p = insn;
+ return op;
+}
+
/* At entry of the nearest common dominator for basic blocks with vector
CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
@@ -3440,20 +3613,16 @@ remove_redundant_vector_load (void)
{
timevar_push (TV_MACH_DEP);
- auto_bitmap zero_bbs;
- auto_bitmap m1_bbs;
- auto_bitmap zero_insns;
- auto_bitmap m1_insns;
-
+ auto_vec<redundant_load *> loads;
+ redundant_load *load;
basic_block bb;
rtx_insn *insn;
- unsigned HOST_WIDE_INT zero_count = 0;
- unsigned HOST_WIDE_INT m1_count = 0;
- unsigned int zero_size = 0;
- unsigned int m1_size = 0;
+ unsigned int i;
df_set_flags (DF_DEFER_INSN_RESCAN);
+ bool recursive_call_p = cfun->machine->recursive_function;
+
FOR_EACH_BB_FN (bb, cfun)
{
FOR_BB_INSNS (bb, insn)
@@ -3481,79 +3650,139 @@ remove_redundant_vector_load (void)
if (!REG_P (dest) && !SUBREG_P (dest))
continue;
- if (src == CONST0_RTX (mode))
- {
- /* Record vector instruction with CONST0_RTX. */
- bitmap_set_bit (zero_insns, INSN_UID (insn));
+ rtx_insn *def_insn;
+ machine_mode scalar_mode;
+ redundant_load_kind kind;
+ rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
+ &kind, &def_insn);
+ if (!val)
+ continue;
- /* Record the maximum vector size. */
- if (zero_size < GET_MODE_SIZE (mode))
- zero_size = GET_MODE_SIZE (mode);
+ /* Remove redundant register loads only if at least 2 such
+ loads will be used. */
+ unsigned int threshold = 2;
+
+ /* Check if there is a matching redundant vector load. */
+ bool matched = false;
+ FOR_EACH_VEC_ELT (loads, i, load)
+ if (load->val
+ && load->kind == kind
+ && load->mode == scalar_mode
+ && (load->bb == bb
+ || kind < LOAD_VECTOR
+ /* Non all 0s/1s vector load must be in the same
+ basic block if it is in a recursive call. */
+ || !recursive_call_p)
+ && rtx_equal_p (load->val, val))
+ {
+ /* Record vector instruction. */
+ bitmap_set_bit (load->insns, INSN_UID (insn));
- /* Record the basic block with CONST0_RTX. */
- bitmap_set_bit (zero_bbs, bb->index);
- zero_count++;
- }
- else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
- && src == CONSTM1_RTX (mode))
- {
- /* Record vector instruction with CONSTM1_RTX. */
- bitmap_set_bit (m1_insns, INSN_UID (insn));
+ /* Record the maximum vector size. */
+ if (load->size < GET_MODE_SIZE (mode))
+ load->size = GET_MODE_SIZE (mode);
- /* Record the maximum vector size. */
- if (m1_size < GET_MODE_SIZE (mode))
- m1_size = GET_MODE_SIZE (mode);
+ /* Record the basic block. */
+ bitmap_set_bit (load->bbs, bb->index);
+ load->count++;
+ matched = true;
+ break;
+ }
- /* Record the basic block with CONSTM1_RTX. */
- bitmap_set_bit (m1_bbs, bb->index);
- m1_count++;
- }
- }
- }
+ if (matched)
+ continue;
- if (zero_count > 1 || m1_count > 1)
- {
- machine_mode zero_mode, m1_mode;
- rtx vector_const0, vector_constm1;
+ /* We see this vector broadcast the first time. */
+ load = new redundant_load;
- if (zero_count > 1)
- {
- zero_mode = ix86_get_vector_load_mode (zero_size);
- vector_const0 = gen_reg_rtx (zero_mode);
- replace_vector_const (zero_mode, vector_const0, zero_insns);
- }
- else
- {
- zero_mode = VOIDmode;
- vector_const0 = nullptr;
- }
+ load->val = copy_rtx (val);
+ load->mode = scalar_mode;
+ load->size = GET_MODE_SIZE (mode);
+ load->def_insn = def_insn;
+ load->count = 1;
+ load->threshold = threshold;
+ load->bb = BLOCK_FOR_INSN (insn);
+ load->kind = kind;
- if (m1_count > 1)
- {
- m1_mode = ix86_get_vector_load_mode (m1_size);
- vector_constm1 = gen_reg_rtx (m1_mode);
- replace_vector_const (m1_mode, vector_constm1, m1_insns);
- }
- else
- {
- m1_mode = VOIDmode;
- vector_constm1 = nullptr;
+ bitmap_set_bit (load->insns, INSN_UID (insn));
+ bitmap_set_bit (load->bbs, bb->index);
+
+ loads.safe_push (load);
}
+ }
+
+ bool replaced = false;
+ rtx reg, broadcast_source, broadcast_reg;
+ FOR_EACH_VEC_ELT (loads, i, load)
+ if (load->count >= load->threshold)
+ {
+ machine_mode mode
+ = ix86_get_vector_load_mode (load->size, load->mode);
+ broadcast_reg = gen_reg_rtx (mode);
+ if (load->def_insn)
+ {
+ /* Replace redundant vector loads with a single vector load
+ in the same basic block. */
+ reg = load->val;
+ if (load->mode != GET_MODE (reg))
+ reg = gen_rtx_SUBREG (load->mode, reg, 0);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ replace_vector_const (mode, broadcast_reg, load->insns,
+ load->mode);
+ }
+ else
+ {
+ /* This is a constant integer/double vector. If the
+ inner scalar is 0 or -1, set vector to CONST0_RTX
+ or CONSTM1_RTX directly. */
+ rtx reg;
+ switch (load->kind)
+ {
+ case LOAD_CONST0_VECTOR:
+ broadcast_source = CONST0_RTX (mode);
+ break;
+ case LOAD_CONSTM1_VECTOR:
+ broadcast_source = CONSTM1_RTX (mode);
+ break;
+ default:
+ reg = gen_reg_rtx (load->mode);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ break;
+ }
+ replace_vector_const (mode, broadcast_reg, load->insns,
+ load->mode);
+ }
+ load->broadcast_source = broadcast_source;
+ load->broadcast_reg = broadcast_reg;
+ replaced = true;
+ }
+ if (replaced)
+ {
/* (Re-)discover loops so that bb->loop_father can be used in the
analysis below. */
calculate_dominance_info (CDI_DOMINATORS);
loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
- if (vector_const0)
- ix86_place_single_vector_set (vector_const0,
- CONST0_RTX (zero_mode),
- zero_bbs);
-
- if (vector_constm1)
- ix86_place_single_vector_set (vector_constm1,
- CONSTM1_RTX (m1_mode),
- m1_bbs);
+ FOR_EACH_VEC_ELT (loads, i, load)
+ if (load->count >= load->threshold)
+ {
+ if (load->def_insn)
+ {
+ /* Insert a broadcast after the original scalar
+ definition. */
+ rtx set = gen_rtx_SET (load->broadcast_reg,
+ load->broadcast_source);
+ insn = emit_insn_after (set, load->def_insn);
+ }
+ else
+ ix86_place_single_vector_set (load->broadcast_reg,
+ load->broadcast_source,
+ load->bbs,
+ (load->kind >= LOAD_VECTOR
+ ? load->val
+ : nullptr));
+ }
loop_optimizer_finalize ();
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d32d9ad997e..59881723a8d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2920,6 +2920,9 @@ struct GTY(()) machine_function {
/* True if inline asm with redzone clobber has been seen. */
BOOL_BITFIELD asm_redzone_clobber_seen : 1;
+ /* True if this is a recursive function. */
+ BOOL_BITFIELD recursive_function : 1;
+
/* The largest alignment, in bytes, of stack slot actually used. */
unsigned int max_used_stack_alignment;
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
index 93806e51508..e73ba35ddd1 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
index f9ccc82c7ca..33cd998bfdf 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
index c0fcd28fb07..75106e59b77 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
index 31463a8b2da..2787732229a 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-10.c b/gcc/testsuite/gcc.target/i386/pr92080-10.c
new file mode 100644
index 00000000000..b67f9d8d285
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-10.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sapphirerapids -Ofast" } */
+/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
+
+extern short write_picture_p_Vid_0;
+extern unsigned short *write_picture_p_2_0_0;
+extern int write_picture_p_0, write_picture_p_1, write_picture_i;
+void write_picture() {
+ unsigned short cr_val = 1 << write_picture_p_Vid_0;
+ for (; write_picture_p_1;)
+ for (; write_picture_i < write_picture_p_0; write_picture_i++)
+ write_picture_p_2_0_0[write_picture_i] = cr_val;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-11.c b/gcc/testsuite/gcc.target/i386/pr92080-11.c
new file mode 100644
index 00000000000..8747fc47640
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-11.c
@@ -0,0 +1,33 @@
+/* { dg-do run { target { avx512f_runtime } } } */
+/* { dg-options "-mavx512f -mtune=icelake-server -O3" } */
+
+struct s {
+ char s[sizeof(long double)];
+};
+
+union u {
+ long double d;
+ struct s s;
+};
+
+int main()
+{
+ union u x = {0};
+#if __SIZEOF_LONG_DOUBLE__ == 16
+ x.s = (struct s){"xxxxxxxxxxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 12
+ x.s = (struct s){"xxxxxxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 8
+ x.s = (struct s){"xxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 4
+ x.s = (struct s){"xxxx"};
+#endif
+
+ union u y = x;
+
+ for (unsigned char *p = (unsigned char *)&y + sizeof y;
+ p-- > (unsigned char *)&y;)
+ if (*p != (unsigned char)'x')
+ __builtin_abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-12.c b/gcc/testsuite/gcc.target/i386/pr92080-12.c
new file mode 100644
index 00000000000..cb09eb2f0a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-12.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mno-mmx -march=icelake-server" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+signed char a;
+signed char f (int i, int j)
+{
+ signed char c;
+ while (i != 0)
+ {
+ a ^= j;
+ ++c;
+ ++i;
+ }
+ return c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-13.c b/gcc/testsuite/gcc.target/i386/pr92080-13.c
new file mode 100644
index 00000000000..24b7616c894
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-13.c
@@ -0,0 +1,32 @@
+/* { dg-do run { target { avx512f_runtime } } } */
+/* { dg-options "-mavx512f -mtune=icelake-server -O2 -save-temps" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */
+
+#include <assert.h>
+
+#define CONTAINER_KIND union
+
+typedef CONTAINER_KIND container { int value; } container;
+
+void move(container* end, container* start) {
+ container* p;
+ for (p = end; p > start; p--) {
+ (p)->value = (p-1)->value;
+ }
+}
+
+#define N 100
+
+int main(int argc, char* argv[]) {
+ container vals[N];
+ int i;
+ for (i=0; i<N; i++) {
+ vals[i].value = argc + i;
+ }
+ move(&vals[N-1], &vals[0]);
+ assert(vals[0].value == argc + 0);
+ for (i=1; i<N; i++) {
+ assert(vals[i].value == argc + i - 1);
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-14.c b/gcc/testsuite/gcc.target/i386/pr92080-14.c
new file mode 100644
index 00000000000..6be41b63400
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-14.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+extern v16si sinksz;
+extern v8si sinksy;
+extern v4si sinksx;
+extern v4si sinksx1;
+
+extern void bar (void);
+
+void
+foo (char c, int i)
+{
+ sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ if (i == 1)
+ {
+ sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+ bar ();
+ }
+ else if (i == 2)
+ {
+ sinksx = __extension__(v4si){c,c,c,c};
+ bar ();
+ }
+ sinksx1 = __extension__(v4si){c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-15.c b/gcc/testsuite/gcc.target/i386/pr92080-15.c
new file mode 100644
index 00000000000..fa55d82e48e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-15.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+
+extern v4si *s1;
+extern v8si *s2;
+extern v16si *s3;
+
+int
+foo (int i, int j)
+{
+ if (j == 1)
+ s1[i] = __extension__(v4si){34, 34, 34, 34};
+ else if (i == 1)
+ s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+ if ((i + j) == 1234)
+ i = foo (j, i);
+ s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ return i - j;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-16.c b/gcc/testsuite/gcc.target/i386/pr92080-16.c
new file mode 100644
index 00000000000..c8ab084b714
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-16.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+
+extern v4si *s1;
+extern v8si *s2;
+extern v16si *s3;
+
+int
+foo (int i, int j)
+{
+ if (j == 1)
+ {
+ s1[i] = __extension__(v4si){34, 34, 34, 34};
+ s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+ s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ }
+ if ((i + j) == 1234)
+ i = foo (j, i);
+ return i - j;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-4.c b/gcc/testsuite/gcc.target/i386/pr92080-4.c
new file mode 100644
index 00000000000..ebe1384c691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-4.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+typedef short v32hi __attribute__((vector_size(64)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+extern v16si sinksz;
+extern v8si sinksy;
+extern v4si sinksx;
+extern v32hi sinkhz;
+extern v16hi sinkhy;
+extern v8hi sinkhx;
+extern v64qi sinkbz;
+extern v32qi sinkby;
+extern v16qi sinkbx;
+
+void foo(char c) {
+ sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+ sinksx = __extension__(v4si){c,c,c,c};
+}
+
+void foo1(char c) {
+ sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
+}
+
+void foo2(char c) {
+ sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-5.c b/gcc/testsuite/gcc.target/i386/pr92080-5.c
new file mode 100644
index 00000000000..380cd337e09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-5.c
@@ -0,0 +1,109 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastsd" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss" 1 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v16sf __attribute__((vector_size(64)));
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+typedef double v8df __attribute__((vector_size(64)));
+
+extern v16qi b1;
+extern v8hi h1;
+extern v4si s1;
+extern v2di l1;
+extern v4sf f1;
+extern v2df d1;
+extern v32qi b2;
+extern v16hi h2;
+extern v8si s2;
+extern v4di l2;
+extern v8sf f2;
+extern v4df d2;
+extern v64qi b3;
+extern v32hi h3;
+extern v16si s3;
+extern v8di l3;
+extern v16sf f3;
+extern v8df d3;
+
+void
+foo1 ()
+{
+ b1 = __extension__(v16qi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ b2 = __extension__(v32qi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ b3 = __extension__(v64qi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo2 ()
+{
+ h1 = __extension__(v8hi){34, 34, 34, 34, 34, 34, 34, 34};
+ h2 = __extension__(v16hi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ h3 = __extension__(v32hi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo3 ()
+{
+ s1 = __extension__(v4si){34, 34, 34, 34};
+ s2 = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+ s3 = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo4 ()
+{
+ l1 = __extension__(v2di){34, 34};
+ l2 = __extension__(v4di){34, 34, 34, 34};
+ l3 = __extension__(v8di){34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo5 ()
+{
+ f1 = __extension__(v4sf){34, 34, 34, 34};
+ f2 = __extension__(v8sf){34, 34, 34, 34, 34, 34, 34, 34};
+ f3 = __extension__(v16sf){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo6 ()
+{
+ d1 = __extension__(v2df){34, 34};
+ d2 = __extension__(v4df){34, 34, 34, 34};
+ d3 = __extension__(v8df){34, 34, 34, 34, 34, 34, 34, 34};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-6.c b/gcc/testsuite/gcc.target/i386/pr92080-6.c
new file mode 100644
index 00000000000..e4cdbee55be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-6.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+#include <immintrin.h>
+
+extern __m512i sinkz;
+extern __m256i sinky;
+extern char f;
+
+void
+foo(char c, int x)
+{
+ c += f;
+ sinkz = _mm512_set1_epi8(c);
+ if (x == 2)
+ f += 3;
+ sinky = _mm256_set1_epi8(c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-7.c b/gcc/testsuite/gcc.target/i386/pr92080-7.c
new file mode 100644
index 00000000000..8691684e96b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-7.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+#include <immintrin.h>
+
+extern __m512i sinkz;
+extern __m256i sinky;
+extern char f;
+extern void bar (void);
+
+void
+foo(char c, int x)
+{
+ c += f;
+ sinkz = _mm512_set1_epi8(c);
+ if (x == 2)
+ bar ();
+ sinky = _mm256_set1_epi8(c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-8.c b/gcc/testsuite/gcc.target/i386/pr92080-8.c
new file mode 100644
index 00000000000..7ebb62cea75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long int v2di __attribute__((vector_size(16)));
+extern v4si s;
+extern v2di l;
+
+void
+foo(void)
+{
+ l = __extension__(v2di){2,2};
+ s = __extension__(v4si){2,2,2,2};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-9.c b/gcc/testsuite/gcc.target/i386/pr92080-9.c
new file mode 100644
index 00000000000..f44ab563f54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-9.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]+" 8 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]+" 3 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[\\t \]+" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long int v2di __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+
+extern v16qi b1;
+extern v8hi h1;
+extern v4si s1;
+extern v2di l1;
+extern v32qi b2;
+extern v16hi h2;
+extern v8si s2;
+extern v4di l2;
+extern v64qi b3;
+extern v32hi h3;
+extern v16si s3;
+extern v8di l3;
+
+void
+foo(void)
+{
+ b1 = __extension__(v16qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+ h1 = __extension__(v8hi){0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222};
+ s1 = __extension__(v4si){0x22222222,0x22222222,0x22222222,0x22222222};
+ l1 = __extension__(v2di){0x2222222222222222ULL,0x2222222222222222ULL};
+ b2 = __extension__(v32qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+ h2 = __extension__(v16hi){0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222};
+ s2 = __extension__(v8si){0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222};
+ l2 = __extension__(v4di){0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL};
+ b3 = __extension__(v64qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+ h3 = __extension__(v32hi){0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222};
+ s3 = __extension__(v16si){0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222};
+ l3 = __extension__(v8di){0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL};
+}
--
2.49.0
From 21e8711027293073a99fa2a7f3de2ae838dddace Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <[email protected]>
Date: Fri, 9 May 2025 07:17:07 +0800
Subject: [PATCH v2] x86: Extend the remove_redundant_vector pass
Extend the remove_redundant_vector pass to handle vector broadcasts from
constant and variable scalars. When broadcasting from constants and
function arguments, we can place a single widest vector broadcast at
entry of the nearest common dominator for basic blocks with all uses
since constants and function arguments aren't changed. For broadcast
from variables with a single definition, the single definition is
replaced with the widest broadcast.
gcc/
PR target/92080
* config/i386/i386-expand.cc (ix86_expand_call): Set
recursive_function to true for recursive call.
* config/i386/i386-features.cc (ix86_place_single_vector_set):
Add an argument for inner scalar, default to nullptr. Set the
source from inner scalar if not nullptr.
(ix86_get_vector_load_mode): Add an argument for scalar mode and
handle integer and float scalar modes.
(replace_vector_const): Add an argument for scalar mode and pass
it to ix86_get_vector_load_mode.
(redundant_load_kind): New.
(redundant_load): Likewise.
(ix86_broadcast_inner): Likewise.
(remove_redundant_vector_load): Also support const0_rtx and
constm1_rtx broadcasts. Handle vector broadcasts from constant
and variable scalars.
* config/i386/i386.h (machine_function): Add recursive_function.
gcc/testsuite/
* gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect
movdqa instead of pxor.
* gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
* gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
* gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
* gcc.target/i386/pr92080-4.c: New test.
* gcc.target/i386/pr92080-5.c: Likewise.
* gcc.target/i386/pr92080-6.c: Likewise.
* gcc.target/i386/pr92080-7.c: Likewise.
* gcc.target/i386/pr92080-8.c: Likewise.
* gcc.target/i386/pr92080-9.c: Likewise.
* gcc.target/i386/pr92080-10.c: Likewise.
* gcc.target/i386/pr92080-11.c: Likewise.
* gcc.target/i386/pr92080-12.c: Likewise.
* gcc.target/i386/pr92080-13.c: Likewise.
* gcc.target/i386/pr92080-14.c: Likewise.
* gcc.target/i386/pr92080-15.c: Likewise.
* gcc.target/i386/pr92080-16.c: Likewise.
Signed-off-by: H.J. Lu <[email protected]>
---
gcc/config/i386/i386-expand.cc | 3 +
gcc/config/i386/i386-features.cc | 413 ++++++++++++++----
gcc/config/i386/i386.h | 3 +
.../i386/keylocker-aesdecwide128kl.c | 14 +-
.../i386/keylocker-aesdecwide256kl.c | 14 +-
.../i386/keylocker-aesencwide128kl.c | 14 +-
.../i386/keylocker-aesencwide256kl.c | 14 +-
gcc/testsuite/gcc.target/i386/pr92080-10.c | 13 +
gcc/testsuite/gcc.target/i386/pr92080-11.c | 33 ++
gcc/testsuite/gcc.target/i386/pr92080-12.c | 16 +
gcc/testsuite/gcc.target/i386/pr92080-13.c | 32 ++
gcc/testsuite/gcc.target/i386/pr92080-14.c | 31 ++
gcc/testsuite/gcc.target/i386/pr92080-15.c | 25 ++
gcc/testsuite/gcc.target/i386/pr92080-16.c | 26 ++
gcc/testsuite/gcc.target/i386/pr92080-4.c | 50 +++
gcc/testsuite/gcc.target/i386/pr92080-5.c | 109 +++++
gcc/testsuite/gcc.target/i386/pr92080-6.c | 19 +
gcc/testsuite/gcc.target/i386/pr92080-7.c | 20 +
gcc/testsuite/gcc.target/i386/pr92080-8.c | 16 +
gcc/testsuite/gcc.target/i386/pr92080-9.c | 81 ++++
20 files changed, 826 insertions(+), 120 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-15.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 181e64a86bf..6dda0c93fc2 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10122,6 +10122,9 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
else if (lookup_attribute ("no_callee_saved_registers",
TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
call_no_callee_saved_registers = true;
+ if (fndecl == current_function_decl
+ && decl_binds_to_current_def_p (fndecl))
+ cfun->machine->recursive_function = true;
}
}
else
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index b1682c2fad4..8eff1eb68f5 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3088,10 +3088,12 @@ ix86_rpad_gate ()
/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
for basic block map BBS, which is in the fake loop that contains the
whole function, so that there is only a single vector set in the
- whole function. */
+ whole function. If not nullptr, INNER_SCALAR is the inner scalar of
+ SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */
static void
-ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
+ rtx inner_scalar = nullptr)
{
basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
while (bb->loop_father->latch
@@ -3112,10 +3114,23 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
insn = NEXT_INSN (insn);
}
+ rtx_insn *set_insn;
if (insn == BB_HEAD (bb))
- emit_insn_before (set, insn);
+ set_insn = emit_insn_before (set, insn);
else
- emit_insn_after (set, insn ? PREV_INSN (insn) : BB_END (bb));
+ set_insn = emit_insn_after (set,
+ insn ? PREV_INSN (insn) : BB_END (bb));
+
+ if (inner_scalar)
+ {
+ /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
+ rtx reg = XEXP (src, 0);
+ if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
+ && GET_MODE (reg) != GET_MODE (inner_scalar))
+ inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0);
+ rtx set = gen_rtx_SET (reg, inner_scalar);
+ emit_insn_before (set, set_insn);
+ }
}
/* At entry of the nearest common dominator for basic blocks with
@@ -3346,26 +3361,15 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
return new pass_remove_partial_avx_dependency (ctxt);
}
-/* Return a machine mode suitable for vector SIZE. */
+/* Return a machine mode suitable for vector SIZE with SMODE inner
+ mode. */
static machine_mode
-ix86_get_vector_load_mode (unsigned int size)
+ix86_get_vector_load_mode (unsigned int size, machine_mode smode)
{
- machine_mode mode;
- if (size == 64)
- mode = V64QImode;
- else if (size == 32)
- mode = V32QImode;
- else if (size == 16)
- mode = V16QImode;
- else if (size == 8)
- mode = V8QImode;
- else if (size == 4)
- mode = V4QImode;
- else if (size == 2)
- mode = V2QImode;
- else
- gcc_unreachable ();
+ scalar_mode s_mode = as_a <scalar_mode> (smode);
+ poly_uint64 nunits = size / GET_MODE_SIZE (smode);
+ machine_mode mode = mode_for_vector (s_mode, nunits).require ();
return mode;
}
@@ -3374,7 +3378,8 @@ ix86_get_vector_load_mode (unsigned int size)
static void
replace_vector_const (machine_mode vector_mode, rtx vector_const,
- auto_bitmap &vector_insns)
+ auto_bitmap &vector_insns,
+ machine_mode scalar_mode)
{
bitmap_iterator bi;
unsigned int id;
@@ -3386,7 +3391,8 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
/* Get the single SET instruction. */
rtx set = single_set (insn);
rtx src = SET_SRC (set);
- machine_mode mode = GET_MODE (src);
+ rtx dest = SET_DEST (set);
+ machine_mode mode = GET_MODE (dest);
rtx replace;
/* Replace the source operand with VECTOR_CONST. */
@@ -3400,7 +3406,8 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
/* If the mode size is smaller than its natural size,
first insert an extra move with a QI vector SUBREG
of the same size to avoid validate_subreg failure. */
- machine_mode vmode = ix86_get_vector_load_mode (size);
+ machine_mode vmode
+ = ix86_get_vector_load_mode (size, scalar_mode);
rtx vreg;
if (mode == vmode)
vreg = vector_const;
@@ -3426,6 +3433,172 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
}
}
+enum redundant_load_kind
+{
+ LOAD_CONST0_VECTOR,
+ LOAD_CONSTM1_VECTOR,
+ LOAD_VECTOR
+};
+
+struct redundant_load
+{
+ /* Bitmap of basic blocks with broadcast instructions. */
+ auto_bitmap bbs;
+ /* Bitmap of broadcast instructions. */
+ auto_bitmap insns;
+ /* The broadcast inner scalar. */
+ rtx val;
+ /* The inner scalar mode. */
+ machine_mode mode;
+ /* The instruction which sets the inner scalar. Nullptr if the inner
+ scalar is applied to the whole function, instead of within the same
+ block. */
+ rtx_insn *def_insn;
+ /* The widest broadcast source. */
+ rtx broadcast_source;
+ /* The widest broadcast register. */
+ rtx broadcast_reg;
+ /* The basic block of the broadcast instruction. */
+ basic_block bb;
+ /* The number of broadcast instructions with the same inner scalar. */
+ unsigned HOST_WIDE_INT count;
+ /* The threshold of broadcast instructions with the same inner
+ scalar. */
+ unsigned int threshold;
+ /* The widest broadcast size in bytes. */
+ unsigned int size;
+ /* Load kind. */
+ redundant_load_kind kind;
+};
+
+/* Return the inner scalar if OP is a broadcast, else return nullptr. */
+
+static rtx
+ix86_broadcast_inner (rtx op, machine_mode mode,
+ machine_mode *scalar_mode_p,
+ redundant_load_kind *kind_p, rtx_insn **insn_p)
+{
+ if (op == const0_rtx || op == CONST0_RTX (mode))
+ {
+ *scalar_mode_p = QImode;
+ *kind_p = LOAD_CONST0_VECTOR;
+ *insn_p = nullptr;
+ return const0_rtx;
+ }
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && (op == constm1_rtx || op == CONSTM1_RTX (mode)))
+ {
+ *scalar_mode_p = QImode;
+ *kind_p = LOAD_CONSTM1_VECTOR;
+ *insn_p = nullptr;
+ return constm1_rtx;
+ }
+
+ mode = GET_MODE (op);
+ int nunits = GET_MODE_NUNITS (mode);
+ if (nunits < 2)
+ return nullptr;
+
+ *kind_p = LOAD_VECTOR;
+
+ rtx reg;
+ if (GET_CODE (op) == VEC_DUPLICATE)
+ {
+ /* Only
+ (vec_duplicate:V4SI (reg:SI 99))
+ (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0 S8 A64]))
+ are supported. */
+ op = XEXP (op, 0);
+ reg = op;
+ if (SUBREG_P (op))
+ reg = SUBREG_REG (op);
+ if (!REG_P (reg))
+ {
+ if (MEM_P (op)
+ && SYMBOL_REF_P (XEXP (op, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
+ {
+ /* Handle constant broadcast from memory. */
+ *scalar_mode_p = GET_MODE_INNER (mode);
+ *insn_p = nullptr;
+ return op;
+ }
+ return nullptr;
+ }
+ }
+ else if (CONST_VECTOR_P (op))
+ {
+ rtx first = XVECEXP (op, 0, 0);
+ for (int i = 1; i < nunits; ++i)
+ {
+ rtx tmp = XVECEXP (op, 0, i);
+ /* Not a broadcast unless every element equals the first. */
+ if (!rtx_equal_p (tmp, first))
+ return nullptr;
+ }
+ *scalar_mode_p = GET_MODE (first);
+ *insn_p = nullptr;
+ return first;
+ }
+ else
+ return nullptr;
+
+ mode = GET_MODE (op);
+
+ /* Only single def chain is supported. */
+ df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
+ if (!ref || DF_REF_NEXT_REG (ref) != nullptr)
+ return nullptr;
+
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ {
+ *scalar_mode_p = mode;
+ *insn_p = nullptr;
+ return op;
+ }
+
+ rtx_insn *insn = DF_REF_INSN (ref);
+ rtx set = single_set (insn);
+ if (!set)
+ return nullptr;
+
+ rtx dest = SET_DEST (set);
+
+ op = SET_SRC (set);
+ /* Set *INSN_P if the scalar source is neither a constant nor an
+ incoming argument. */
+ if (CONST_INT_P (op) || CONST_DOUBLE_P (op))
+ *insn_p = nullptr;
+ else if (REG_P (op)
+ && REG_EXPR (op)
+ && TREE_CODE (REG_EXPR (op)) == PARM_DECL)
+ *insn_p = nullptr;
+ else if (MEM_P (op)
+ && MEM_EXPR (op)
+ && TREE_CODE (get_base_address (MEM_EXPR (op))) == PARM_DECL)
+ *insn_p = nullptr;
+ else
+ {
+ while (SUBREG_P (dest))
+ dest = SUBREG_REG (dest);
+
+ /* Skip if the SET destination mode doesn't match. */
+ if (GET_MODE (dest) != mode)
+ return nullptr;
+
+ /* Set the inner scalar to the SET destination. */
+ op = dest;
+ *insn_p = insn;
+ }
+
+ *scalar_mode_p = mode;
+ if (CONSTANT_P (op))
+ *insn_p = nullptr;
+ else
+ *insn_p = insn;
+ return op;
+}
+
/* At entry of the nearest common dominator for basic blocks with vector
CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
@@ -3440,20 +3613,16 @@ remove_redundant_vector_load (void)
{
timevar_push (TV_MACH_DEP);
- auto_bitmap zero_bbs;
- auto_bitmap m1_bbs;
- auto_bitmap zero_insns;
- auto_bitmap m1_insns;
-
+ auto_vec<redundant_load *> loads;
+ redundant_load *load;
basic_block bb;
rtx_insn *insn;
- unsigned HOST_WIDE_INT zero_count = 0;
- unsigned HOST_WIDE_INT m1_count = 0;
- unsigned int zero_size = 0;
- unsigned int m1_size = 0;
+ unsigned int i;
df_set_flags (DF_DEFER_INSN_RESCAN);
+ bool recursive_call_p = cfun->machine->recursive_function;
+
FOR_EACH_BB_FN (bb, cfun)
{
FOR_BB_INSNS (bb, insn)
@@ -3481,79 +3650,139 @@ remove_redundant_vector_load (void)
if (!REG_P (dest) && !SUBREG_P (dest))
continue;
- if (src == CONST0_RTX (mode))
- {
- /* Record vector instruction with CONST0_RTX. */
- bitmap_set_bit (zero_insns, INSN_UID (insn));
+ rtx_insn *def_insn;
+ machine_mode scalar_mode;
+ redundant_load_kind kind;
+ rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
+ &kind, &def_insn);
+ if (!val)
+ continue;
- /* Record the maximum vector size. */
- if (zero_size < GET_MODE_SIZE (mode))
- zero_size = GET_MODE_SIZE (mode);
+ /* Remove redundant register loads only if at least 2 loads
+ of the same value are found. */
+ unsigned int threshold = 2;
+
+ /* Check if there is a matching redundant vector load. */
+ bool matched = false;
+ FOR_EACH_VEC_ELT (loads, i, load)
+ if (load->val
+ && load->kind == kind
+ && load->mode == scalar_mode
+ && (load->bb == bb
+ || kind < LOAD_VECTOR
+ /* A vector load other than all 0s/1s must be in
+ the same basic block if the function is recursive. */
+ || !recursive_call_p)
+ && rtx_equal_p (load->val, val))
+ {
+ /* Record vector instruction. */
+ bitmap_set_bit (load->insns, INSN_UID (insn));
- /* Record the basic block with CONST0_RTX. */
- bitmap_set_bit (zero_bbs, bb->index);
- zero_count++;
- }
- else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
- && src == CONSTM1_RTX (mode))
- {
- /* Record vector instruction with CONSTM1_RTX. */
- bitmap_set_bit (m1_insns, INSN_UID (insn));
+ /* Record the maximum vector size. */
+ if (load->size < GET_MODE_SIZE (mode))
+ load->size = GET_MODE_SIZE (mode);
- /* Record the maximum vector size. */
- if (m1_size < GET_MODE_SIZE (mode))
- m1_size = GET_MODE_SIZE (mode);
+ /* Record the basic block. */
+ bitmap_set_bit (load->bbs, bb->index);
+ load->count++;
+ matched = true;
+ break;
+ }
- /* Record the basic block with CONSTM1_RTX. */
- bitmap_set_bit (m1_bbs, bb->index);
- m1_count++;
- }
- }
- }
+ if (matched)
+ continue;
- if (zero_count > 1 || m1_count > 1)
- {
- machine_mode zero_mode, m1_mode;
- rtx vector_const0, vector_constm1;
+ /* We see this vector broadcast the first time. */
+ load = new redundant_load;
- if (zero_count > 1)
- {
- zero_mode = ix86_get_vector_load_mode (zero_size);
- vector_const0 = gen_reg_rtx (zero_mode);
- replace_vector_const (zero_mode, vector_const0, zero_insns);
- }
- else
- {
- zero_mode = VOIDmode;
- vector_const0 = nullptr;
- }
+ load->val = copy_rtx (val);
+ load->mode = scalar_mode;
+ load->size = GET_MODE_SIZE (mode);
+ load->def_insn = def_insn;
+ load->count = 1;
+ load->threshold = threshold;
+ load->bb = BLOCK_FOR_INSN (insn);
+ load->kind = kind;
- if (m1_count > 1)
- {
- m1_mode = ix86_get_vector_load_mode (m1_size);
- vector_constm1 = gen_reg_rtx (m1_mode);
- replace_vector_const (m1_mode, vector_constm1, m1_insns);
- }
- else
- {
- m1_mode = VOIDmode;
- vector_constm1 = nullptr;
+ bitmap_set_bit (load->insns, INSN_UID (insn));
+ bitmap_set_bit (load->bbs, bb->index);
+
+ loads.safe_push (load);
}
+ }
+
+ bool replaced = false;
+ rtx reg, broadcast_source, broadcast_reg;
+ FOR_EACH_VEC_ELT (loads, i, load)
+ if (load->count >= load->threshold)
+ {
+ machine_mode mode
+ = ix86_get_vector_load_mode (load->size, load->mode);
+ broadcast_reg = gen_reg_rtx (mode);
+ if (load->def_insn)
+ {
+ /* Replace redundant vector loads with a single vector load
+ in the same basic block. */
+ reg = load->val;
+ if (load->mode != GET_MODE (reg))
+ reg = gen_rtx_SUBREG (load->mode, reg, 0);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ replace_vector_const (mode, broadcast_reg, load->insns,
+ load->mode);
+ }
+ else
+ {
+ /* This is a constant integer/double vector. If the
+ inner scalar is 0 or -1, set vector to CONST0_RTX
+ or CONSTM1_RTX directly. */
+ rtx reg;
+ switch (load->kind)
+ {
+ case LOAD_CONST0_VECTOR:
+ broadcast_source = CONST0_RTX (mode);
+ break;
+ case LOAD_CONSTM1_VECTOR:
+ broadcast_source = CONSTM1_RTX (mode);
+ break;
+ default:
+ reg = gen_reg_rtx (load->mode);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ break;
+ }
+ replace_vector_const (mode, broadcast_reg, load->insns,
+ load->mode);
+ }
+ load->broadcast_source = broadcast_source;
+ load->broadcast_reg = broadcast_reg;
+ replaced = true;
+ }
+ if (replaced)
+ {
/* (Re-)discover loops so that bb->loop_father can be used in the
analysis below. */
calculate_dominance_info (CDI_DOMINATORS);
loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
- if (vector_const0)
- ix86_place_single_vector_set (vector_const0,
- CONST0_RTX (zero_mode),
- zero_bbs);
-
- if (vector_constm1)
- ix86_place_single_vector_set (vector_constm1,
- CONSTM1_RTX (m1_mode),
- m1_bbs);
+ FOR_EACH_VEC_ELT (loads, i, load)
+ if (load->count >= load->threshold)
+ {
+ if (load->def_insn)
+ {
+ /* Insert a broadcast after the original scalar
+ definition. */
+ rtx set = gen_rtx_SET (load->broadcast_reg,
+ load->broadcast_source);
+ insn = emit_insn_after (set, load->def_insn);
+ }
+ else
+ ix86_place_single_vector_set (load->broadcast_reg,
+ load->broadcast_source,
+ load->bbs,
+ (load->kind >= LOAD_VECTOR
+ ? load->val
+ : nullptr));
+ }
loop_optimizer_finalize ();
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d32d9ad997e..59881723a8d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2920,6 +2920,9 @@ struct GTY(()) machine_function {
/* True if inline asm with redzone clobber has been seen. */
BOOL_BITFIELD asm_redzone_clobber_seen : 1;
+ /* True if this is a recursive function. */
+ BOOL_BITFIELD recursive_function : 1;
+
/* The largest alignment, in bytes, of stack slot actually used. */
unsigned int max_used_stack_alignment;
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
index 93806e51508..e73ba35ddd1 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
index f9ccc82c7ca..33cd998bfdf 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
index c0fcd28fb07..75106e59b77 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
index 31463a8b2da..2787732229a 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
@@ -19,14 +19,14 @@
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-10.c b/gcc/testsuite/gcc.target/i386/pr92080-10.c
new file mode 100644
index 00000000000..b67f9d8d285
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-10.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sapphirerapids -Ofast" } */
+/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
+
+extern short write_picture_p_Vid_0;
+extern unsigned short *write_picture_p_2_0_0;
+extern int write_picture_p_0, write_picture_p_1, write_picture_i;
+void write_picture() {
+ unsigned short cr_val = 1 << write_picture_p_Vid_0;
+ for (; write_picture_p_1;)
+ for (; write_picture_i < write_picture_p_0; write_picture_i++)
+ write_picture_p_2_0_0[write_picture_i] = cr_val;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-11.c b/gcc/testsuite/gcc.target/i386/pr92080-11.c
new file mode 100644
index 00000000000..8747fc47640
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-11.c
@@ -0,0 +1,33 @@
+/* { dg-do run { target { avx512f_runtime } } } */
+/* { dg-options "-mavx512f -mtune=icelake-server -O3" } */
+
+struct s {
+ char s[sizeof(long double)];
+};
+
+union u {
+ long double d;
+ struct s s;
+};
+
+int main()
+{
+ union u x = {0};
+#if __SIZEOF_LONG_DOUBLE__ == 16
+ x.s = (struct s){"xxxxxxxxxxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 12
+ x.s = (struct s){"xxxxxxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 8
+ x.s = (struct s){"xxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 4
+ x.s = (struct s){"xxxx"};
+#endif
+
+ union u y = x;
+
+ for (unsigned char *p = (unsigned char *)&y + sizeof y;
+ p-- > (unsigned char *)&y;)
+ if (*p != (unsigned char)'x')
+ __builtin_abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-12.c b/gcc/testsuite/gcc.target/i386/pr92080-12.c
new file mode 100644
index 00000000000..cb09eb2f0a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-12.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mno-mmx -march=icelake-server" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+signed char a;
+signed char f (int i, int j)
+{
+ signed char c;
+ while (i != 0)
+ {
+ a ^= j;
+ ++c;
+ ++i;
+ }
+ return c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-13.c b/gcc/testsuite/gcc.target/i386/pr92080-13.c
new file mode 100644
index 00000000000..24b7616c894
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-13.c
@@ -0,0 +1,32 @@
+/* { dg-do run { target { avx512f_runtime } } } */
+/* { dg-options "-mavx512f -mtune=icelake-server -O2 -save-temps" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */
+
+#include <assert.h>
+
+#define CONTAINER_KIND union
+
+typedef CONTAINER_KIND container { int value; } container;
+
+void move(container* end, container* start) {
+ container* p;
+ for (p = end; p > start; p--) {
+ (p)->value = (p-1)->value;
+ }
+}
+
+#define N 100
+
+int main(int argc, char* argv[]) {
+ container vals[N];
+ int i;
+ for (i=0; i<N; i++) {
+ vals[i].value = argc + i;
+ }
+ move(&vals[N-1], &vals[0]);
+ assert(vals[0].value == argc + 0);
+ for (i=1; i<N; i++) {
+ assert(vals[i].value == argc + i - 1);
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-14.c b/gcc/testsuite/gcc.target/i386/pr92080-14.c
new file mode 100644
index 00000000000..6be41b63400
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-14.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+extern v16si sinksz;
+extern v8si sinksy;
+extern v4si sinksx;
+extern v4si sinksx1;
+
+extern void bar (void);
+
+void
+foo (char c, int i)
+{
+ sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ if (i == 1)
+ {
+ sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+ bar ();
+ }
+ else if (i == 2)
+ {
+ sinksx = __extension__(v4si){c,c,c,c};
+ bar ();
+ }
+ sinksx1 = __extension__(v4si){c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-15.c b/gcc/testsuite/gcc.target/i386/pr92080-15.c
new file mode 100644
index 00000000000..fa55d82e48e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-15.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+
+extern v4si *s1;
+extern v8si *s2;
+extern v16si *s3;
+
+int
+foo (int i, int j)
+{
+ if (j == 1)
+ s1[i] = __extension__(v4si){34, 34, 34, 34};
+ else if (i == 1)
+ s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+ if ((i + j) == 1234)
+ i = foo (j, i);
+ s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ return i - j;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-16.c b/gcc/testsuite/gcc.target/i386/pr92080-16.c
new file mode 100644
index 00000000000..c8ab084b714
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-16.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+
+extern v4si *s1;
+extern v8si *s2;
+extern v16si *s3;
+
+int
+foo (int i, int j)
+{
+ if (j == 1)
+ {
+ s1[i] = __extension__(v4si){34, 34, 34, 34};
+ s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+ s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ }
+ if ((i + j) == 1234)
+ i = foo (j, i);
+ return i - j;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-4.c b/gcc/testsuite/gcc.target/i386/pr92080-4.c
new file mode 100644
index 00000000000..ebe1384c691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-4.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+typedef short v32hi __attribute__((vector_size(64)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+extern v16si sinksz;
+extern v8si sinksy;
+extern v4si sinksx;
+extern v32hi sinkhz;
+extern v16hi sinkhy;
+extern v8hi sinkhx;
+extern v64qi sinkbz;
+extern v32qi sinkby;
+extern v16qi sinkbx;
+
+void foo(char c) {
+ sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+ sinksx = __extension__(v4si){c,c,c,c};
+}
+
+void foo1(char c) {
+ sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
+}
+
+void foo2(char c) {
+ sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-5.c b/gcc/testsuite/gcc.target/i386/pr92080-5.c
new file mode 100644
index 00000000000..380cd337e09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-5.c
@@ -0,0 +1,109 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastsd" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss" 1 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v16sf __attribute__((vector_size(64)));
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+typedef double v8df __attribute__((vector_size(64)));
+
+extern v16qi b1;
+extern v8hi h1;
+extern v4si s1;
+extern v2di l1;
+extern v4sf f1;
+extern v2df d1;
+extern v32qi b2;
+extern v16hi h2;
+extern v8si s2;
+extern v4di l2;
+extern v8sf f2;
+extern v4df d2;
+extern v64qi b3;
+extern v32hi h3;
+extern v16si s3;
+extern v8di l3;
+extern v16sf f3;
+extern v8df d3;
+
+void
+foo1 ()
+{
+ b1 = __extension__(v16qi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ b2 = __extension__(v32qi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ b3 = __extension__(v64qi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo2 ()
+{
+ h1 = __extension__(v8hi){34, 34, 34, 34, 34, 34, 34, 34};
+ h2 = __extension__(v16hi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+ h3 = __extension__(v32hi){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo3 ()
+{
+ s1 = __extension__(v4si){34, 34, 34, 34};
+ s2 = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+ s3 = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo4 ()
+{
+ l1 = __extension__(v2di){34, 34};
+ l2 = __extension__(v4di){34, 34, 34, 34};
+ l3 = __extension__(v8di){34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo5 ()
+{
+ f1 = __extension__(v4sf){34, 34, 34, 34};
+ f2 = __extension__(v8sf){34, 34, 34, 34, 34, 34, 34, 34};
+ f3 = __extension__(v16sf){34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo6 ()
+{
+ d1 = __extension__(v2df){34, 34};
+ d2 = __extension__(v4df){34, 34, 34, 34};
+ d3 = __extension__(v8df){34, 34, 34, 34, 34, 34, 34, 34};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-6.c b/gcc/testsuite/gcc.target/i386/pr92080-6.c
new file mode 100644
index 00000000000..e4cdbee55be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-6.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+#include <immintrin.h>
+
+extern __m512i sinkz;
+extern __m256i sinky;
+extern char f;
+
+void
+foo(char c, int x)
+{
+ c += f;
+ sinkz = _mm512_set1_epi8(c);
+ if (x == 2)
+ f += 3;
+ sinky = _mm256_set1_epi8(c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-7.c b/gcc/testsuite/gcc.target/i386/pr92080-7.c
new file mode 100644
index 00000000000..8691684e96b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-7.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+#include <immintrin.h>
+
+extern __m512i sinkz;
+extern __m256i sinky;
+extern char f;
+extern void bar (void);
+
+void
+foo(char c, int x)
+{
+ c += f;
+ sinkz = _mm512_set1_epi8(c);
+ if (x == 2)
+ bar ();
+ sinky = _mm256_set1_epi8(c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-8.c b/gcc/testsuite/gcc.target/i386/pr92080-8.c
new file mode 100644
index 00000000000..7ebb62cea75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long int v2di __attribute__((vector_size(16)));
+extern v4si s;
+extern v2di l;
+
+void
+foo(void)
+{
+ l = __extension__(v2di){2,2};
+ s = __extension__(v4si){2,2,2,2};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-9.c b/gcc/testsuite/gcc.target/i386/pr92080-9.c
new file mode 100644
index 00000000000..f44ab563f54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-9.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]+" 8 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]+" 3 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[\\t \]+" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long int v2di __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+
+extern v16qi b1;
+extern v8hi h1;
+extern v4si s1;
+extern v2di l1;
+extern v32qi b2;
+extern v16hi h2;
+extern v8si s2;
+extern v4di l2;
+extern v64qi b3;
+extern v32hi h3;
+extern v16si s3;
+extern v8di l3;
+
+void
+foo(void)
+{
+ b1 = __extension__(v16qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+ h1 = __extension__(v8hi){0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222};
+ s1 = __extension__(v4si){0x22222222,0x22222222,0x22222222,0x22222222};
+ l1 = __extension__(v2di){0x2222222222222222ULL,0x2222222222222222ULL};
+ b2 = __extension__(v32qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+ h2 = __extension__(v16hi){0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222};
+ s2 = __extension__(v8si){0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222};
+ l2 = __extension__(v4di){0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL};
+ b3 = __extension__(v64qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+ h3 = __extension__(v32hi){0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222,
+ 0x2222, 0x2222, 0x2222, 0x2222};
+ s3 = __extension__(v16si){0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222,
+ 0x22222222,0x22222222,0x22222222,0x22222222};
+ l3 = __extension__(v8di){0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL,
+ 0x2222222222222222ULL,0x2222222222222222ULL};
+}
--
2.49.0