1. Update move expanders to convert the CONST_WIDE_INT and CONST_VECTOR
operands to vector broadcast from an integer with AVX2.
2. Add ix86_gen_scratch_sse_rtx to return a scratch SSE register which
won't increase the stack alignment requirement and which blocks
transformation by the combine pass.
3. Update PR 87767 tests to expect integer broadcast instead of broadcast
from memory.
4. Update avx512f_cond_move.c to expect integer broadcast.

A small benchmark:

https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/memset/broadcast

shows that broadcast is a little bit faster on Intel Core i7-8559U:

$ make
gcc -g -I. -O2   -c -o test.o test.c
gcc -g   -c -o memory.o memory.S
gcc -g   -c -o broadcast.o broadcast.S
gcc -g   -c -o vec_dup_sse2.o vec_dup_sse2.S
gcc -o test test.o memory.o broadcast.o vec_dup_sse2.o
./test
memory      : 147215
broadcast   : 121213
vec_dup_sse2: 171366
$

broadcast is also smaller:

$ size memory.o broadcast.o
   text    data     bss     dec     hex filename
    132       0       0     132      84 memory.o
    122       0       0     122      7a broadcast.o
$

gcc/

        PR target/100865
        * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
        New prototype.
        (ix86_broadcast): New function.
        (ix86_convert_const_wide_int_to_broadcast): Likewise.
        (ix86_expand_move): Convert CONST_WIDE_INT to broadcast if mode
        size is 16 bytes or bigger.
        (ix86_broadcast_from_integer_constant): New function.
        (ix86_expand_vector_move): Convert CONST_WIDE_INT and CONST_VECTOR
        to broadcast if mode size is 16 bytes or bigger.
        * config/i386/i386-protos.h (ix86_gen_scratch_sse_rtx): New
        prototype.
        * config/i386/i386.c (ix86_gen_scratch_sse_rtx): New function.

gcc/testsuite/

        PR target/100865
        * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Expect integer
        broadcast.
        * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Likewise.
        * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Likewise.
        * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Likewise.
        * gcc.target/i386/avx512f_cond_move.c: Also pass
        -mprefer-vector-width=512 and expect integer broadcast.
        * gcc.target/i386/pr100865-1.c: New test.
        * gcc.target/i386/pr100865-2.c: Likewise.
        * gcc.target/i386/pr100865-3.c: Likewise.
        * gcc.target/i386/pr100865-4a.c: Likewise.
        * gcc.target/i386/pr100865-4b.c: Likewise.
        * gcc.target/i386/pr100865-5a.c: Likewise.
        * gcc.target/i386/pr100865-5b.c: Likewise.
        * gcc.target/i386/pr100865-6a.c: Likewise.
        * gcc.target/i386/pr100865-6b.c: Likewise.
        * gcc.target/i386/pr100865-7a.c: Likewise.
        * gcc.target/i386/pr100865-7b.c: Likewise.
        * gcc.target/i386/pr100865-8a.c: Likewise.
        * gcc.target/i386/pr100865-8b.c: Likewise.
        * gcc.target/i386/pr100865-9a.c: Likewise.
        * gcc.target/i386/pr100865-9b.c: Likewise.
        * gcc.target/i386/pr100865-10a.c: Likewise.
        * gcc.target/i386/pr100865-10b.c: Likewise.
---
 gcc/config/i386/i386-expand.c                 | 192 ++++++++++++++++--
 gcc/config/i386/i386-protos.h                 |   2 +
 gcc/config/i386/i386.c                        |  31 +++
 .../i386/avx512f-broadcast-pr87767-1.c        |   7 +-
 .../i386/avx512f-broadcast-pr87767-5.c        |   5 +-
 .../gcc.target/i386/avx512f_cond_move.c       |   4 +-
 .../i386/avx512vl-broadcast-pr87767-1.c       |  12 +-
 .../i386/avx512vl-broadcast-pr87767-5.c       |   9 +-
 gcc/testsuite/gcc.target/i386/pr100865-1.c    |  13 ++
 gcc/testsuite/gcc.target/i386/pr100865-10a.c  |  33 +++
 gcc/testsuite/gcc.target/i386/pr100865-10b.c  |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-2.c    |  14 ++
 gcc/testsuite/gcc.target/i386/pr100865-3.c    |  15 ++
 gcc/testsuite/gcc.target/i386/pr100865-4a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-4b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-5a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-5b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-6a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-6b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-7a.c   |  17 ++
 gcc/testsuite/gcc.target/i386/pr100865-7b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-8a.c   |  24 +++
 gcc/testsuite/gcc.target/i386/pr100865-8b.c   |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-9a.c   |  25 +++
 gcc/testsuite/gcc.target/i386/pr100865-9b.c   |   7 +
 25 files changed, 484 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-9a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-9b.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index c3ce21b4387..29d96805d9d 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -93,6 +93,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "i386-builtins.h"
 #include "i386-expand.h"
 
+static bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
+                                              rtx);
+
 /* Split one or more double-mode RTL references into pairs of half-mode
    references.  The RTL can be REG, offsettable MEM, integer constant, or
    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
@@ -190,6 +193,88 @@ ix86_expand_clear (rtx dest)
   emit_insn (tmp);
 }
 
+/* Return true if V can be broadcasted from an integer of WIDTH bits
+   which is returned in VAL_BROADCAST.  Otherwise, return false.  */
+
+static bool
+ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
+               HOST_WIDE_INT &val_broadcast)
+{
+  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
+  val_broadcast = wi::extract_uhwi (val, 0, width);
+  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
+    {
+      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
+      if (val_broadcast != each)
+       return false;
+    }
+  val_broadcast = sext_hwi (val_broadcast, width);
+  return true;
+}
+
+/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */
+
+static rtx
+ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
+{
+  /* Don't use integer vector broadcast if we can't move from GPR to SSE
+     register directly.  */
+  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+    return nullptr;
+
+  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
+     broadcast only if vector broadcast is available.  */
+  if (!TARGET_AVX2
+      || !CONST_WIDE_INT_P (op)
+      || standard_sse_constant_p (op, mode))
+    return nullptr;
+
+  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
+  HOST_WIDE_INT val_broadcast;
+  scalar_int_mode broadcast_mode;
+  if (ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
+                     val_broadcast))
+    broadcast_mode = QImode;
+  else if (ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
+                          val_broadcast))
+    broadcast_mode = HImode;
+  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
+                          val_broadcast))
+    broadcast_mode = SImode;
+  else if (TARGET_64BIT
+          && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
+                             val_broadcast))
+    {
+      /* NB: MOVQ takes a 32-bit signed immediate operand.  */
+      if (trunc_int_for_mode (val_broadcast, SImode) != val_broadcast)
+       return nullptr;
+      broadcast_mode = DImode;
+    }
+  else
+    return nullptr;
+
+  /* Check if OP can be broadcasted from VAL.  */
+  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
+    if (val != CONST_WIDE_INT_ELT (op, i))
+      return nullptr;
+
+  unsigned int nunits = (GET_MODE_SIZE (mode)
+                        / GET_MODE_SIZE (broadcast_mode));
+  machine_mode vector_mode;
+  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
+    gcc_unreachable ();
+  rtx target = ix86_gen_scratch_sse_rtx (vector_mode, true);
+  if (!ix86_expand_vector_init_duplicate (false, vector_mode, target,
+                                         GEN_INT (val_broadcast)))
+    gcc_unreachable ();
+  if (REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_rtx_REG (mode, REGNO (target));
+  else
+    target = convert_to_mode (mode, target, 1);
+
+  return target;
+}
+
 void
 ix86_expand_move (machine_mode mode, rtx operands[])
 {
@@ -347,20 +432,29 @@ ix86_expand_move (machine_mode mode, rtx operands[])
          && optimize)
        op1 = copy_to_mode_reg (mode, op1);
 
-      if (can_create_pseudo_p ()
-         && CONST_DOUBLE_P (op1))
+      if (can_create_pseudo_p ())
        {
-         /* If we are loading a floating point constant to a register,
-            force the value to memory now, since we'll get better code
-            out the back end.  */
+         if (CONST_DOUBLE_P (op1))
+           {
+             /* If we are loading a floating point constant to a
+                register, force the value to memory now, since we'll
+                get better code out the back end.  */
 
-         op1 = validize_mem (force_const_mem (mode, op1));
-         if (!register_operand (op0, mode))
+             op1 = validize_mem (force_const_mem (mode, op1));
+             if (!register_operand (op0, mode))
+               {
+                 rtx temp = gen_reg_rtx (mode);
+                 emit_insn (gen_rtx_SET (temp, op1));
+                 emit_move_insn (op0, temp);
+                 return;
+               }
+           }
+         else if (GET_MODE_SIZE (mode) >= 16)
            {
-             rtx temp = gen_reg_rtx (mode);
-             emit_insn (gen_rtx_SET (temp, op1));
-             emit_move_insn (op0, temp);
-             return;
+             rtx tmp = ix86_convert_const_wide_int_to_broadcast
+               (GET_MODE (op0), op1);
+             if (tmp != nullptr)
+               op1 = tmp;
            }
        }
     }
@@ -368,6 +462,54 @@ ix86_expand_move (machine_mode mode, rtx operands[])
   emit_insn (gen_rtx_SET (op0, op1));
 }
 
+static rtx
+ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
+{
+  int nunits = GET_MODE_NUNITS (mode);
+  if (nunits < 2)
+    return nullptr;
+
+  /* Don't use integer vector broadcast if we can't move from GPR to SSE
+     register directly.  */
+  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+    return nullptr;
+
+  /* Don't broadcast from a standard SSE constant integer.  */
+  if (standard_sse_constant_p (op, mode))
+    return nullptr;
+
+  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.  */
+  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT)
+    return nullptr;
+
+  rtx constant = get_pool_constant (XEXP (op, 0));
+  if (GET_CODE (constant) != CONST_VECTOR)
+    return nullptr;
+
+  /* There could be some rtx like
+     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+     but with "*.LC1" refer to V2DI constant vector.  */
+  if (GET_MODE (constant) != mode)
+    {
+      constant = simplify_subreg (mode, constant, GET_MODE (constant),
+                                 0);
+      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+       return nullptr;
+    }
+
+  rtx first = XVECEXP (constant, 0, 0);
+
+  for (int i = 1; i < nunits; ++i)
+    {
+      rtx tmp = XVECEXP (constant, 0, i);
+      /* Vector duplicate value.  */
+      if (!rtx_equal_p (tmp, first))
+       return nullptr;
+    }
+
+  return first;
+}
+
 void
 ix86_expand_vector_move (machine_mode mode, rtx operands[])
 {
@@ -407,7 +549,33 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
       else
-       op1 = validize_mem (force_const_mem (mode, op1));
+       {
+         machine_mode mode = GET_MODE (op0);
+         rtx tmp = ix86_convert_const_wide_int_to_broadcast
+           (mode, op1);
+         if (tmp == nullptr)
+           op1 = validize_mem (force_const_mem (mode, op1));
+         else
+           op1 = tmp;
+       }
+    }
+
+  rtx first;
+
+  if (can_create_pseudo_p ()
+      && GET_MODE_SIZE (mode) >= 16
+      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+      && (MEM_P (op1)
+         && SYMBOL_REF_P (XEXP (op1, 0))
+         && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))
+      && (first = ix86_broadcast_from_integer_constant (mode, op1)))
+    {
+      /* Broadcast to XMM/YMM/ZMM register from an integer constant.  */
+      op1 = ix86_gen_scratch_sse_rtx (mode, false);
+      if (!ix86_expand_vector_init_duplicate (false, mode, op1, first))
+       gcc_unreachable ();
+      emit_move_insn (op0, op1);
+      return;
     }
 
   /* We need to check memory alignment for SSE mode since attribute
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e6ac9390777..578750a2532 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -50,6 +50,8 @@ extern void ix86_reset_previous_fndecl (void);
 
 extern bool ix86_using_red_zone (void);
 
+extern rtx ix86_gen_scratch_sse_rtx (machine_mode, bool);
+
 extern unsigned int ix86_regmode_natural_size (machine_mode);
 #ifdef RTX_CODE
 extern int standard_80387_constant_p (rtx);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b0d19a61a76..d1c8ed41d1d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -23109,6 +23109,37 @@ ix86_optab_supported_p (int op, machine_mode mode1, 
machine_mode,
     }
 }
 
+/* Return a scratch register in MODE for vector load and store.  If
+   CONSTANT_INT_BROADCAST is true, it is used to hold constant integer
+   broadcast result.  */
+
+rtx
+ix86_gen_scratch_sse_rtx (machine_mode mode,
+                         bool constant_int_broadcast)
+{
+  rtx target;
+
+  /* NB: Choose a hard scratch SSE register:
+     1. Avoid increasing stack alignment requirement.
+     2. For integer constant broadcast in 64-bit mode, avoid
+       transformation by the combine pass.
+   */
+  if (GET_MODE_SIZE (mode) >= 16
+      && !COMPLEX_MODE_P (mode)
+      && (SCALAR_INT_MODE_P (mode)
+         || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      && ((constant_int_broadcast
+          && TARGET_64BIT
+          && GET_MODE_SIZE (mode) == 16)
+         || GET_MODE_ALIGNMENT (mode) > crtl->stack_alignment_estimated))
+    target = gen_rtx_REG (mode, (TARGET_64BIT
+                                ? LAST_REX_SSE_REG
+                                : LAST_SSE_REG));
+  else
+    target = gen_reg_rtx (mode);
+  return target;
+}
+
 /* Address space support.
 
    This is not "far pointers" in the 16-bit sense, but an easy way
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c 
b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
index 0563e696316..a2664d87f29 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
@@ -2,8 +2,11 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512f -mavx512dq" } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } }  */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 5 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 { target { ! 
ia32 } } } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 { target ia32 } 
} } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%zmm\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%zmm\[0-9\]+" 3 { target { ! ia32 } } } } */
 
 typedef int v16si  __attribute__ ((vector_size (64)));
 typedef long long v8di  __attribute__ ((vector_size (64)));
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c 
b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
index ffbe95980ca..477f9ca1282 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
@@ -2,8 +2,9 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512f" } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to8\\\}" 4 } }  */
-/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to16\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 { target ia32 } 
} } */
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%zmm\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%zmm\[0-9\]+" 4 { target { ! ia32 } } } } */
 
 typedef int v16si  __attribute__ ((vector_size (64)));
 typedef long long v8di  __attribute__ ((vector_size (64)));
diff --git a/gcc/testsuite/gcc.target/i386/avx512f_cond_move.c 
b/gcc/testsuite/gcc.target/i386/avx512f_cond_move.c
index 99a89f51202..ca49a585232 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f_cond_move.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f_cond_move.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f" } */
-/* { dg-final { scan-assembler-times "(?:vpblendmd|vmovdqa32)\[ 
\\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 8 } } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vmovdqa32)\[ 
\\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 8 } } */
 
 unsigned int x[128];
 int y[128];
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
index c06369d93fd..f8eb99f0b5f 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
@@ -2,9 +2,15 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 5 } }  */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 10 } }  */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 { target { ! 
ia32 } } } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target { ! 
ia32 } } } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 5 { target ia32 } 
} } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 7 { target ia32 } 
} } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%ymm\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%xmm\[0-9\]+" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%ymm\[0-9\]+" 3 { target { ! ia32 } } } } */
 
 typedef int v4si  __attribute__ ((vector_size (16)));
 typedef int v8si  __attribute__ ((vector_size (32)));
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
index 4998a9b8d51..32f6ac81841 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
@@ -2,9 +2,12 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512f -mavx512vl" } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 } }  */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 8 } }  */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 { target ia32 } 
} } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target ia32 } 
} } */
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%ymm\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%xmm\[0-9\]+" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%ymm\[0-9\]+" 4 { target { ! ia32 } } } } */
 
 typedef int v4si  __attribute__ ((vector_size (16)));
 typedef int v8si  __attribute__ ((vector_size (32)));
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c 
b/gcc/testsuite/gcc.target/i386/pr100865-1.c
new file mode 100644
index 00000000000..6c3097fb2a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=x86-64" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 16);
+}
+
+/* { dg-final { scan-assembler-times "movdqa\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[\\t \]%xmm\[0-9\]+, 
\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10a.c 
b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
new file mode 100644
index 00000000000..7ffc19e56a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
@@ -0,0 +1,33 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O3 -march=skylake" } */
+
+extern __int128 array[16];
+
+#define MK_CONST128_BROADCAST(A) \
+  ((((unsigned __int128) (unsigned char) A) << 120) \
+   | (((unsigned __int128) (unsigned char) A) << 112) \
+   | (((unsigned __int128) (unsigned char) A) << 104) \
+   | (((unsigned __int128) (unsigned char) A) << 96) \
+   | (((unsigned __int128) (unsigned char) A) << 88) \
+   | (((unsigned __int128) (unsigned char) A) << 80) \
+   | (((unsigned __int128) (unsigned char) A) << 72) \
+   | (((unsigned __int128) (unsigned char) A) << 64) \
+   | (((unsigned __int128) (unsigned char) A) << 56) \
+   | (((unsigned __int128) (unsigned char) A) << 48) \
+   | (((unsigned __int128) (unsigned char) A) << 40) \
+   | (((unsigned __int128) (unsigned char) A) << 32) \
+   | (((unsigned __int128) (unsigned char) A) << 24) \
+   | (((unsigned __int128) (unsigned char) A) << 16) \
+   | (((unsigned __int128) (unsigned char) A) << 8) \
+   | ((unsigned __int128) (unsigned char) A) )
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = MK_CONST128_BROADCAST (0x1f);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c 
b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
new file mode 100644
index 00000000000..edf52765c60
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include "pr100865-10a.c"
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c 
b/gcc/testsuite/gcc.target/i386/pr100865-2.c
new file mode 100644
index 00000000000..17efe2d72a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 16);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, 
\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c 
b/gcc/testsuite/gcc.target/i386/pr100865-3.c
new file mode 100644
index 00000000000..b6dbcf7809b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 16);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, 
\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, 
%xmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4a.c 
b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
new file mode 100644
index 00000000000..f55883598f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char array[64];
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = -45;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c 
b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
new file mode 100644
index 00000000000..f41e6147b4c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+#include "pr100865-4a.c"
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, 
%xmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-5a.c 
b/gcc/testsuite/gcc.target/i386/pr100865-5a.c
new file mode 100644
index 00000000000..4149797fe81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-5a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake" } */
+
+extern short array[64];
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = -45;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastw\[\\t \]+%xmm\[0-9\]+, 
%ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-5b.c 
b/gcc/testsuite/gcc.target/i386/pr100865-5b.c
new file mode 100644
index 00000000000..ded41b680d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-5b.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include "pr100865-5a.c"
+
+/* { dg-final { scan-assembler-times "vpbroadcastw\[\\t \]+%(?:r|e)\[^\n\]*, 
%ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu16\[\\t \]%ymm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-not "vpbroadcastw\[\\t \]+%xmm\[0-9\]+, 
%ymm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6a.c 
b/gcc/testsuite/gcc.target/i386/pr100865-6a.c
new file mode 100644
index 00000000000..3fde549a10d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-6a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake" } */
+
+extern int array[64];
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = -45;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, 
%ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c 
b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
new file mode 100644
index 00000000000..44e74c64e55
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include "pr100865-6a.c"
+
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
+/* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, 
%ymm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7a.c 
b/gcc/testsuite/gcc.target/i386/pr100865-7a.c
new file mode 100644
index 00000000000..f6f2be91120
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-7a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake" } */
+
+extern long long int array[64];
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = -45;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, 
%ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-not "vpbroadcastq" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovdqa" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c 
b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
new file mode 100644
index 00000000000..0a68820aa32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include "pr100865-7a.c"
+
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, 
%ymm\[0-9\]+" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c 
b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
new file mode 100644
index 00000000000..96e9f13204c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
@@ -0,0 +1,24 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O3 -march=skylake" } */
+
+extern __int128 array[16];
+
+#define MK_CONST128_BROADCAST(A) \
+  ((((unsigned __int128) (unsigned int) A) << 96) \
+   | (((unsigned __int128) (unsigned int) A) << 64) \
+   | (((unsigned __int128) (unsigned int) A) << 32) \
+   | ((unsigned __int128) (unsigned int) A) )
+
+#define MK_CONST128_BROADCAST_SIGNED(A) \
+  ((__int128) MK_CONST128_BROADCAST (A))
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
+}
+
+/* { dg-final { scan-assembler-times "(?:vpbroadcastq|vpshufd)\[\\t 
\]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8b.c 
b/gcc/testsuite/gcc.target/i386/pr100865-8b.c
new file mode 100644
index 00000000000..99a10ad83bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include "pr100865-8a.c"
+
+/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9a.c 
b/gcc/testsuite/gcc.target/i386/pr100865-9a.c
new file mode 100644
index 00000000000..45d0e0d0e2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-9a.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O3 -march=skylake" } */
+
+extern __int128 array[16];
+
+#define MK_CONST128_BROADCAST(A) \
+  ((((unsigned __int128) (unsigned short) A) << 112) \
+   | (((unsigned __int128) (unsigned short) A) << 96) \
+   | (((unsigned __int128) (unsigned short) A) << 80) \
+   | (((unsigned __int128) (unsigned short) A) << 64) \
+   | (((unsigned __int128) (unsigned short) A) << 48) \
+   | (((unsigned __int128) (unsigned short) A) << 32) \
+   | (((unsigned __int128) (unsigned short) A) << 16) \
+   | ((unsigned __int128) (unsigned short) A) )
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < sizeof (array) / sizeof (array[0]); i++)
+    array[i] = MK_CONST128_BROADCAST (0x1fff);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastw\[\\t \]+%xmm\[0-9\]+, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9b.c 
b/gcc/testsuite/gcc.target/i386/pr100865-9b.c
new file mode 100644
index 00000000000..14696248525
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100865-9b.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include "pr100865-9a.c"
+
+/* { dg-final { scan-assembler-times "vpbroadcastw\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
-- 
2.31.1

Reply via email to