On Wed, Sep 2, 2020 at 5:58 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> On Wed, Sep 02, 2020 at 09:57:08AM +0800, Hongtao Liu via Gcc-patches wrote:
> > +
> > +      first = XVECEXP (constant, 0, 0);
> > +      /* There could be some rtx like
> > +      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
> > +      but with "*.LC1" refer to V2DI constant vector.  */
> > +      if (GET_MODE (constant) != mode)
> > +     {
> > +       constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
> > +       if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
> > +         return;
> > +     }
>
> The
>       first = XVECEXP (constant, 0, 0);
> line needs to be after this if, not before it, otherwise it will miscompile
> things or just ICE.
>

Changed.

> > @@ -2197,6 +2272,10 @@ remove_partial_avx_dependency (void)
> >         if (!NONDEBUG_INSN_P (insn))
> >           continue;
> >
> > +       /* Hanlde AVX512 embedded broadcast here to save compile time.  */
>
> s/Hanlde/Handle/
>

Changed, sorry for the typo.

> > +  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> > +    {
> > +      if (!INSN_P (insn))
> > +     continue;
> > +      replace_constant_pool_with_broadcast (insn);
> > +    }
>
> Perhaps instead do:
>   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
>     if (INSN_P (insn))
>       replace_constant_pool_with_broadcast (insn);
> ?
>

Changed.

> > +  /* opt_pass methods: */
> > +  virtual bool gate (function *)
> > +    {
> > +      /* Return false if rpad pass gate is true.
> > +      replace_constant_pool_with_broadcast is called
> > +      from both this pass and rpad pass.  */
> > +      return (TARGET_AVX512F
> > +           && !(TARGET_AVX
> > +                && TARGET_SSE_PARTIAL_REG_DEPENDENCY
> > +                && TARGET_SSE_MATH
> > +                && optimize
> > +                && optimize_function_for_speed_p (cfun)));
>
> I think this could be a maintenance nightmare.
> Perhaps instead add
>

Yes, a common interface should be added as below; changed.

> static bool
> remove_partial_avx_dependency_gate ()
> {
>   return (TARGET_AVX
>           && TARGET_SSE_PARTIAL_REG_DEPENDENCY
>           && TARGET_SSE_MATH
>           && optimize
>           && optimize_function_for_speed_p (cfun));
> }
> after the remove_partial_avx_dependency function definition,
> change pass_remove_partial_avx_dependency gate body to
>       return remove_partial_avx_dependency_gate ();
> and in pass_constant_pool_broadcast::gate do
>       return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
> (with the comment you have there)?
>
> LGTM with those changes.
>
>         Jakub
>

Thanks for the review; updated patch attached.

-- 
BR,
Hongtao
From acf3825279190ca0540bb4704f66568fdbe06ce8 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao....@intel.com>
Date: Wed, 8 Jul 2020 17:14:36 +0800
Subject: [PATCH] Optimize memory broadcast for constant vector under AVX512.

For a constant vector having one duplicated value, there's no need to put
the whole vector in the constant pool; use embedded broadcast instead.

2020-07-09  Hongtao Liu  <hongtao....@intel.com>

gcc/ChangeLog:

	PR target/87767
	* config/i386/i386-features.c
	(replace_constant_pool_with_broadcast): New function.
	(constant_pool_broadcast): Ditto.
	(class pass_constant_pool_broadcast): New pass.
	(make_pass_constant_pool_broadcast): Ditto.
	(remove_partial_avx_dependency): Call
	replace_constant_pool_with_broadcast under TARGET_AVX512F; this
	saves compile time when both the rpad and cpb passes are
	available.
	(remove_partial_avx_dependency_gate): New function.
	(class pass_remove_partial_avx_dependency::gate): Call
	remove_partial_avx_dependency_gate.
	* config/i386/i386-passes.def: Insert new pass after combine.
	* config/i386/i386-protos.h
	(make_pass_constant_pool_broadcast): Declare.
	* config/i386/sse.md (*avx512dq_mul<mode>3<mask_name>_bcst):
	New define_insn.
	(*avx512f_mul<mode>3<mask_name>_bcst): Ditto.
	* config/i386/avx512fintrin.h (_mm512_set1_ps,
	_mm512_set1_pd, _mm512_set1_epi32, _mm512_set1_epi64): Adjusted.

gcc/testsuite/ChangeLog:

	PR target/87767
	* gcc.target/i386/avx2-broadcast-pr87767-1.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-1.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-2.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-3.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-4.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-5.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-6.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-7.c: New test.
	* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: New test.
	* gcc.target/i386/avx512vl-broadcast-pr87767-2.c: New test.
	* gcc.target/i386/avx512vl-broadcast-pr87767-3.c: New test.
	* gcc.target/i386/avx512vl-broadcast-pr87767-4.c: New test.
	* gcc.target/i386/avx512vl-broadcast-pr87767-5.c: New test.
	* gcc.target/i386/avx512vl-broadcast-pr87767-6.c: New test.
---
 gcc/config/i386/avx512fintrin.h               |  27 +--
 gcc/config/i386/i386-features.c               | 157 +++++++++++++++++-
 gcc/config/i386/i386-passes.def               |   1 +
 gcc/config/i386/i386-protos.h                 |   1 +
 gcc/config/i386/sse.md                        |  24 +++
 .../i386/avx2-broadcast-pr87767-1.c           |  40 +++++
 .../i386/avx512f-broadcast-pr87767-1.c        |  30 ++++
 .../i386/avx512f-broadcast-pr87767-2.c        |  42 +++++
 .../i386/avx512f-broadcast-pr87767-3.c        |  30 ++++
 .../i386/avx512f-broadcast-pr87767-4.c        |  42 +++++
 .../i386/avx512f-broadcast-pr87767-5.c        |  26 +++
 .../i386/avx512f-broadcast-pr87767-6.c        |  41 +++++
 .../i386/avx512f-broadcast-pr87767-7.c        | 121 ++++++++++++++
 .../i386/avx512vl-broadcast-pr87767-1.c       |  45 +++++
 .../i386/avx512vl-broadcast-pr87767-2.c       |  59 +++++++
 .../i386/avx512vl-broadcast-pr87767-3.c       |  37 +++++
 .../i386/avx512vl-broadcast-pr87767-4.c       |  56 +++++++
 .../i386/avx512vl-broadcast-pr87767-5.c       |  37 +++++
 .../i386/avx512vl-broadcast-pr87767-6.c       |  55 ++++++
 19 files changed, 848 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-6.c

diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index 0d53dda3a27..729d5686d68 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -239,22 +239,17 @@ extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_pd (double __A)
 {
-  return (__m512d) __builtin_ia32_broadcastsd512 (__extension__
-						  (__v2df) { __A, },
-						  (__v8df)
-						  _mm512_undefined_pd (),
-						  (__mmask8) -1);
+  return __extension__ (__m512d)(__v8df)
+    { __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_ps (float __A)
 {
-  return (__m512) __builtin_ia32_broadcastss512 (__extension__
-						 (__v4sf) { __A, },
-						 (__v16sf)
-						 _mm512_undefined_ps (),
-						 (__mmask16) -1);
+  return __extension__ (__m512)(__v16sf)
+    { __A, __A, __A, __A, __A, __A, __A, __A,
+      __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 /* Create the vector [A B C D A B C D A B C D A B C D].  */
@@ -4072,10 +4067,9 @@ extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_epi32 (int __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
-							   (__v16si)
-							   _mm512_undefined_epi32 (),
-							   (__mmask16)(-1));
+  return (__m512i)(__v16si)
+    { __A, __A, __A, __A, __A, __A, __A, __A,
+      __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 extern __inline __m512i
@@ -4128,10 +4122,7 @@ extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_epi64 (long long __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
-							   (__v8di)
-							   _mm512_undefined_epi32 (),
-							   (__mmask8)(-1));
+  return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 extern __inline __m512i
diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 535fc7e981d..620f7f157f4 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -2162,6 +2162,81 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
   return new pass_insert_endbr_and_patchable_area (ctxt);
 }
 
+/* Replace all one-value const vectors that are referenced by SYMBOL_REFs in
+   INSN with embedded broadcast, i.e. transform
+
+     vpaddq .LC0(%rip), %zmm0, %zmm0
+     ret
+  .LC0:
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+
+    to
+
+     vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
+     ret
+  .LC0:
+    .quad 3  */
+static void
+replace_constant_pool_with_broadcast (rtx_insn *insn)
+{
+  subrtx_ptr_iterator::array_type array;
+  FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
+    {
+      rtx *loc = *iter;
+      rtx x = *loc;
+      rtx broadcast_mem, vec_dup, constant, first;
+      machine_mode mode;
+
+      /* Constant pool.  */
+      if (!MEM_P (x)
+	  || !SYMBOL_REF_P (XEXP (x, 0))
+	  || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
+	continue;
+
+      /* Const vector.  */
+      mode = GET_MODE (x);
+      if (!VECTOR_MODE_P (mode))
+	return;
+      constant = get_pool_constant (XEXP (x, 0));
+      if (GET_CODE (constant) != CONST_VECTOR)
+	return;
+
+      /* There could be some rtx like
+	 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+	 but with "*.LC1" refer to V2DI constant vector.  */
+      if (GET_MODE (constant) != mode)
+	{
+	  constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+	  if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
+	    return;
+	}
+      first = XVECEXP (constant, 0, 0);
+
+      for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
+	{
+	  rtx tmp = XVECEXP (constant, 0, i);
+	  /* Vector duplicate value.  */
+	  if (!rtx_equal_p (tmp, first))
+	    return;
+	}
+
+      /* Replace with embedded broadcast.  */
+      broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
+      vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
+      validate_change (insn, loc, vec_dup, 0);
+
+      /* At most 1 memory_operand in an insn.  */
+      return;
+    }
+}
+
 /* At entry of the nearest common dominator for basic blocks with
    conversions, generate a single
 	vxorps %xmmN, %xmmN, %xmmN
@@ -2197,6 +2272,10 @@ remove_partial_avx_dependency (void)
 	  if (!NONDEBUG_INSN_P (insn))
 	    continue;
 
+	  /* Handle AVX512 embedded broadcast here to save compile time.  */
+	  if (TARGET_AVX512F)
+	    replace_constant_pool_with_broadcast (insn);
+
 	  set = single_set (insn);
 	  if (!set)
 	    continue;
@@ -2333,6 +2412,16 @@ remove_partial_avx_dependency (void)
   return 0;
 }
 
+static bool
+remove_partial_avx_dependency_gate ()
+{
+  return (TARGET_AVX
+	  && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+	  && TARGET_SSE_MATH
+	  && optimize
+	  && optimize_function_for_speed_p (cfun));
+}
+
 namespace {
 
 const pass_data pass_data_remove_partial_avx_dependency =
@@ -2358,11 +2447,7 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *)
     {
-      return (TARGET_AVX
-	      && TARGET_SSE_PARTIAL_REG_DEPENDENCY
-	      && TARGET_SSE_MATH
-	      && optimize
-	      && optimize_function_for_speed_p (cfun));
+      return remove_partial_avx_dependency_gate ();
     }
 
   virtual unsigned int execute (function *)
@@ -2379,6 +2464,68 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
+/* A const vector with one duplicated value need not be put whole in the
+   constant pool when the target supports embedded broadcast.  */
+static unsigned int
+constant_pool_broadcast (void)
+{
+  timevar_push (TV_MACH_DEP);
+  rtx_insn *insn;
+
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (INSN_P (insn))
+	replace_constant_pool_with_broadcast (insn);
+    }
+  timevar_pop (TV_MACH_DEP);
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_constant_pool_broadcast =
+{
+  RTL_PASS, /* type */
+  "cpb", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_constant_pool_broadcast : public rtl_opt_pass
+{
+public:
+  pass_constant_pool_broadcast (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      /* Return false if rpad pass gate is true.
+	 replace_constant_pool_with_broadcast is called
+	 from both this pass and rpad pass.  */
+      return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
+    }
+
+  virtual unsigned int execute (function *)
+    {
+      return constant_pool_broadcast ();
+    }
+}; // class pass_constant_pool_broadcast
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_constant_pool_broadcast (gcc::context *ctxt)
+{
+  return new pass_constant_pool_broadcast (ctxt);
+}
+
 /* This compares the priority of target features in function DECL1
    and DECL2.  It returns positive value if DECL1 is higher priority,
    negative value if DECL2 is higher priority and 0 if they are the
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index d83c7b956b1..07ecf8e790f 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -33,3 +33,4 @@ along with GCC; see the file COPYING3.  If not see
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
 
   INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+  INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index b6088f22d55..c5b700efd0e 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -386,3 +386,4 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
   (gcc::context *);
 extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
+extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8250325e1a3..a728b979f01 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12120,6 +12120,18 @@ (define_insn "avx512dq_mul<mode>3<mask_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*avx512dq_mul<mode>3<mask_name>_bcst"
+  [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v")
+	(mult:VI8_AVX512VL
+	  (vec_duplicate:VI8_AVX512VL
+	    (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+	  (match_operand:VI8_AVX512VL 2 "register_operand" "v")))]
+  "TARGET_AVX512DQ"
+  "vpmullq\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "mul<mode>3<mask_name>"
   [(set (match_operand:VI4_AVX512F 0 "register_operand")
 	(mult:VI4_AVX512F
@@ -12160,6 +12172,18 @@ (define_insn "*<sse4_1_avx2>_mul<mode>3<mask_name>"
    (set_attr "btver2_decode" "vector,vector,vector")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*avx512f_mul<mode>3<mask_name>_bcst"
+  [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
+	(mult:VI4_AVX512VL
+	  (vec_duplicate:VI4_AVX512VL
+	    (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+	  (match_operand:VI4_AVX512VL 2 "register_operand" "v")))]
+  "TARGET_AVX512F"
+   "vpmulld\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "mul<mode>3"
   [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand")
 	(mult:VI8_AVX2_AVX512F
diff --git a/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c
new file mode 100644
index 00000000000..aee1680ed98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c
@@ -0,0 +1,40 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "\\\{1to\[248\]\\\}" } }  */
+/* { dg-final { scan-assembler-not "\\\{1to16\\\}" } }  */
+
+typedef int v4si  __attribute__ ((vector_size (16)));
+typedef int v8si  __attribute__ ((vector_size (32)));
+typedef long long v2di  __attribute__ ((vector_size (16)));
+typedef long long v4di  __attribute__ ((vector_size (32)));
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+
+#define FOO(VTYPE, OP_NAME, OP)			\
+VTYPE						\
+ __attribute__ ((noipa))			\
+foo_##OP_NAME##_##VTYPE (VTYPE a)		\
+{						\
+  return a OP 101;				\
+}						\
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
new file mode 100644
index 00000000000..a8ee5f5faf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
@@ -0,0 +1,30 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512dq" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 5 } }  */
+
+typedef int v16si  __attribute__ ((vector_size (64)));
+typedef long long v8di  __attribute__ ((vector_size (64)));
+typedef float v16sf  __attribute__ ((vector_size (64)));
+typedef double v8df  __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP)			\
+VTYPE						\
+ __attribute__ ((noipa))			\
+foo_##OP_NAME##_##VTYPE (VTYPE a)		\
+{						\
+  return a OP CONSTANT;				\
+}						\
+
+FOO (v16si, add, +);
+FOO (v8di, add, +);
+FOO (v16sf, add, +);
+FOO (v8df, add, +);
+FOO (v16si, sub, -);
+FOO (v8di, sub, -);
+FOO (v16si, mul, *);
+FOO (v8di, mul, *);
+FOO (v16sf, mul, *);
+FOO (v8df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c
new file mode 100644
index 00000000000..30cf5809c3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c
@@ -0,0 +1,42 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512dq" } */
+/* { dg-require-effective-target avx512dq } */
+
+#define AVX512DQ
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-1.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP)		\
+  do							\
+    {							\
+      TYPE exp[N], src[N];				\
+      VTYPE res;					\
+      for (int i = 0; i < N; i++)			\
+	src[i] = i * i * 107;				\
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);	\
+      for (int i = 0; i < N; i ++)			\
+	exp[i] = src[i] OP CONSTANT;			\
+      for (int j = 0; j < N; j++)			\
+	{						\
+	  if (res[j] != exp[j])				\
+	    abort();					\
+	}						\
+    }							\
+  while (0)
+
+void
+test_512 (void)
+{
+  RTEST (v16si, int, 16, add, +);
+  RTEST (v8di, long long, 8, add, +);
+  RTEST (v16sf, float, 16, add, +);
+  RTEST (v8df, double, 8, add, +);
+  RTEST (v16si, int, 16, sub, -);
+  RTEST (v8di, long long, 8, sub, -);
+  RTEST (v16si, int, 16, mul, *);
+  RTEST (v8di, long long, 8, mul, *);
+  RTEST (v16sf, float, 16, mul, *);
+  RTEST (v8df, double, 8, mul, *);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-3.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-3.c
new file mode 100644
index 00000000000..c2f22c4ee5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-3.c
@@ -0,0 +1,30 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 4 } }  */
+
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef float v16sf  __attribute__ ((vector_size (64)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+typedef double v8df  __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2)		\
+VTYPE						\
+ __attribute__ ((noipa))			\
+ foo_##OP_NAME##_##VTYPE (VTYPE a, VTYPE b)	\
+{						\
+  return (OP1 a * b) OP2 CONSTANT;		\
+}						\
+
+FOO (v16sf, fma,, +);
+FOO (v8df, fma,, +);
+FOO (v16sf, fms,, -);
+FOO (v8df, fms,, -);
+FOO (v16sf, fnma, -, +);
+FOO (v8df, fnma, -, +);
+FOO (v16sf, fnms, -, -);
+FOO (v8df, fnms, -, -);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-4.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-4.c
new file mode 100644
index 00000000000..dabe91b11c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-4.c
@@ -0,0 +1,42 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-3.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)				\
+  do									\
+    {									\
+      TYPE exp[N], src1[N], src2[N];					\
+      VTYPE res;							\
+      for (int i = 0; i < N; i++)					\
+	{								\
+	  src1[i] = i * i * 107.2f;					\
+	  src2[i] = i * 2.f - 404.f;					\
+	}								\
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src1[0], *(VTYPE*)&src2[0]); \
+      for (int i = 0; i < N; i ++)					\
+	exp[i] = (OP1 src1[i] * src2[i]) OP2 CONSTANT;			\
+      for (int j = 0; j < N; j++)					\
+	{								\
+	  if (res[j] != exp[j])						\
+	    abort();							\
+	}								\
+    }									\
+  while (0)
+
+void
+test_512 (void)
+{
+  RTEST (v16sf, float, 16, fma,, +);
+  RTEST (v8df, double, 8, fma,, +);
+  RTEST (v16sf, float, 16, fms,, -);
+  RTEST (v8df, double, 8, fms,, -);
+  RTEST (v16sf, float, 16, fnma,-, +);
+  RTEST (v8df, double, 8, fnma,-, +);
+  RTEST (v16sf, float, 16, fnms,-, -);
+  RTEST (v8df, double, 8, fnms,-, -);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
new file mode 100644
index 00000000000..72e1098ccbe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
@@ -0,0 +1,26 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to8\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to16\\\}" 4 } }  */
+
+typedef int v16si  __attribute__ ((vector_size (64)));
+typedef long long v8di  __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2)		\
+VTYPE						\
+ __attribute__ ((noipa))			\
+ foo_##OP_NAME##_##VTYPE (VTYPE a)		\
+{						\
+  return (OP1 a) OP2 CONSTANT;			\
+}						\
+
+FOO (v16si, andnot, ~, &);
+FOO (v8di, andnot, ~, &);
+FOO (v16si, and,, &);
+FOO (v8di, and,, &);
+FOO (v16si, or,, |);
+FOO (v8di, or,, |);
+FOO (v16si, xor,, ^);
+FOO (v8di, xor,, ^);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-6.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-6.c
new file mode 100644
index 00000000000..f288f83158c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-6.c
@@ -0,0 +1,41 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f} */
+
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-5.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)				\
+  do									\
+    {									\
+      TYPE exp[N], src[N];						\
+      VTYPE res;							\
+      for (int i = 0; i < N; i++)					\
+	{								\
+	  src[i] = i * i * 107;						\
+	}								\
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);			\
+      for (int i = 0; i < N; i ++)					\
+	exp[i] = (OP1 src[i]) OP2 CONSTANT;				\
+      for (int j = 0; j < N; j++)					\
+	{								\
+	  if (res[j] != exp[j])						\
+	    abort();							\
+	}								\
+    }									\
+  while (0)
+
+void
+test_512 (void)
+{
+  RTEST (v16si, int, 16, andnot, ~, &);
+  RTEST (v8di, long long, 8, andnot, ~, &);
+  RTEST (v16si, int, 16, and,, &);
+  RTEST (v8di, long long, 8, and,, &);
+  RTEST (v16si, int, 16, or,, |);
+  RTEST (v8di, long long, 8, or,, |);
+  RTEST (v16si, int, 16, xor,, ^);
+  RTEST (v8di, long long, 8, xor,, ^);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-7.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-7.c
new file mode 100644
index 00000000000..a8f145d8589
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-7.c
@@ -0,0 +1,121 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to16\\\}" 1 } }  */
+
+#include<immintrin.h>
+
+#define CONSTANT 101
+
+#define FOO(VTYPE, OP_NAME, LEN, SUFFIX, MTYPE)			\
+  VTYPE									\
+  __attribute__ ((noipa))						\
+  _mm##LEN##_foo_##OP_NAME##_##SUFFIX (VTYPE dst, VTYPE src, MTYPE m)	\
+  {									\
+    return  _mm##LEN##_mask_##OP_NAME##_##SUFFIX (dst, m, src,		\
+						  _mm##LEN##_set1_##SUFFIX (CONSTANT)); \
+  }									\
+
+#define FOOZ(VTYPE, OP_NAME, LEN, SUFFIX, MTYPE)			\
+  VTYPE									\
+  __attribute__ ((noipa))						\
+  _mm##LEN##_fooz_##OP_NAME##_##SUFFIX (VTYPE dst, VTYPE src, MTYPE m)	\
+  {									\
+    return  _mm##LEN##_maskz_##OP_NAME##_##SUFFIX (m, dst, src,		\
+						  _mm##LEN##_set1_##SUFFIX (CONSTANT)); \
+  }									\
+
+FOO (__m512, add, 512, ps, __mmask16);
+FOO (__m256, add, 256, ps, __mmask8);
+FOO (__m128, add,, ps, __mmask8);
+
+FOO (__m512, sub, 512, ps, __mmask16);
+FOO (__m256, sub, 256, ps, __mmask8);
+FOO (__m128, sub,, ps, __mmask8);
+
+FOO (__m512, mul, 512, ps, __mmask16);
+FOO (__m256, mul, 256, ps, __mmask8);
+FOO (__m128, mul,, ps, __mmask8);
+
+FOO (__m512, div, 512, ps, __mmask16);
+FOO (__m256, div, 256, ps, __mmask8);
+FOO (__m128, div,, ps, __mmask8);
+
+FOOZ (__m512, fmadd, 512, ps, __mmask16);
+FOOZ (__m256, fmadd, 256, ps, __mmask8);
+FOOZ (__m128, fmadd,, ps, __mmask8);
+
+FOOZ (__m512, fmsub, 512, ps, __mmask16);
+FOOZ (__m256, fmsub, 256, ps, __mmask8);
+FOOZ (__m128, fmsub,, ps, __mmask8);
+
+FOOZ (__m512, fnmadd, 512, ps, __mmask16);
+FOOZ (__m256, fnmadd, 256, ps, __mmask8);
+FOOZ (__m128, fnmadd,, ps, __mmask8);
+
+FOOZ (__m512, fnmsub, 512, ps, __mmask16);
+FOOZ (__m256, fnmsub, 256, ps, __mmask8);
+FOOZ (__m128, fnmsub,, ps, __mmask8);
+
+FOO (__m512d, add, 512, pd, __mmask8);
+FOO (__m256d, add, 256, pd, __mmask8);
+FOO (__m128d, add,, pd, __mmask8);
+
+FOO (__m512d, sub, 512, pd, __mmask8);
+FOO (__m256d, sub, 256, pd, __mmask8);
+FOO (__m128d, sub,, pd, __mmask8);
+
+FOO (__m512d, mul, 512, pd, __mmask8);
+FOO (__m256d, mul, 256, pd, __mmask8);
+FOO (__m128d, mul,, pd, __mmask8);
+
+FOO (__m512d, div, 512, pd, __mmask8);
+FOO (__m256d, div, 256, pd, __mmask8);
+FOO (__m128d, div,, pd, __mmask8);
+
+FOOZ (__m512d, fmadd, 512, pd, __mmask8);
+FOOZ (__m256d, fmadd, 256, pd, __mmask8);
+FOOZ (__m128d, fmadd,, pd, __mmask8);
+
+FOOZ (__m512d, fmsub, 512, pd, __mmask8);
+FOOZ (__m256d, fmsub, 256, pd, __mmask8);
+FOOZ (__m128d, fmsub,, pd, __mmask8);
+
+FOOZ (__m512d, fnmadd, 512, pd, __mmask8);
+FOOZ (__m256d, fnmadd, 256, pd, __mmask8);
+FOOZ (__m128d, fnmadd,, pd, __mmask8);
+
+FOOZ (__m512d, fnmsub, 512, pd, __mmask8);
+FOOZ (__m256d, fnmsub, 256, pd, __mmask8);
+FOOZ (__m128d, fnmsub,, pd, __mmask8);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
new file mode 100644
index 00000000000..397e287134c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
@@ -0,0 +1,45 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 5 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 10 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } }  */
+
+typedef int v4si  __attribute__ ((vector_size (16)));
+typedef int v8si  __attribute__ ((vector_size (32)));
+typedef long long v2di  __attribute__ ((vector_size (16)));
+typedef long long v4di  __attribute__ ((vector_size (32)));
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101 /* Right-hand operand of each tested operation; no ';' so it expands as an expression.  */
+#define FOO(VTYPE, OP_NAME, OP) /* Defines foo_<OP_NAME>_<VTYPE>.  */ \
+VTYPE						\
+ __attribute__ ((noipa))			\
+foo_##OP_NAME##_##VTYPE (VTYPE a)		\
+{ /* Applies OP between the argument and CONSTANT.  */ \
+  return a OP CONSTANT;				\
+}
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4si, sub, -);
+FOO (v8si, sub, -);
+FOO (v2di, sub, -);
+FOO (v4di, sub, -);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-2.c
new file mode 100644
index 00000000000..9b796ac7124
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-2.c
@@ -0,0 +1,59 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512dq -mavx512vl" } */
+/* { dg-require-effective-target avx512dq } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512DQ
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-1.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP)		\
+  do /* Compare foo_<OP_NAME>_<VTYPE> with a scalar loop; abort on mismatch.  */ \
+    { /* Arrays are VTYPE-aligned: src is loaded through a VTYPE pointer.  */ \
+      __attribute__ ((aligned (sizeof (VTYPE)))) TYPE exp[N], src[N];	\
+      VTYPE res;					\
+      for (int i = 0; i < N; i++)			\
+	src[i] = i * i * 107;				\
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);	\
+      for (int i = 0; i < N; i ++)			\
+	exp[i] = src[i] OP CONSTANT;			\
+      for (int j = 0; j < N; j++)			\
+	{						\
+	  if (res[j] != exp[j])				\
+	    abort();					\
+	}						\
+    }							\
+  while (0)
+
+void
+test_256 (void) /* NOTE(review): presumably called from avx512f-helper.h; confirm.  */
+{ /* 32-byte vector cases; each RTEST aborts on a mismatch with the scalar result.  */
+  RTEST (v8si, int, 8, add, +);
+  RTEST (v4di, long long, 4, add, +);
+  RTEST (v8sf, float, 8, add, +);
+  RTEST (v4df, double, 4, add, +);
+  RTEST (v8si, int, 8, sub, -);
+  RTEST (v4di, long long, 4, sub, -);
+  RTEST (v8si, int, 8, mul, *);
+  RTEST (v4di, long long, 4, mul, *);
+  RTEST (v8sf, float, 8, mul, *);
+  RTEST (v4df, double, 4, mul, *);
+}
+
+void
+test_128 (void) /* NOTE(review): presumably called from avx512f-helper.h; confirm.  */
+{ /* 16-byte vector cases; each RTEST aborts on a mismatch with the scalar result.  */
+  RTEST (v4si, int, 4, add, +);
+  RTEST (v2di, long long, 2, add, +);
+  RTEST (v4sf, float, 4, add, +);
+  RTEST (v2df, double, 2, add, +);
+  RTEST (v4si, int, 4, sub, -);
+  RTEST (v2di, long long, 2, sub, -);
+  RTEST (v4si, int, 4, mul, *);
+  RTEST (v2di, long long, 2, mul, *);
+  RTEST (v4sf, float, 4, mul, *);
+  RTEST (v2df, double, 2, mul, *);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-3.c
new file mode 100644
index 00000000000..aedfb16016e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-3.c
@@ -0,0 +1,37 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 8 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } }  */
+
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101 /* Right-hand operand of each tested operation; no ';' so it expands as an expression.  */
+#define FOO(VTYPE, OP_NAME, OP1, OP2) /* Defines foo_<OP_NAME>_<VTYPE>.  */ \
+VTYPE						\
+ __attribute__ ((noipa))			\
+ foo_##OP_NAME##_##VTYPE (VTYPE a, VTYPE b)	\
+{ /* OP1 is an optional unary sign on a; OP2 combines the product with CONSTANT.  */ \
+  return (OP1 a * b) OP2 CONSTANT;		\
+}
+
+FOO (v4sf, fma,, +);
+FOO (v8sf, fma,, +);
+FOO (v2df, fma,, +);
+FOO (v4df, fma,, +);
+FOO (v4sf, fms,, -);
+FOO (v8sf, fms,, -);
+FOO (v2df, fms,, -);
+FOO (v4df, fms,, -);
+FOO (v4sf, fnma, -, +);
+FOO (v8sf, fnma, -, +);
+FOO (v2df, fnma, -, +);
+FOO (v4df, fnma, -, +);
+FOO (v4sf, fnms, -, -);
+FOO (v8sf, fnms, -, -);
+FOO (v2df, fnms, -, -);
+FOO (v4df, fnms, -, -);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-4.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-4.c
new file mode 100644
index 00000000000..40b8eb9929d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-4.c
@@ -0,0 +1,56 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-3.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)				\
+  do /* Compare foo_<OP_NAME>_<VTYPE> with a scalar loop; abort on mismatch.  */ \
+    { /* Arrays are VTYPE-aligned: src1/src2 are loaded through VTYPE pointers.  */ \
+      __attribute__ ((aligned (sizeof (VTYPE)))) TYPE exp[N], src1[N], src2[N]; \
+      VTYPE res;							\
+      for (int i = 0; i < N; i++)					\
+	{								\
+	  src1[i] = i * i * 107.2f;					\
+	  src2[i] = i * 2.f - 404.f;					\
+	}								\
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src1[0], *(VTYPE*)&src2[0]); \
+      for (int i = 0; i < N; i ++)					\
+	exp[i] = (OP1 src1[i] * src2[i]) OP2 CONSTANT;			\
+      for (int j = 0; j < N; j++)					\
+	{								\
+	  if (res[j] != exp[j])						\
+	    abort();							\
+	}								\
+    }									\
+  while (0)
+
+void
+test_256 (void) /* NOTE(review): presumably called from avx512f-helper.h; confirm.  */
+{ /* 32-byte vector cases; each RTEST aborts on a mismatch with the scalar result.  */
+  RTEST (v8sf, float, 8, fma,, +);
+  RTEST (v4df, double, 4, fma,, +);
+  RTEST (v8sf, float, 8, fms,, -);
+  RTEST (v4df, double, 4, fms,, -);
+  RTEST (v8sf, float, 8, fnma,-, +);
+  RTEST (v4df, double, 4, fnma,-, +);
+  RTEST (v8sf, float, 8, fnms,-, -);
+  RTEST (v4df, double, 4, fnms,-, -);
+}
+
+void
+test_128 (void) /* NOTE(review): presumably called from avx512f-helper.h; confirm.  */
+{ /* 16-byte vector cases; each RTEST aborts on a mismatch with the scalar result.  */
+  RTEST (v4sf, float, 4, fma,, +);
+  RTEST (v2df, double, 2, fma,, +);
+  RTEST (v4sf, float, 4, fms,, -);
+  RTEST (v2df, double, 2, fms,, -);
+  RTEST (v4sf, float, 4, fnma,-, +);
+  RTEST (v2df, double, 2, fnma,-, +);
+  RTEST (v4sf, float, 4, fnms,-, -);
+  RTEST (v2df, double, 2, fnms,-, -);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
new file mode 100644
index 00000000000..1e9460faa9d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
@@ -0,0 +1,37 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 8 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } }  */
+
+typedef int v4si  __attribute__ ((vector_size (16)));
+typedef int v8si  __attribute__ ((vector_size (32)));
+typedef long long v2di  __attribute__ ((vector_size (16)));
+typedef long long v4di  __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101 /* Right-hand operand of each tested operation; no ';' so it expands as an expression.  */
+#define FOO(VTYPE, OP_NAME, OP1, OP2) /* Defines foo_<OP_NAME>_<VTYPE>.  */ \
+VTYPE						\
+ __attribute__ ((noipa))			\
+ foo_##OP_NAME##_##VTYPE (VTYPE a)		\
+{ /* OP1 is an optional unary operator on a; OP2 combines the result with CONSTANT.  */ \
+  return (OP1 a) OP2 CONSTANT;			\
+}
+
+FOO (v4si, andnot, ~, &);
+FOO (v8si, andnot, ~, &);
+FOO (v2di, andnot, ~, &);
+FOO (v4di, andnot, ~, &);
+FOO (v4si, and,, &);
+FOO (v8si, and,, &);
+FOO (v2di, and,, &);
+FOO (v4di, and,, &);
+FOO (v4si, or,, |);
+FOO (v8si, or,, |);
+FOO (v2di, or,, |);
+FOO (v4di, or,, |);
+FOO (v4si, xor,, ^);
+FOO (v8si, xor,, ^);
+FOO (v2di, xor,, ^);
+FOO (v4di, xor,, ^);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-6.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-6.c
new file mode 100644
index 00000000000..493a76f0917
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-6.c
@@ -0,0 +1,55 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-5.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)				\
+  do /* Compare foo_<OP_NAME>_<VTYPE> with a scalar loop; abort on mismatch.  */ \
+    { /* Arrays are VTYPE-aligned: src is loaded through a VTYPE pointer.  */ \
+      __attribute__ ((aligned (sizeof (VTYPE)))) TYPE exp[N], src[N];	\
+      VTYPE res;							\
+      for (int i = 0; i < N; i++)					\
+	{								\
+	  src[i] = i * i * 107;						\
+	}								\
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);			\
+      for (int i = 0; i < N; i ++)					\
+	exp[i] = (OP1 src[i]) OP2 CONSTANT;				\
+      for (int j = 0; j < N; j++)					\
+	{								\
+	  if (res[j] != exp[j])						\
+	    abort();							\
+	}								\
+    }									\
+  while (0)
+
+void
+test_256 (void) /* NOTE(review): presumably called from avx512f-helper.h; confirm.  */
+{ /* 32-byte vector cases; each RTEST aborts on a mismatch with the scalar result.  */
+  RTEST (v8si, int, 8, andnot, ~, &);
+  RTEST (v4di, long long, 4, andnot, ~, &);
+  RTEST (v8si, int, 8, and,, &);
+  RTEST (v4di, long long, 4, and,, &);
+  RTEST (v8si, int, 8, or,, |);
+  RTEST (v4di, long long, 4, or,, |);
+  RTEST (v8si, int, 8, xor,, ^);
+  RTEST (v4di, long long, 4, xor,, ^);
+}
+
+void
+test_128 (void) /* NOTE(review): presumably called from avx512f-helper.h; confirm.  */
+{ /* 16-byte vector cases; each RTEST aborts on a mismatch with the scalar result.  */
+  RTEST (v4si, int, 4, andnot, ~, &);
+  RTEST (v2di, long long, 2, andnot, ~, &);
+  RTEST (v4si, int, 4, and,, &);
+  RTEST (v2di, long long, 2, and,, &);
+  RTEST (v4si, int, 4, or,, |);
+  RTEST (v2di, long long, 2, or,, |);
+  RTEST (v4si, int, 4, xor,, ^);
+  RTEST (v2di, long long, 2, xor,, ^);
+}
-- 
2.18.1

Reply via email to