Hi!

As mentioned in the PR, the addition of the vec_addsubv2sf3 expander caused
the testcase to be vectorized and no longer use fma.
The following patch adds new expanders so that it can be vectorized
again with the alternating add/sub fma instructions.

There is a bug on the SLP cost computation side which causes it
not to count some scalar multiplication costs, but I think the patch
is desirable anyway before that is fixed, and the testcase for now just
uses -fvect-cost-model=unlimited.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-12-13  Jakub Jelinek  <ja...@redhat.com>

        PR target/116979
        * config/i386/mmx.md (vec_fmaddsubv2sf4, vec_fmsubaddv2sf4): New
        define_expand patterns.

        * gcc.target/i386/pr116979.c: New test.

--- gcc/config/i386/mmx.md.jj   2024-12-12 19:46:50.651306295 +0100
+++ gcc/config/i386/mmx.md      2024-12-12 20:15:39.502007436 +0100
@@ -1132,6 +1132,54 @@ (define_expand "vec_addsubv2sf3"
   DONE;
 })
 
+(define_expand "vec_fmaddsubv2sf4"
+  [(match_operand:V2SF 0 "register_operand")
+   (match_operand:V2SF 1 "nonimmediate_operand")
+   (match_operand:V2SF 2 "nonimmediate_operand")
+   (match_operand:V2SF 3 "nonimmediate_operand")]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmaddsubv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
+(define_expand "vec_fmsubaddv2sf4"
+  [(match_operand:V2SF 0 "register_operand")
+   (match_operand:V2SF 1 "nonimmediate_operand")
+   (match_operand:V2SF 2 "nonimmediate_operand")
+   (match_operand:V2SF 3 "nonimmediate_operand")]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmsubaddv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel single-precision floating point comparisons
--- gcc/testsuite/gcc.target/i386/pr116979.c.jj 2024-12-12 20:19:18.179934902 +0100
+++ gcc/testsuite/gcc.target/i386/pr116979.c    2024-12-12 20:21:31.685059095 +0100
@@ -0,0 +1,24 @@
+/* PR target/116979 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfma -fvect-cost-model=unlimited" } */
+/* { dg-final { scan-assembler "vfmaddsub(?:132|213|231)pd" } } */
+/* { dg-final { scan-assembler "vfmaddsub(?:132|213|231)ps" { target lp64 } } } */
+
+struct S { __complex__ float f; };
+struct T { __complex__ double f; };
+
+struct S
+foo (const struct S *a, const struct S *b)
+{
+  struct S r;
+  r.f = a->f * b->f;
+  return r;
+}
+
+struct T
+bar (const struct T *a, const struct T *b)
+{
+  struct T r;
+  r.f = a->f * b->f;
+  return r;
+}

        Jakub

Reply via email to