The only issue with FMA instructions is that there are _a lot_ of them
(30 opcodes, each of which comes in up to 4 versions depending on VEX.W
and VEX.L).

We can reduce the number of helpers to one third by passing four operands
(one output and three inputs); the reordering of which operands go to
the multiply and which go to the add is done in emit.c.

Scalar versions do not do any merging; they only affect the bottom 32
or 64 bits of the output operand.  Therefore, there is no separate XMM
and YMM of the scalar helpers.

Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
 target/i386/cpu.c                |  5 ++-
 target/i386/ops_sse.h            | 63 ++++++++++++++++++++++++++++++++
 target/i386/ops_sse_header.h     | 28 ++++++++++++++
 target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
 target/i386/tcg/decode-new.h     |  1 +
 target/i386/tcg/emit.c.inc       | 43 ++++++++++++++++++++++
 tests/tcg/i386/test-avx.py       |  2 +-
 7 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 6292b7e12f..22b681ca37 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
           CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
           CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */   \
           CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
-          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
+          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
+          CPUID_EXT_FMA)
           /* missing:
           CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
-          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
+          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
           CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
           CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
 
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 33c61896ee..041a048a70 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
 }
 #endif
 
+/* FMA3 op helpers */
+#if SHIFT == 1
+#define SSE_HELPER_FMAS(name, elem, F)                                         
\
+    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)                
\
+    {                                                                          
\
+        d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0));                    
\
+    }
+#define SSE_HELPER_FMAP(name, elem, num, F)                                    
\
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)  
\
+    {                                                                          
\
+        int i;                                                                 
\
+        for (i = 0; i < num; i++) {                                            
\
+            d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i));                
\
+        }                                                                      
\
+    }
+
+#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
+#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
+
+#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, 
&env->sse_status)
+#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, 
&env->sse_status)
+
+#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, 
&env->sse_status)
+#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, 
&env->sse_status)
+
+#define FMNSUB32(a, b, c) float32_muladd(a, b, c, 
float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+#define FMNSUB64(a, b, c) float64_muladd(a, b, c, 
float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+
+#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : 
float_muladd_negate_c, &env->sse_status)
+#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : 
float_muladd_negate_c, &env->sse_status)
+
+#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 
float_muladd_negate_c : 0, &env->sse_status)
+#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 
float_muladd_negate_c : 0, &env->sse_status)
+
+SSE_HELPER_FMAS(helper_fmaddss,  ZMM_S,             FMADD32)
+SSE_HELPER_FMAS(helper_fmaddsd,  ZMM_D,             FMADD64)
+SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S,             FMNADD32)
+SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D,             FMNADD64)
+SSE_HELPER_FMAS(helper_fmsubss,  ZMM_S,             FMSUB32)
+SSE_HELPER_FMAS(helper_fmsubsd,  ZMM_D,             FMSUB64)
+SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S,             FMNSUB32)
+SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D,             FMNSUB64)
+#endif
+
+#if SHIFT >= 1
+SSE_HELPER_FMAP(helper_fmaddps,  ZMM_S, 2 << SHIFT, FMADD32)
+SSE_HELPER_FMAP(helper_fmaddpd,  ZMM_D, 1 << SHIFT, FMADD64)
+
+SSE_HELPER_FMAP(helper_fmnaddps, ZMM_S, 2 << SHIFT, FMNADD32)
+SSE_HELPER_FMAP(helper_fmnaddpd, ZMM_D, 1 << SHIFT, FMNADD64)
+
+SSE_HELPER_FMAP(helper_fmsubps,  ZMM_S, 2 << SHIFT, FMSUB32)
+SSE_HELPER_FMAP(helper_fmsubpd,  ZMM_D, 1 << SHIFT, FMSUB64)
+
+SSE_HELPER_FMAP(helper_fmnsubps, ZMM_S, 2 << SHIFT, FMNSUB32)
+SSE_HELPER_FMAP(helper_fmnsubpd, ZMM_D, 1 << SHIFT, FMNSUB64)
+
+SSE_HELPER_FMAP(helper_fmaddsubps,  ZMM_S, 2 << SHIFT, FMADDSUB32)
+SSE_HELPER_FMAP(helper_fmaddsubpd,  ZMM_D, 1 << SHIFT, FMADDSUB64)
+SSE_HELPER_FMAP(helper_fmsubaddps,  ZMM_S, 2 << SHIFT, FMSUBADD32)
+SSE_HELPER_FMAP(helper_fmsubaddpd,  ZMM_D, 1 << SHIFT, FMSUBADD64)
+#endif
+
 #undef SSE_HELPER_S
 
 #undef LANE_WIDTH
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index c4c41976c0..1f9a5c9e94 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -359,6 +359,34 @@ DEF_HELPER_3(glue(cvtph2ps, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_4(glue(cvtps2ph, SUFFIX), void, env, Reg, Reg, int)
 #endif
 
+/* FMA3 helpers */
+#if SHIFT == 1
+DEF_HELPER_5(fmaddss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmaddsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnaddss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnaddsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmsubss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmsubsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnsubss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnsubsd, void, env, Reg, Reg, Reg, Reg)
+#endif
+
+#if SHIFT >= 1
+DEF_HELPER_5(glue(fmaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnaddps,SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnaddpd,SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+
+DEF_HELPER_5(glue(fmaddsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmaddsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+#endif
+
 /* AVX helpers */
 #if SHIFT >= 1
 DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg)
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 8baee9018a..8a6b0ae37c 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -376,6 +376,15 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
     [0x92] = X86_OP_ENTRY3(VPGATHERD, V,x,  H,x,  M,d,  vex12 cpuid(AVX2) 
p_66), /* vgatherdps/d */
     [0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x,  H,x,  M,q,  vex12 cpuid(AVX2) 
p_66), /* vgatherqps/d */
 
+    [0x96] = X86_OP_ENTRY3(VFMADDSUB132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x97] = X86_OP_ENTRY3(VFMSUBADD132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+
+    [0xa6] = X86_OP_ENTRY3(VFMADDSUB213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xa7] = X86_OP_ENTRY3(VFMSUBADD213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+
+    [0xb6] = X86_OP_ENTRY3(VFMADDSUB231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xb7] = X86_OP_ENTRY3(VFMSUBADD231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+
     [0x08] = X86_OP_ENTRY3(PSIGNB,    V,x,        H,x,  W,x,  vex4 
cpuid(SSSE3) mmx avx2_256 p_00_66),
     [0x09] = X86_OP_ENTRY3(PSIGNW,    V,x,        H,x,  W,x,  vex4 
cpuid(SSSE3) mmx avx2_256 p_00_66),
     [0x0a] = X86_OP_ENTRY3(PSIGND,    V,x,        H,x,  W,x,  vex4 
cpuid(SSSE3) mmx avx2_256 p_00_66),
@@ -421,6 +430,33 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
     [0x8c] = X86_OP_ENTRY3(VPMASKMOV,    V,x,  H,x, WM,x, vex6 cpuid(AVX2) 
p_66),
     [0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x,  V,x, H,x,  vex6 cpuid(AVX2) 
p_66),
 
+    [0x98] = X86_OP_ENTRY3(VFMADD132Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x99] = X86_OP_ENTRY3(VFMADD132Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x9a] = X86_OP_ENTRY3(VFMSUB132Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x9b] = X86_OP_ENTRY3(VFMSUB132Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x9c] = X86_OP_ENTRY3(VFNMADD132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x9d] = X86_OP_ENTRY3(VFNMADD132Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x9e] = X86_OP_ENTRY3(VFNMSUB132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0x9f] = X86_OP_ENTRY3(VFNMSUB132Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+
+    [0xa8] = X86_OP_ENTRY3(VFMADD213Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xa9] = X86_OP_ENTRY3(VFMADD213Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xaa] = X86_OP_ENTRY3(VFMSUB213Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xab] = X86_OP_ENTRY3(VFMSUB213Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xac] = X86_OP_ENTRY3(VFNMADD213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xad] = X86_OP_ENTRY3(VFNMADD213Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xae] = X86_OP_ENTRY3(VFNMSUB213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xaf] = X86_OP_ENTRY3(VFNMSUB213Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+
+    [0xb8] = X86_OP_ENTRY3(VFMADD231Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xb9] = X86_OP_ENTRY3(VFMADD231Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xba] = X86_OP_ENTRY3(VFMSUB231Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xbb] = X86_OP_ENTRY3(VFMSUB231Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xbc] = X86_OP_ENTRY3(VFNMADD231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xbd] = X86_OP_ENTRY3(VFNMADD231Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xbe] = X86_OP_ENTRY3(VFNMSUB231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+    [0xbf] = X86_OP_ENTRY3(VFNMSUB231Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) 
p_66),
+
     [0xdb] = X86_OP_ENTRY3(VAESIMC,     V,dq, None,None, W,dq, vex4 cpuid(AES) 
p_66),
     [0xdc] = X86_OP_ENTRY3(VAESENC,     V,x,  H,x,       W,x,  vex4 cpuid(AES) 
p_66),
     [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x,  H,x,       W,x,  vex4 cpuid(AES) 
p_66),
@@ -1350,6 +1386,8 @@ static bool has_cpuid_feature(DisasContext *s, 
X86CPUIDFeature cpuid)
         return true;
     case X86_FEAT_F16C:
         return (s->cpuid_ext_features & CPUID_EXT_F16C);
+    case X86_FEAT_FMA:
+        return (s->cpuid_ext_features & CPUID_EXT_FMA);
     case X86_FEAT_MOVBE:
         return (s->cpuid_ext_features & CPUID_EXT_MOVBE);
     case X86_FEAT_PCLMULQDQ:
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index 0ef54628ee..cb6b8bcf67 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -105,6 +105,7 @@ typedef enum X86CPUIDFeature {
     X86_FEAT_BMI1,
     X86_FEAT_BMI2,
     X86_FEAT_F16C,
+    X86_FEAT_FMA,
     X86_FEAT_MOVBE,
     X86_FEAT_PCLMULQDQ,
     X86_FEAT_SSE,
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 9334f0939d..9e234f71f7 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -491,6 +491,49 @@ FP_SSE(VMIN, min)
 FP_SSE(VDIV, div)
 FP_SSE(VMAX, max)
 
+#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                         
    \
+static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode) \
+{                                                                              
    \
+    SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : 
gen_helper_##lname##ps_xmm; \
+    SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : 
gen_helper_##lname##ps_ymm; \
+    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;                                 
    \
+                                                                               
    \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                    
    \
+}
+
+#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2)                                
    \
+FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                                 
    \
+static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode) \
+{                                                                              
    \
+    SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : 
gen_helper_##lname##ss; \
+                                                                               
    \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                    
    \
+}                                                                              
    \
+
+FMA_SSE(VFMADD231,    fmadd,    OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMADD213,    fmadd,    OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMADD132,    fmadd,    OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMADD231,   fmnadd,   OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMADD213,   fmnadd,   OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMADD132,   fmnadd,   OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFMSUB231,    fmsub,    OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMSUB213,    fmsub,    OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMSUB132,    fmsub,    OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMSUB231,   fmnsub,   OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMSUB213,   fmnsub,   OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMSUB132,   fmnsub,   OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
 #define FP_UNPACK_SSE(uname, lname)                                            
    \
 static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode) \
 {                                                                              
    \
diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py
index ebb1d99c5e..d9ca00a49e 100755
--- a/tests/tcg/i386/test-avx.py
+++ b/tests/tcg/i386/test-avx.py
@@ -9,7 +9,7 @@
 archs = [
     "SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2",
     "AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX",
-    "F16C",
+    "F16C", "FMA",
 ]
 
 ignore = set(["FISTTP",
-- 
2.37.3


Reply via email to