The only issue with FMA instructions is that there are _a lot_ of them
(30 opcodes, each of which comes in up to 4 versions depending on VEX.W
and VEX.L).
We can reduce the number of helpers to one third by passing four operands
(one output and three inputs); the choice of which operands go to the
multiply and which one goes to the add is made in emit.c.inc.
Scalar versions do not do any merging; they only affect the bottom 32
or 64 bits of the output operand. Therefore, there are no separate XMM
and YMM versions of the scalar helpers.
Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
target/i386/cpu.c | 5 ++-
target/i386/ops_sse.h | 63 ++++++++++++++++++++++++++++++++
target/i386/ops_sse_header.h | 28 ++++++++++++++
target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
target/i386/tcg/decode-new.h | 1 +
target/i386/tcg/emit.c.inc | 43 ++++++++++++++++++++++
tests/tcg/i386/test-avx.py | 2 +-
7 files changed, 177 insertions(+), 3 deletions(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 6292b7e12f..22b681ca37 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \
CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
- CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
+ CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
+ CPUID_EXT_FMA)
/* missing:
CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX,
CPUID_EXT_SMX,
- CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
+ CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID,
CPUID_EXT_DCA,
CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 33c61896ee..041a048a70 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
}
#endif
+/* FMA3 op helpers */
+#if SHIFT == 1
+#define SSE_HELPER_FMAS(name, elem,
F) \
+ void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg
*c) \
+
{ \
+ d->elem(0) = F(a->elem(0), b->elem(0),
c->elem(0)); \
+ }
+#define SSE_HELPER_FMAP(name, elem, num,
F) \
+ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b,
Reg *c) \
+
{ \
+ int
i; \
+ for (i = 0; i < num; i++)
{ \
+ d->elem(i) = F(a->elem(i), b->elem(i),
c->elem(i)); \
+
} \
+ }
+
+#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
+#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
+
+#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product,
&env->sse_status)
+#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product,
&env->sse_status)
[...]