On 10/20/22 01:06, Paolo Bonzini wrote:
The only issue with FMA instructions is that there are _a lot_ of them
(30 opcodes, each of which comes in up to 4 versions depending on VEX.W
and VEX.L).
We can reduce the number of helpers to one third by passing four operands
(one output and three inputs); the reordering of which operands go to
the multiply and which go to the add is done in emit.c.
Scalar versions do not do any merging; they only affect the bottom 32
or 64 bits of the output operand. Therefore, there are no separate XMM
and YMM versions of the scalar helpers.
Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
target/i386/cpu.c | 5 ++-
target/i386/ops_sse.h | 63 ++++++++++++++++++++++++++++++++
target/i386/ops_sse_header.h | 28 ++++++++++++++
target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
target/i386/tcg/decode-new.h | 1 +
target/i386/tcg/emit.c.inc | 43 ++++++++++++++++++++++
tests/tcg/i386/test-avx.py | 2 +-
7 files changed, 177 insertions(+), 3 deletions(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 6292b7e12f..22b681ca37 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \
CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
- CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
+ CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
+ CPUID_EXT_FMA)
/* missing:
CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
- CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
+ CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 33c61896ee..041a048a70 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
}
#endif
+/* FMA3 op helpers */
+#if SHIFT == 1
+#define SSE_HELPER_FMAS(name, elem, F)                                             \
+    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)                    \
+    {                                                                              \
+        d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0));                        \
+    }
+#define SSE_HELPER_FMAP(name, elem, num, F)                                        \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)      \
+    {                                                                              \
+        int i;                                                                     \
+        for (i = 0; i < num; i++) {                                                \
+            d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i));                    \
+        }                                                                          \
+    }
+
+#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
+#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
+
+#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+
+#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+
+#define FMNSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+#define FMNSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+
+#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+
+#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+
+SSE_HELPER_FMAS(helper_fmaddss, ZMM_S, FMADD32)
+SSE_HELPER_FMAS(helper_fmaddsd, ZMM_D, FMADD64)
+SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S, FMNADD32)
+SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D, FMNADD64)
+SSE_HELPER_FMAS(helper_fmsubss, ZMM_S, FMSUB32)
+SSE_HELPER_FMAS(helper_fmsubsd, ZMM_D, FMSUB64)
+SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S, FMNSUB32)
+SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D, FMNSUB64)
Would it be worth passing the muladd constant(s) as a parameter to a reduced number of
helper functions?
E.g.
void fmas_name(..., int flags)
{
d = type_muladd(a, b, c, flags, status);
}
void fmap_name(..., int flags2)
{
    int f_even = flags2 & 0xf;
    int f_odd = flags2 >> 4;
    for (int i = 0; i < num; ) {
        d(i) = type_muladd(a(i), b(i), c(i), f_even, status);
        i++;
        d(i) = type_muladd(a(i), b(i), c(i), f_odd, status);
        i++;
    }
}
+#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                                 \
+static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                      \
+    SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : gen_helper_##lname##ps_xmm; \
+    SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : gen_helper_##lname##ps_ymm; \
+    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;                                         \
+                                                                                       \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                            \
+}
+
+#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2)                                        \
+FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                                         \
+static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                      \
+    SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : gen_helper_##lname##ss;   \
+                                                                                       \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                            \
+}
+
+FMA_SSE(VFMADD231, fmadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMADD213, fmadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMADD132, fmadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMADD231, fmnadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMADD213, fmnadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMADD132, fmnadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFMSUB231, fmsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMSUB213, fmsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMSUB132, fmsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMSUB231, fmnsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMSUB213, fmnsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMSUB132, fmnsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
Is it more or less confusing to macroize this further?
#define MULADD_S_VFMADD 0
#define MULADD_S_VFMSUB float_muladd_negate_c
...
#define MULADD_P_VFMADD (MULADD_S_VFMADD * 0x11)
#define MULADD_P_VFMSUB (MULADD_S_VFMSUB * 0x11)
...
#define MULADD_P_VFMADDSUB (MULADD_S_VFMADD * 0x10 + MULADD_S_VFMSUB)
#define MULADD_P_VFMSUBADD (MULADD_S_VFMSUB * 0x10 + MULADD_S_VFMADD)
#define OP_PTR_1_231 OP_PTR1
#define OP_PTR_2_231 OP_PTR2
#define OP_PTR_3_231 OP_PTR0
...
#define FMA_SSE_PACKED(uname, lname, order) \
static void name(args) { \
fn = select; \
fn(cpu_env, OP_PTR0, \
glue(OP_PTR_1_,order), \
glue(OP_PTR_2_,order), \
glue(OP_PTR_3_,order), \
        tcg_constant_i32(glue(MULADD_P_, uname))); \
}
#define FMA_SSE(uname, lname, order) \
FMA_SSE_PACKED(uname, lname, order) \
static void name(args) { \
fn = select; \
fn(cpu_env, OP_PTR0, \
glue(OP_PTR_1_,order), \
glue(OP_PTR_2_,order), \
glue(OP_PTR_3_,order), \
        tcg_constant_i32(glue(MULADD_S_, uname))); \
}
FMA_SSE(VFMADD, fmadd, 231)
FMA_SSE(VFMADD, fmadd, 213)
FMA_SSE(VFMADD, fmadd, 132)
etc.
r~