The only issue with FMA instructions is that there are _a lot_ of them (30 opcodes, each of which comes in up to 4 versions depending on VEX.W and VEX.L).
We can reduce the number of helpers to one third by passing four operands (one output and three inputs); the reordering of which operands go to the multiply and which go to the add is done in emit.c. Scalar versions do not do any merging; they only affect the bottom 32 or 64 bits of the output operand. Therefore, there is no separate XMM and YMM of the scalar helpers. Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> --- target/i386/cpu.c | 5 ++- target/i386/ops_sse.h | 63 ++++++++++++++++++++++++++++++++ target/i386/ops_sse_header.h | 28 ++++++++++++++ target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++ target/i386/tcg/decode-new.h | 1 + target/i386/tcg/emit.c.inc | 43 ++++++++++++++++++++++ tests/tcg/i386/test-avx.py | 2 +- 7 files changed, 177 insertions(+), 3 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6292b7e12f..22b681ca37 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \ CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \ - CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C) + CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \ + CPUID_EXT_FMA) /* missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, - CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, + CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */ diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 33c61896ee..041a048a70 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s) } #endif +/* FMA3 op helpers */ +#if SHIFT == 1 +#define SSE_HELPER_FMAS(name, elem, F) \ + void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c) \ + { \ + d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0)); \ + } +#define SSE_HELPER_FMAP(name, elem, num, F) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c) \ + { \ + int i; \ + for (i = 0; i < num; i++) { \ + d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i)); \ + } \ + } + +#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status) +#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status) + +#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status) +#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status) + +#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, &env->sse_status) +#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, &env->sse_status) + +#define FMNSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status) +#define FMNSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status) + +#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status) +#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status) + +#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status) +#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status) + +SSE_HELPER_FMAS(helper_fmaddss, ZMM_S, FMADD32) +SSE_HELPER_FMAS(helper_fmaddsd, ZMM_D, FMADD64) +SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S, FMNADD32) +SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D, FMNADD64) +SSE_HELPER_FMAS(helper_fmsubss, ZMM_S, FMSUB32) +SSE_HELPER_FMAS(helper_fmsubsd, ZMM_D, FMSUB64) +SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S, FMNSUB32) +SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D, FMNSUB64) +#endif + +#if SHIFT >= 1 +SSE_HELPER_FMAP(helper_fmaddps, ZMM_S, 2 << SHIFT, FMADD32) +SSE_HELPER_FMAP(helper_fmaddpd, ZMM_D, 1 << SHIFT, FMADD64) + +SSE_HELPER_FMAP(helper_fmnaddps, ZMM_S, 2 << SHIFT, FMNADD32) +SSE_HELPER_FMAP(helper_fmnaddpd, ZMM_D, 1 << SHIFT, FMNADD64) + +SSE_HELPER_FMAP(helper_fmsubps, ZMM_S, 2 << SHIFT, FMSUB32) +SSE_HELPER_FMAP(helper_fmsubpd, ZMM_D, 1 << SHIFT, FMSUB64) + +SSE_HELPER_FMAP(helper_fmnsubps, ZMM_S, 2 << SHIFT, FMNSUB32) +SSE_HELPER_FMAP(helper_fmnsubpd, ZMM_D, 1 << SHIFT, FMNSUB64) + +SSE_HELPER_FMAP(helper_fmaddsubps, ZMM_S, 2 << SHIFT, FMADDSUB32) +SSE_HELPER_FMAP(helper_fmaddsubpd, ZMM_D, 1 << SHIFT, FMADDSUB64) +SSE_HELPER_FMAP(helper_fmsubaddps, ZMM_S, 2 << SHIFT, FMSUBADD32) +SSE_HELPER_FMAP(helper_fmsubaddpd, ZMM_D, 1 << SHIFT, FMSUBADD64) +#endif + #undef SSE_HELPER_S #undef LANE_WIDTH diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index c4c41976c0..1f9a5c9e94 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -359,6 +359,34 @@ DEF_HELPER_3(glue(cvtph2ps, SUFFIX), void, env, Reg, Reg) DEF_HELPER_4(glue(cvtps2ph, SUFFIX), void, env, Reg, Reg, int) #endif +/* FMA3 helpers */ +#if SHIFT == 1 +DEF_HELPER_5(fmaddss, void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(fmaddsd, void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(fmnaddss, void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(fmnaddsd, void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(fmsubss, void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(fmsubsd, void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(fmnsubss, void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(fmnsubsd, void, env, Reg, Reg, Reg, Reg) +#endif + +#if SHIFT >= 1 +DEF_HELPER_5(glue(fmaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmnaddps,SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmnaddpd,SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmnsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmnsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg) + +DEF_HELPER_5(glue(fmaddsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmaddsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmsubaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(fmsubaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg) +#endif + /* AVX helpers */ #if SHIFT >= 1 DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 8baee9018a..8a6b0ae37c 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -376,6 +376,15 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = { [0x92] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vgatherdps/d */ [0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vgatherqps/d */ + [0x96] = X86_OP_ENTRY3(VFMADDSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x97] = X86_OP_ENTRY3(VFMSUBADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xa6] = X86_OP_ENTRY3(VFMADDSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xa7] = X86_OP_ENTRY3(VFMSUBADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xb6] = X86_OP_ENTRY3(VFMADDSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xb7] = X86_OP_ENTRY3(VFMSUBADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x08] = X86_OP_ENTRY3(PSIGNB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), [0x09] = X86_OP_ENTRY3(PSIGNW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), [0x0a] = X86_OP_ENTRY3(PSIGND, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), @@ -421,6 +430,33 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = { [0x8c] = X86_OP_ENTRY3(VPMASKMOV, V,x, H,x, WM,x, vex6 cpuid(AVX2) p_66), [0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x, V,x, H,x, vex6 cpuid(AVX2) p_66), + [0x98] = X86_OP_ENTRY3(VFMADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x99] = X86_OP_ENTRY3(VFMADD132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9a] = X86_OP_ENTRY3(VFMSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9b] = X86_OP_ENTRY3(VFMSUB132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9c] = X86_OP_ENTRY3(VFNMADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9d] = X86_OP_ENTRY3(VFNMADD132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9e] = X86_OP_ENTRY3(VFNMSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9f] = X86_OP_ENTRY3(VFNMSUB132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xa8] = X86_OP_ENTRY3(VFMADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xa9] = X86_OP_ENTRY3(VFMADD213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xaa] = X86_OP_ENTRY3(VFMSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xab] = X86_OP_ENTRY3(VFMSUB213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xac] = X86_OP_ENTRY3(VFNMADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xad] = X86_OP_ENTRY3(VFNMADD213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xae] = X86_OP_ENTRY3(VFNMSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xaf] = X86_OP_ENTRY3(VFNMSUB213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xb8] = X86_OP_ENTRY3(VFMADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xb9] = X86_OP_ENTRY3(VFMADD231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xba] = X86_OP_ENTRY3(VFMSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbb] = X86_OP_ENTRY3(VFMSUB231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbc] = X86_OP_ENTRY3(VFNMADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbd] = X86_OP_ENTRY3(VFNMADD231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbe] = X86_OP_ENTRY3(VFNMSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbf] = X86_OP_ENTRY3(VFNMSUB231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xdb] = X86_OP_ENTRY3(VAESIMC, V,dq, None,None, W,dq, vex4 cpuid(AES) p_66), [0xdc] = X86_OP_ENTRY3(VAESENC, V,x, H,x, W,x, vex4 cpuid(AES) p_66), [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66), @@ -1350,6 +1386,8 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid) return true; case X86_FEAT_F16C: return (s->cpuid_ext_features & CPUID_EXT_F16C); + case X86_FEAT_FMA: + return (s->cpuid_ext_features & CPUID_EXT_FMA); case X86_FEAT_MOVBE: return (s->cpuid_ext_features & CPUID_EXT_MOVBE); case X86_FEAT_PCLMULQDQ: diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h index 0ef54628ee..cb6b8bcf67 100644 --- a/target/i386/tcg/decode-new.h +++ b/target/i386/tcg/decode-new.h @@ -105,6 +105,7 @@ typedef enum X86CPUIDFeature { X86_FEAT_BMI1, X86_FEAT_BMI2, X86_FEAT_F16C, + X86_FEAT_FMA, X86_FEAT_MOVBE, X86_FEAT_PCLMULQDQ, X86_FEAT_SSE, diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index 9334f0939d..9e234f71f7 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -491,6 +491,49 @@ FP_SSE(VMIN, min) FP_SSE(VDIV, div) FP_SSE(VMAX, max) +#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2) \ +static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : gen_helper_##lname##ps_xmm; \ + SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : gen_helper_##lname##ps_ymm; \ + SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm; \ + \ + fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2); \ +} + +#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2) \ +FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2) \ +static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : gen_helper_##lname##ss; \ + \ + fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2); \ +} \ + +FMA_SSE(VFMADD231, fmadd, OP_PTR1, OP_PTR2, OP_PTR0) +FMA_SSE(VFMADD213, fmadd, OP_PTR1, OP_PTR0, OP_PTR2) +FMA_SSE(VFMADD132, fmadd, OP_PTR0, OP_PTR2, OP_PTR1) + +FMA_SSE(VFNMADD231, fmnadd, OP_PTR1, OP_PTR2, OP_PTR0) +FMA_SSE(VFNMADD213, fmnadd, OP_PTR1, OP_PTR0, OP_PTR2) +FMA_SSE(VFNMADD132, fmnadd, OP_PTR0, OP_PTR2, OP_PTR1) + +FMA_SSE(VFMSUB231, fmsub, OP_PTR1, OP_PTR2, OP_PTR0) +FMA_SSE(VFMSUB213, fmsub, OP_PTR1, OP_PTR0, OP_PTR2) +FMA_SSE(VFMSUB132, fmsub, OP_PTR0, OP_PTR2, OP_PTR1) + +FMA_SSE(VFNMSUB231, fmnsub, OP_PTR1, OP_PTR2, OP_PTR0) +FMA_SSE(VFNMSUB213, fmnsub, OP_PTR1, OP_PTR0, OP_PTR2) +FMA_SSE(VFNMSUB132, fmnsub, OP_PTR0, OP_PTR2, OP_PTR1) + +FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0) +FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2) +FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1) + +FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0) +FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2) +FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1) + #define FP_UNPACK_SSE(uname, lname) \ static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ { \ diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py index ebb1d99c5e..d9ca00a49e 100755 --- a/tests/tcg/i386/test-avx.py +++ b/tests/tcg/i386/test-avx.py @@ -9,7 +9,7 @@ archs = [ "SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2", "AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX", - "F16C", + "F16C", "FMA", ] ignore = set(["FISTTP", -- 2.37.3