On 10/20/22 01:06, Paolo Bonzini wrote:
The only issue with FMA instructions is that there are _a lot_ of them
(30 opcodes, each of which comes in up to 4 versions depending on VEX.W
and VEX.L).
We can reduce the number of helpers to one third by passing four operands
(one output and three inputs); the reordering of which operands go to
the multiply and which go to the add is done in emit.c.
Scalar versions do not do any merging; they only affect the bottom 32
or 64 bits of the output operand. Therefore, there are no separate XMM
and YMM versions of the scalar helpers.
Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
target/i386/cpu.c | 5 ++-
target/i386/ops_sse.h | 63 ++++++++++++++++++++++++++++++++
target/i386/ops_sse_header.h | 28 ++++++++++++++
target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
target/i386/tcg/decode-new.h | 1 +
target/i386/tcg/emit.c.inc | 43 ++++++++++++++++++++++
tests/tcg/i386/test-avx.py | 2 +-
7 files changed, 177 insertions(+), 3 deletions(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 6292b7e12f..22b681ca37 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \
CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
- CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
+ CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
+ CPUID_EXT_FMA)
/* missing:
CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
- CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
+ CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 33c61896ee..041a048a70 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
}
#endif
+/* FMA3 op helpers */
+#if SHIFT == 1
+#define SSE_HELPER_FMAS(name, elem, F)                                             \
+    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)                    \
+    {                                                                              \
+        d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0));                        \
+    }
+#define SSE_HELPER_FMAP(name, elem, num, F)                                        \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)      \
+    {                                                                              \
+        int i;                                                                     \
+        for (i = 0; i < num; i++) {                                                \
+            d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i));                    \
+        }                                                                          \
+    }
+
+#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
+#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
+
+#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+
+#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+
+#define FMNSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+#define FMNSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+
+#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+
+#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+
+SSE_HELPER_FMAS(helper_fmaddss, ZMM_S, FMADD32)
+SSE_HELPER_FMAS(helper_fmaddsd, ZMM_D, FMADD64)
+SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S, FMNADD32)
+SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D, FMNADD64)
+SSE_HELPER_FMAS(helper_fmsubss, ZMM_S, FMSUB32)
+SSE_HELPER_FMAS(helper_fmsubsd, ZMM_D, FMSUB64)
+SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S, FMNSUB32)
+SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D, FMNSUB64)
Would it be worth passing the muladd constant(s) as a parameter to a reduced number of
helper functions?
E.g.
void fmas_name(..., int flags)
{
d = type_muladd(a, b, c, flags, status);
}
void fmap_name(..., int flags2)
{
    int f_even = flags2 & 0xf;
    int f_odd = flags2 >> 4;
    for (int i = 0; i < num; ) {
        d(i) = type_muladd(a(i), b(i), c(i), f_even, status);
        i++;
        d(i) = type_muladd(a(i), b(i), c(i), f_odd, status);
        i++;
    }
}
+#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                                 \
+static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                      \
+    SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : gen_helper_##lname##ps_xmm; \
+    SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : gen_helper_##lname##ps_ymm; \
+    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;                                         \
+                                                                                       \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                            \
+}
+
+#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2)                                        \
+FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                                         \
+static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                      \
+    SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : gen_helper_##lname##ss;   \
+                                                                                       \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                            \
+}
+
+FMA_SSE(VFMADD231, fmadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMADD213, fmadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMADD132, fmadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMADD231, fmnadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMADD213, fmnadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMADD132, fmnadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFMSUB231, fmsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMSUB213, fmsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMSUB132, fmsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMSUB231, fmnsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMSUB213, fmnsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMSUB132, fmnsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
Is it more or less confusing to macroize this further?
#define MULADD_S_VFMADD 0
#define MULADD_S_VFMSUB float_muladd_negate_c
...
#define MULADD_P_VFMADD (MULADD_S_VFMADD * 0x11)
#define MULADD_P_VFMSUB (MULADD_S_VFMSUB * 0x11)
...
#define MULADD_P_VFMADDSUB (MULADD_S_VFMADD * 0x10 + MULADD_S_VFMSUB)
#define MULADD_P_VFMSUBADD (MULADD_S_VFMSUB * 0x10 + MULADD_S_VFMADD)
#define OP_PTR_1_231 OP_PTR1
#define OP_PTR_2_231 OP_PTR2
#define OP_PTR_3_231 OP_PTR0
...
#define FMA_SSE_PACKED(uname, lname, order) \
static void name(args) { \
fn = select; \
fn(cpu_env, OP_PTR0, \
glue(OP_PTR_1_,order), \
glue(OP_PTR_2_,order), \
glue(OP_PTR_3_,order), \
        tcg_constant_i32(glue(MULADD_P_, uname))); \
}
#define FMA_SSE(uname, lname, order) \
FMA_SSE_PACKED(uname, lname, order) \
static void name(args) { \
fn = select; \
fn(cpu_env, OP_PTR0, \
glue(OP_PTR_1_,order), \
glue(OP_PTR_2_,order), \
glue(OP_PTR_3_,order), \
        tcg_constant_i32(glue(MULADD_S_, uname))); \
}
FMA_SSE(VFMADD, fmadd, 231)
FMA_SSE(VFMADD, fmadd, 213)
FMA_SSE(VFMADD, fmadd, 132)
etc.
r~