Hi,
testing matrix multiplication benchmarks shows that FMA on a critical chain
is a perofrmance loss over separate multiply and add. While the latency of 4
is lower than multiply + add (3+2) the problem is that all values needs to
be ready before computation starts.

While on znver4 AVX512 code fared well with FMA, it was because of the
split registers. Znver5 benefits from avoding FMA on all widths.  This
may be different with the mobile version though.

On naive matrix multiplication benchmark the difference is 8% with -O3
only since with -Ofast loop interchange solves the problem differently.
It is 30% win, for example, on s323 from TSVC:

real_t s323(struct args_t * func_args)
{

//    recurrences
//    coupled recurrence

    initialise_arrays(__func__);
    gettimeofday(&func_args->t1, NULL);

    for (int nl = 0; nl < iterations/2; nl++) {
        for (int i = 1; i < LEN_1D; i++) {
            a[i] = b[i-1] + c[i] * d[i];
            b[i] = a[i] + c[i] * e[i];
        }   
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }   
    
    gettimeofday(&func_args->t2, NULL);
    return calc_checksum(__func__);
}   

Bootstrapped/regtesed x86_64-linux, will commit it shortly.

gcc/ChangeLog:

        * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable for
        znver5.
        (X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
        (X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 3d29bffc49c..da1a3d6a3c6 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -508,17 +508,18 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
           | m_YONGFENG | m_SHIJIDADAO | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3 | m_ZNVER4
-         | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
+         m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+         | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
    for v2df vector reduction.  */

Reply via email to