The following fixes the runtime regression of 456.hmmer caused by matching ICC
in code generation and using cmov more aggressively (through GIMPLE-level
MAX_EXPR usage).  Apparently (discovered by manual assembler editing) using the
SSE unit for performing the SImode loads, adds and then the two signed max
operations plus stores is quite a bit faster than cmovs - even faster than the
original single cmov plus branchy second max.  Even more so for AMD CPUs than
for Intel CPUs.
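To illustrate the kind of code this is about, here is a minimal C sketch of
such a memory-to-memory SImode chain (illustrative only, with made-up array
names - this is not the actual 456.hmmer source):

  /* Illustrative sketch only, not 456.hmmer code: per iteration an SImode
     load/add followed by two signed max operations and a store - the shape
     that can stay in SSE registers as paddd/pmaxsd instead of going through
     add/cmp/cmov.  */
  void
  chain (int *dst, const int *a, const int *b, const int *c, int n)
  {
    for (int i = 0; i < n; i++)
      {
        int t = a[i] + b[i];       /* SImode load + add */
        t = t > c[i] ? t : c[i];   /* first signed max (MAX_EXPR) */
        t = t > 0 ? t : 0;         /* second signed max */
        dst[i] = t;                /* SImode store */
      }
  }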
Instead of hacking up some pattern recognition pass to transform integer mode
memory-to-memory computation chains involving conditional moves into "vector"
code (similar to what STV does for TImode ops on x86_64), the following simply
allows SImode into SSE registers (support for this is already there in some
important places like the move patterns!).

For the particular case of 456.hmmer the required support is loads/stores
(already implemented), SImode adds and SImode smax.  So the patch adds an smax
pattern for SImode (we don't have one for any scalar mode; currently this
expands via a conditional-move sequence), emitting either an SSE vector max or
a cmp/cmov pair depending on the alternative - a minimal example of such a
scalar smax is sketched after the patch.  It also amends the *add<mode>_1
pattern with SSE alternatives (which have to come before the memory
alternative, as IRA otherwise doesn't consider reloading a memory operand to a
register).

With this in place the runtime of 456.hmmer improves by 10% on Haswell, which
brings it back to the speed before the regression, though not to the level
seen when manually editing just the single important loop.  I'm currently
benchmarking all of SPEC CPU 2006 on Haswell.  More interesting is probably
Zen, where moves crossing the integer - vector domain are excessively
expensive (they go via the stack).

Clearly this approach will run into register allocation issues, but it looks
cleaner than writing yet another STV-like pass (STV itself is quite awkwardly
structured, so I refrain from touching it...).  Anyway - comments?  It seems
to me that MMX-in-SSE does something very similar.

Bootstrapped on x86_64-unknown-linux-gnu; previous testing revealed an issue -
I forgot that *add<mode>_1 also handles DImode..., fixed below, re-testing in
progress.

Thanks,
Richard.

2019-07-23  Richard Biener  <rguent...@suse.de>

	PR target/91154
	* config/i386/i386.md (smaxsi3): New.
	(*add<mode>_1): Add SSE and AVX variants.
	* config/i386/i386.c (ix86_lea_for_add_ok): Do not allow
	SSE registers.

Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md	(revision 273732)
+++ gcc/config/i386/i386.md	(working copy)
@@ -1881,6 +1881,33 @@ (define_expand "mov<mode>"
   ""
   "ix86_expand_move (<MODE>mode, operands); DONE;")
 
+(define_insn "smaxsi3"
+  [(set (match_operand:SI 0 "register_operand" "=r,v,x")
+	(smax:SI (match_operand:SI 1 "register_operand" "%0,v,0")
+		 (match_operand:SI 2 "register_operand" "r,v,x")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_SSE4_1"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_SSEADD:
+      if (which_alternative == 1)
+	return "vpmaxsd\t{%2, %1, %0|%0, %1, %2}";
+      else
+	return "pmaxsd\t{%2, %0|%0, %2}";
+    case TYPE_ICMOV:
+      /* ??? Instead split this after reload?  */
*/ + return "cmpl\t{%2, %0|%0, %2}\n" + "\tcmovl\t{%2, %0|%0, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx,noavx") + (set_attr "prefix" "orig,vex,orig") + (set_attr "memory" "none") + (set_attr "type" "icmov,sseadd,sseadd")]) + (define_insn "*mov<mode>_xor" [(set (match_operand:SWI48 0 "register_operand" "=r") (match_operand:SWI48 1 "const0_operand")) @@ -5368,10 +5395,10 @@ (define_insn_and_split "*add<dwi>3_doubl }) (define_insn "*add<mode>_1" - [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r") + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,v,x,r,r,r") (plus:SWI48 - (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r") - (match_operand:SWI48 2 "x86_64_general_operand" "re,m,0,le"))) + (match_operand:SWI48 1 "nonimmediate_operand" "%0,v,0,0,r,r") + (match_operand:SWI48 2 "x86_64_general_operand" "re,v,x,*m,0,le"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)" { @@ -5390,10 +5417,23 @@ (define_insn "*add<mode>_1" return "dec{<imodesuffix>}\t%0"; } + case TYPE_SSEADD: + if (which_alternative == 1) + { + if (<MODE>mode == SImode) + return "%vpaddd\t{%2, %1, %0|%0, %1, %2}"; + else + return "%vpaddq\t{%2, %1, %0|%0, %1, %2}"; + } + else if (<MODE>mode == SImode) + return "paddd\t{%2, %0|%0, %2}"; + else + return "paddq\t{%2, %0|%0, %2}"; + default: /* For most processors, ADD is faster than LEA. This alternative was added to use ADD as much as possible. */ - if (which_alternative == 2) + if (which_alternative == 4) std::swap (operands[1], operands[2]); gcc_assert (rtx_equal_p (operands[0], operands[1])); @@ -5403,9 +5443,14 @@ (define_insn "*add<mode>_1" return "add{<imodesuffix>}\t{%2, %0|%0, %2}"; } } - [(set (attr "type") - (cond [(eq_attr "alternative" "3") + [(set_attr "isa" "*,avx,noavx,*,*,*") + (set (attr "type") + (cond [(eq_attr "alternative" "5") (const_string "lea") + (eq_attr "alternative" "1") + (const_string "sseadd") + (eq_attr "alternative" "2") + (const_string "sseadd") (match_operand:SWI48 2 "incdec_operand") (const_string "incdec") ] Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 273732) +++ gcc/config/i386/i386.c (working copy) @@ -14616,6 +14616,9 @@ ix86_lea_for_add_ok (rtx_insn *insn, rtx unsigned int regno1 = true_regnum (operands[1]); unsigned int regno2 = true_regnum (operands[2]); + if (SSE_REGNO_P (regno1)) + return false; + /* If a = b + c, (a!=b && a!=c), must use lea form. */ if (regno0 != regno1 && regno0 != regno2) return true;