The following patch may help with partial SSE register dependencies for the
{R,}SQRTS{S,D}, RCPS{S,D} and ROUNDS{S,D} instructions.  It takes the same
strategy as both ICC and clang, that is: a) load from memory with MOVS{S,D},
and b) in the SSE case, match the input and output register.  The
implementation uses the preferred_for_speed attribute, so in cold sections or
when compiling with -Os the compiler is still able to generate a direct load
from memory (SSE, AVX) and use unmatched registers on SSE targets.

sqrt from memory is now compiled to:

        movsd   z(%rip), %xmm0
        sqrtsd  %xmm0, %xmm0

(SSE) or

        vmovsd  z(%rip), %xmm1
        vsqrtsd %xmm1, %xmm1, %xmm0

(AVX), and sqrt from an unmatched input register compiles to:

        sqrtsd  %xmm1, %xmm1
        movapd  %xmm1, %xmm0

(SSE) or

        vsqrtsd %xmm1, %xmm1, %xmm0

(AVX).

The patch doesn't touch conversion instructions, where XOR clearing is
preferred (pending patch for PR 87007).

2019-02-03  Uroš Bizjak  <ubiz...@gmail.com>

        PR target/89071
        * config/i386/i386.md (*sqrt<mode>2_sse): Add (v,0) alternative.
        Do not prefer (v,v) alternative for non-AVX targets and (m,v)
        alternative for speed when TARGET_SSE_PARTIAL_REG_DEPENDENCY is set.
        (*rcpsf2_sse): Ditto.
        (*rsqrtsf2_sse): Ditto.
        (sse4_1_round<mode>2): Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
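For reference (not part of the submitted patch), a small testcase along the
following lines is expected to show the sequences above; the function names
and the option choice -O2 -fno-math-errno (plus -mavx for the AVX variants)
are mine:

    /* Illustrative only: the operand comes from memory, so per the
       description above this should assemble to movsd/sqrtsd (SSE)
       or vmovsd/vsqrtsd (AVX).  */
    double z;

    double
    sqrt_mem (void)
    {
      return __builtin_sqrt (z);
    }

    /* Illustrative only: the input arrives in %xmm1 while the result is
       returned in %xmm0, i.e. the unmatched-register case, which should
       assemble to sqrtsd + movapd (SSE) or a single vsqrtsd (AVX).  */
    double
    sqrt_reg (double x, double y)
    {
      return __builtin_sqrt (y);
    }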
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 744f155fca6f..9948f77fca53 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4472,9 +4472,9 @@
    (set (match_dup 0) (float_extend:DF (match_dup 2)))]
   "operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
 
-;; Break partial reg stall for cvtss2sd. This splitter should split
-;; late in the pass sequence (after register rename pass),
-;; so allocated registers won't change anymore.
+;; Break partial SSE register dependency stall. This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore
 
 (define_split
   [(set (match_operand:DF 0 "sse_reg_operand")
@@ -4632,9 +4632,9 @@
    (set (match_dup 0) (float_truncate:SF (match_dup 2)))]
   "operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
 
-;; Break partial reg stall for cvtsd2ss. This splitter should split
-;; late in the pass sequence (after register rename pass),
-;; so allocated registers won't change anymore.
+;; Break partial SSE register dependency stall. This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore
 
 (define_split
   [(set (match_operand:SF 0 "sse_reg_operand")
@@ -5137,7 +5137,7 @@
    (set_attr "unit" "i387")
    (set_attr "fp_int_src" "true")])
 
-;; Avoid partial SSE register dependency stalls. This splitter should split
+;; Break partial SSE register dependency stall. This splitter should split
 ;; late in the pass sequence (after register rename pass), so allocated
 ;; registers won't change anymore
 
@@ -14765,18 +14765,26 @@
              (symbol_ref "false"))))])
 
 (define_insn "*rcpsf2_sse"
-  [(set (match_operand:SF 0 "register_operand" "=x,x")
-       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "x,m")]
+  [(set (match_operand:SF 0 "register_operand" "=x,x,x")
+       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m")]
                   UNSPEC_RCP))]
   "TARGET_SSE && TARGET_SSE_MATH"
   "@
+   %vrcpss\t{%d1, %0|%0, %d1}
    %vrcpss\t{%d1, %0|%0, %d1}
    %vrcpss\t{%1, %d0|%d0, %1}"
   [(set_attr "type" "sse")
    (set_attr "atom_sse_attr" "rcp")
    (set_attr "btver2_sse_attr" "rcp")
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "SF")])
+   (set_attr "mode" "SF")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "1")
+              (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+            (eq_attr "alternative" "2")
+              (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+           ]
+           (symbol_ref "true")))])
 
 (define_insn "*fop_xf_1_i387"
   [(set (match_operand:XF 0 "register_operand" "=f,f")
@@ -15003,18 +15011,26 @@
    (set_attr "bdver1_decode" "direct")])
 
 (define_insn "*rsqrtsf2_sse"
-  [(set (match_operand:SF 0 "register_operand" "=x,x")
-       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "x,m")]
+  [(set (match_operand:SF 0 "register_operand" "=x,x,x")
+       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m")]
                   UNSPEC_RSQRT))]
   "TARGET_SSE && TARGET_SSE_MATH"
   "@
+   %vrsqrtss\t{%d1, %0|%0, %d1}
    %vrsqrtss\t{%d1, %0|%0, %d1}
    %vrsqrtss\t{%1, %d0|%d0, %1}"
   [(set_attr "type" "sse")
    (set_attr "atom_sse_attr" "rcp")
    (set_attr "btver2_sse_attr" "rcp")
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "SF")])
+   (set_attr "mode" "SF")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "1")
+              (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+            (eq_attr "alternative" "2")
+              (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+           ]
+           (symbol_ref "true")))])
 
 (define_expand "rsqrtsf2"
   [(set (match_operand:SF 0 "register_operand")
@@ -15027,11 +15043,12 @@
 })
 
 (define_insn "*sqrt<mode>2_sse"
-  [(set (match_operand:MODEF 0 "register_operand" "=v,v")
+  [(set (match_operand:MODEF 0 "register_operand" "=v,v,v")
        (sqrt:MODEF
-         (match_operand:MODEF 1 "nonimmediate_operand" "v,m")))]
+         (match_operand:MODEF 1 "nonimmediate_operand" "0,v,m")))]
   "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
   "@
+   %vsqrt<ssemodesuffix>\t{%d1, %0|%0, %d1}
    %vsqrt<ssemodesuffix>\t{%d1, %0|%0, %d1}
    %vsqrt<ssemodesuffix>\t{%1, %d0|%d0, %1}"
   [(set_attr "type" "sse")
@@ -15039,9 +15056,13 @@
    (set_attr "btver2_sse_attr" "sqrt")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<MODE>")
-   (set_attr "athlon_decode" "*")
-   (set_attr "amdfam10_decode" "*")
-   (set_attr "bdver1_decode" "*")])
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "1")
+              (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+            (eq_attr "alternative" "2")
+              (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+           ]
+           (symbol_ref "true")))])
 
 (define_expand "sqrt<mode>2"
   [(set (match_operand:MODEF 0 "register_operand")
@@ -16175,21 +16196,30 @@
 
 (define_insn "sse4_1_round<mode>2"
-  [(set (match_operand:MODEF 0 "register_operand" "=x,x,v")
-       (unspec:MODEF [(match_operand:MODEF 1 "nonimmediate_operand" "x,m,vm")
-                      (match_operand:SI 2 "const_0_to_15_operand" "n,n,n")]
-                     UNSPEC_ROUND))]
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x,x,v")
+       (unspec:MODEF
+         [(match_operand:MODEF 1 "nonimmediate_operand" "0,x,m,vm")
+          (match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n")]
+         UNSPEC_ROUND))]
   "TARGET_SSE4_1"
   "@
+   %vround<ssemodesuffix>\t{%2, %d1, %0|%0, %d1, %2}
    %vround<ssemodesuffix>\t{%2, %d1, %0|%0, %d1, %2}
    %vround<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}
    vrndscale<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
   [(set_attr "type" "ssecvt")
-   (set_attr "prefix_extra" "1,1,*")
-   (set_attr "length_immediate" "*,*,1")
-   (set_attr "prefix" "maybe_vex,maybe_vex,evex")
-   (set_attr "isa" "noavx512f,noavx512f,avx512f")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "prefix_extra" "1,1,1,*")
+   (set_attr "length_immediate" "*,*,*,1")
+   (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,evex")
+   (set_attr "isa" "noavx512f,noavx512f,noavx512f,avx512f")
+   (set_attr "mode" "<MODE>")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "1")
+              (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+            (eq_attr "alternative" "2")
+              (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+           ]
+           (symbol_ref "true")))])
 
 (define_insn "rintxf2"
   [(set (match_operand:XF 0 "register_operand" "=f")