From: "Zhang, Jun" <jun.zh...@intel.com> gcc/ChangeLog:
* config/i386/i386-expand.cc (ix86_ssecom_setcc): Mention behavior change on flags. (ix86_expand_sse_comi): Handle AVX10.2 behavior. (ix86_expand_sse_comi_round): Ditto. (ix86_expand_round_builtin): Ditto. (ix86_expand_builtin): Change function call. * config/i386/i386.md (UNSPEC_COMX): New unspec. * config/i386/sse.md (avx10_2_<unord>comx<mode><round_saeonly_name>): New. (<sse>_<unord>comi<round_saeonly_name>): Add HFmode. gcc/testsuite/ChangeLog: * gcc.target/i386/avx10_2-compare-1.c: New test. Co-authored-by: Haochen Jiang <haochen.ji...@intel.com> Co-authored-by: Hongtao Liu <hongtao....@intel.com> --- gcc/config/i386/i386-expand.cc | 170 +++++++++++++++--- gcc/config/i386/i386.md | 1 + gcc/config/i386/sse.md | 18 +- .../gcc.target/i386/avx10_2-compare-1.c | 21 +++ 4 files changed, 183 insertions(+), 27 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-compare-1.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 0322ef003d1..cdeb8b14eb7 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -10664,7 +10664,9 @@ ix86_ssecom_setcc (const enum rtx_code comparison, rtx_code_label *label = NULL; /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient - with NAN operands. */ + with NAN operands. + Under TARGET_AVX10_2_256, VCOMX/VUCOMX are generated instead of + COMI/UCOMI. VCOMX/VUCOMX will not set ZF for NAN operands. 
*/ if (check_unordered) { gcc_assert (comparison == EQ || comparison == NE); @@ -10703,7 +10705,7 @@ ix86_ssecom_setcc (const enum rtx_code comparison, static rtx ix86_expand_sse_comi (const struct builtin_description *d, tree exp, - rtx target) + rtx target, bool comx_ok) { rtx pat, set_dst; tree arg0 = CALL_EXPR_ARG (exp, 0); @@ -10736,11 +10738,13 @@ ix86_expand_sse_comi (const struct builtin_description *d, tree exp, case GE: break; case EQ: - check_unordered = true; + if (!TARGET_AVX10_2_256 || !comx_ok) + check_unordered = true; mode = CCZmode; break; case NE: - check_unordered = true; + if (!TARGET_AVX10_2_256 || !comx_ok) + check_unordered = true; mode = CCZmode; const_val = const1_rtx; break; @@ -10759,6 +10763,28 @@ ix86_expand_sse_comi (const struct builtin_description *d, tree exp, || !insn_p->operand[1].predicate (op1, mode1)) op1 = copy_to_mode_reg (mode1, op1); + if ((comparison == EQ || comparison == NE) + && TARGET_AVX10_2_256 && comx_ok) + { + switch (icode) + { + case CODE_FOR_sse_comi: + icode = CODE_FOR_avx10_2_comxsf; + break; + case CODE_FOR_sse_ucomi: + icode = CODE_FOR_avx10_2_ucomxsf; + break; + case CODE_FOR_sse2_comi: + icode = CODE_FOR_avx10_2_comxdf; + break; + case CODE_FOR_sse2_ucomi: + icode = CODE_FOR_avx10_2_ucomxdf; + break; + + default: + gcc_unreachable (); + } + } pat = GEN_FCN (icode) (op0, op1); if (! pat) return 0; @@ -12253,7 +12279,7 @@ ix86_erase_embedded_rounding (rtx pat) with rounding. 
*/ static rtx ix86_expand_sse_comi_round (const struct builtin_description *d, - tree exp, rtx target) + tree exp, rtx target, bool comx_ok) { rtx pat, set_dst; tree arg0 = CALL_EXPR_ARG (exp, 0); @@ -12315,6 +12341,7 @@ ix86_expand_sse_comi_round (const struct builtin_description *d, op1 = safe_vector_operand (op1, mode1); enum rtx_code comparison = comparisons[INTVAL (op2)]; + enum rtx_code orig_comp = comparison; bool ordered = ordereds[INTVAL (op2)]; bool non_signaling = non_signalings[INTVAL (op2)]; rtx const_val = const0_rtx; @@ -12326,10 +12353,21 @@ ix86_expand_sse_comi_round (const struct builtin_description *d, case ORDERED: if (!ordered) { - /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */ - if (!non_signaling) - ordered = true; - mode = CCSmode; + if (TARGET_AVX10_2_256 && comx_ok) + { + /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF + differently. So directly return true here. */ + target = gen_reg_rtx (SImode); + emit_move_insn (target, const1_rtx); + return target; + } + else + { + /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */ + if (!non_signaling) + ordered = true; + mode = CCSmode; + } } else { @@ -12343,10 +12381,21 @@ ix86_expand_sse_comi_round (const struct builtin_description *d, case UNORDERED: if (ordered) { - /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */ - if (non_signaling) - ordered = false; - mode = CCSmode; + if (TARGET_AVX10_2_256 && comx_ok) + { + /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF + differently. So directly return false here. */ + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + return target; + } + else + { + /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */ + if (non_signaling) + ordered = false; + mode = CCSmode; + } } else { @@ -12377,17 +12426,23 @@ ix86_expand_sse_comi_round (const struct builtin_description *d, if (ordered == non_signaling) ordered = !ordered; break; - case EQ: /* NB: COMI/UCOMI will set ZF with NAN operands. 
Use CCZmode for - _CMP_EQ_OQ/_CMP_EQ_OS. */ - check_unordered = true; + _CMP_EQ_OQ/_CMP_EQ_OS. + Under TARGET_AVX10_2_256, VCOMX/VUCOMX are always generated instead + of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN. */ + case EQ: + if (!TARGET_AVX10_2_256 || !comx_ok) + check_unordered = true; mode = CCZmode; break; case NE: /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for - _CMP_NEQ_UQ/_CMP_NEQ_US. */ + _CMP_NEQ_UQ/_CMP_NEQ_US. + Under TARGET_AVX10_2_256, VCOMX/VUCOMX are always generated instead + of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN. */ gcc_assert (!ordered); - check_unordered = true; + if (!TARGET_AVX10_2_256 || !comx_ok) + check_unordered = true; mode = CCZmode; const_val = const1_rtx; break; @@ -12406,14 +12461,77 @@ ix86_expand_sse_comi_round (const struct builtin_description *d, || !insn_p->operand[1].predicate (op1, mode1)) op1 = copy_to_mode_reg (mode1, op1); + /* Generate comx instead of comi when EQ/NE to avoid NAN checks. + Use orig_comp to exclude ORDERED/UNORDERED cases. */ + if ((orig_comp == EQ || orig_comp == NE) + && TARGET_AVX10_2_256 && comx_ok) + { + switch (icode) + { + case CODE_FOR_avx512fp16_comi_round: + icode = CODE_FOR_avx10_2_comxhf_round; + break; + case CODE_FOR_sse_comi_round: + icode = CODE_FOR_avx10_2_comxsf_round; + break; + case CODE_FOR_sse2_comi_round: + icode = CODE_FOR_avx10_2_comxdf_round; + break; + + default: + break; + } + } + + /* Generate comi instead of comx when UNEQ/LTGT to avoid NAN checks. */ + if ((comparison == UNEQ || comparison == LTGT) + && TARGET_AVX10_2_256 && comx_ok) + { + switch (icode) + { + case CODE_FOR_avx10_2_comxhf_round: + icode = CODE_FOR_avx512fp16_comi_round; + break; + case CODE_FOR_avx10_2_comxsf_round: + icode = CODE_FOR_sse_comi_round; + break; + case CODE_FOR_avx10_2_comxdf_round: + icode = CODE_FOR_sse2_comi_round; + break; + + default: + break; + } + } + /* - 1. COMI: ordered and signaling. - 2. UCOMI: unordered and non-signaling. + 1. 
COMI/VCOMX: ordered and signaling. + 2. UCOMI/VUCOMX: unordered and non-signaling. */ if (non_signaling) - icode = (icode == CODE_FOR_sse_comi_round - ? CODE_FOR_sse_ucomi_round - : CODE_FOR_sse2_ucomi_round); + switch (icode) + { + case CODE_FOR_sse_comi_round: + icode = CODE_FOR_sse_ucomi_round; + break; + case CODE_FOR_sse2_comi_round: + icode = CODE_FOR_sse2_ucomi_round; + break; + case CODE_FOR_avx512fp16_comi_round: + icode = CODE_FOR_avx512fp16_ucomi_round; + break; + case CODE_FOR_avx10_2_comxsf_round: + icode = CODE_FOR_avx10_2_ucomxsf_round; + break; + case CODE_FOR_avx10_2_comxhf_round: + icode = CODE_FOR_avx10_2_ucomxhf_round; + break; + case CODE_FOR_avx10_2_comxdf_round: + icode = CODE_FOR_avx10_2_ucomxdf_round; + break; + default: + gcc_unreachable (); + } pat = GEN_FCN (icode) (op0, op1, op3); if (! pat) @@ -12550,7 +12668,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, break; case INT_FTYPE_V4SF_V4SF_INT_INT: case INT_FTYPE_V2DF_V2DF_INT_INT: - return ix86_expand_sse_comi_round (d, exp, target); + return ix86_expand_sse_comi_round (d, exp, target, true); case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI_INT: case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: @@ -15691,7 +15809,7 @@ rdseed_step: case IX86_BUILTIN_VCOMSBF16GE: case IX86_BUILTIN_VCOMSBF16LT: case IX86_BUILTIN_VCOMSBF16LE: - return ix86_expand_sse_comi (bdesc_args + i, exp, target); + return ix86_expand_sse_comi (bdesc_args + i, exp, target, false); case IX86_BUILTIN_FABSQ: case IX86_BUILTIN_COPYSIGNQ: if (!TARGET_SSE) @@ -15707,7 +15825,7 @@ rdseed_step: && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) { i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; - return ix86_expand_sse_comi (bdesc_comi + i, exp, target); + return ix86_expand_sse_comi (bdesc_comi + i, exp, target, true); } if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index e28f9bb5eae..ab6059759b4 100644 --- a/gcc/config/i386/i386.md +++ 
b/gcc/config/i386/i386.md @@ -139,6 +139,7 @@ UNSPEC_SCALEF UNSPEC_PCMP UNSPEC_CVTBFSF + UNSPEC_COMX ;; Generic math support UNSPEC_IEEE_MIN ; not commutative diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 93aa6d46ae4..db538ac4ad5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4692,6 +4692,22 @@ (set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) +(define_insn "avx10_2_<unord>comx<mode><round_saeonly_name>" + [(set (reg:CCFP FLAGS_REG) + (unspec:CCFP + [(vec_select:MODEFH + (match_operand:<ssevecmode> 0 "register_operand" "v") + (parallel [(const_int 0)])) + (vec_select:MODEFH + (match_operand:<ssevecmode> 1 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") + (parallel [(const_int 0)]))] + UNSPEC_COMX))] + "TARGET_AVX10_2_256" + "v<unord>comx<ssemodesuffix>\t{<round_saeonly_op2>%1, %0|%0, %<iptr>1<round_saeonly_op2>}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + (define_insn "<sse>_<unord>comi<round_saeonly_name>" [(set (reg:CCFP FLAGS_REG) (compare:CCFP @@ -4701,7 +4717,7 @@ (vec_select:MODEFH (match_operand:<ssevecmode> 1 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") (parallel [(const_int 0)]))))] - "SSE_FLOAT_MODE_P (<MODE>mode)" + "SSE_FLOAT_MODE_P (<MODE>mode) || <MODE>mode == E_HFmode" "%v<unord>comi<ssemodesuffix>\t{<round_saeonly_op2>%1, %0|%0, %<iptr>1<round_saeonly_op2>}" [(set_attr "type" "ssecomi") (set_attr "prefix" "maybe_vex") diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-compare-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-compare-1.c new file mode 100644 index 00000000000..99d32186e6b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-compare-1.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-final { scan-assembler-times "vcomxsd\[ \\t\]+\{sae\}\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcomxss\[ 
\\t\]+\{sae\}\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vucomxsd\[ \\t\]+\{sae\}\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vucomxss\[ \\t\]+\{sae\}\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m128 x3; +volatile __m128d x4; +volatile int a; + +void extern +avx10_2_test (void) +{ + a = _mm_comi_round_sd (x4, x4, _CMP_EQ_OS, _MM_FROUND_NO_EXC); + a = _mm_comi_round_ss (x3, x3, _CMP_NEQ_US, _MM_FROUND_NO_EXC); + a = _mm_comi_round_sd (x4, x4, _CMP_EQ_OQ, _MM_FROUND_NO_EXC); + a = _mm_comi_round_ss (x3, x3, _CMP_NEQ_UQ, _MM_FROUND_NO_EXC); +} -- 2.43.5