Hi!

If one vcond argument is an all ones (non-bool) vector and the other one
is all zeros, we can use the vpmovm2? insns for AVX512{DQ,BW} (sometimes
+ VL).  While if op_true is all ones and op_false is all zeros we emit
large code that the combiner often optimizes into that vpmovm2?, if the
arguments are swapped we emit vpxor + vpternlog + a masked move (blend),
while we could just invert the mask with knot* and use vpmovm2?.
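For illustration only (this sketch is not part of the patch, and the
intrinsic spelling is mine), here is the swapped case written out with
the standard AVX-512 intrinsics, assuming -O2 -mavx512bw and a 64 x char
comparison:

  /* Illustration, not part of the patch: the sequence the expander can
     now emit for "x <= y" on 64 signed chars.  */
  #include <immintrin.h>

  __m512i
  le_as_vector (__m512i x, __m512i y)
  {
    /* vpcmpgtb yields a 64-bit kmask, one bit per byte element.  */
    __mmask64 gt = _mm512_cmpgt_epi8_mask (x, y);
    /* x <= y is its complement: knotq inverts the kmask and vpmovm2b
       materializes it as a vector of 0 / -1 bytes, instead of the
       previous vpxor + vpternlog + blend sequence.  */
    return _mm512_movm_epi8 (_knot_mask64 (gt));
  }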
Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok
for trunk?  The patch is large, but it is mostly reindentation; in the
attachment there is a diff -ubpd variant of the i386.c changes to make
it more readable.

2018-12-19  Jakub Jelinek  <ja...@redhat.com>

	PR target/88547
	* config/i386/i386.c (ix86_expand_sse_movcc): For maskcmp, try to
	emit vpmovm2? instruction perhaps after knot?.  Reorganize code so
	that it doesn't have to test !maskcmp in almost every conditional.

	* gcc.target/i386/pr88547-1.c: New test.

--- gcc/config/i386/i386.c.jj	2018-12-18 19:40:27.698946295 +0100
+++ gcc/config/i386/i386.c	2018-12-19 17:14:24.948218640 +0100
@@ -23593,33 +23593,117 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       cmp = gen_rtx_SUBREG (mode, cmp, 0);
     }
 
-  if (vector_all_ones_operand (op_true, mode)
-      && rtx_equal_p (op_false, CONST0_RTX (mode))
-      && !maskcmp)
+  if (maskcmp)
+    {
+      rtx (*gen) (rtx, rtx) = NULL;
+      if ((op_true == CONST0_RTX (mode)
+           && vector_all_ones_operand (op_false, mode))
+          || (op_false == CONST0_RTX (mode)
+              && vector_all_ones_operand (op_true, mode)))
+        switch (mode)
+          {
+          case E_V64QImode:
+            if (TARGET_AVX512BW)
+              gen = gen_avx512bw_cvtmask2bv64qi;
+            break;
+          case E_V32QImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2bv32qi;
+            break;
+          case E_V16QImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2bv16qi;
+            break;
+          case E_V32HImode:
+            if (TARGET_AVX512BW)
+              gen = gen_avx512bw_cvtmask2wv32hi;
+            break;
+          case E_V16HImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2wv16hi;
+            break;
+          case E_V8HImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2wv8hi;
+            break;
+          case E_V16SImode:
+            if (TARGET_AVX512DQ)
+              gen = gen_avx512f_cvtmask2dv16si;
+            break;
+          case E_V8SImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2dv8si;
+            break;
+          case E_V4SImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2dv4si;
+            break;
+          case E_V8DImode:
+            if (TARGET_AVX512DQ)
+              gen = gen_avx512f_cvtmask2qv8di;
+            break;
+          case E_V4DImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2qv4di;
+            break;
+          case E_V2DImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2qv2di;
+            break;
+          default:
+            break;
+          }
+      if (gen && SCALAR_INT_MODE_P (cmpmode))
+        {
+          cmp = force_reg (cmpmode, cmp);
+          if (op_true == CONST0_RTX (mode))
+            {
+              rtx (*gen_not) (rtx, rtx);
+              switch (cmpmode)
+                {
+                case E_QImode: gen_not = gen_knotqi; break;
+                case E_HImode: gen_not = gen_knothi; break;
+                case E_SImode: gen_not = gen_knotsi; break;
+                case E_DImode: gen_not = gen_knotdi; break;
+                default: gcc_unreachable ();
+                }
+              rtx n = gen_reg_rtx (cmpmode);
+              emit_insn (gen_not (n, cmp));
+              cmp = n;
+            }
+          emit_insn (gen (dest, cmp));
+          return;
+        }
+    }
+  else if (vector_all_ones_operand (op_true, mode)
+           && op_false == CONST0_RTX (mode))
     {
       emit_insn (gen_rtx_SET (dest, cmp));
+      return;
     }
-  else if (op_false == CONST0_RTX (mode) && !maskcmp)
+  else if (op_false == CONST0_RTX (mode))
     {
       op_true = force_reg (mode, op_true);
       x = gen_rtx_AND (mode, cmp, op_true);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (op_true == CONST0_RTX (mode) && !maskcmp)
+  else if (op_true == CONST0_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_NOT (mode, cmp);
       x = gen_rtx_AND (mode, x, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
-           && !maskcmp)
+  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_IOR (mode, cmp, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (TARGET_XOP && !maskcmp)
+  else if (TARGET_XOP)
     {
       op_true = force_reg (mode, op_true);
@@ -23629,127 +23713,126 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       emit_insn (gen_rtx_SET (dest,
                               gen_rtx_IF_THEN_ELSE (mode, cmp,
                                                     op_true, op_false)));
+      return;
     }
-  else
-    {
-      rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
-      rtx d = dest;
 
-      if (!vector_operand (op_true, mode))
-        op_true = force_reg (mode, op_true);
+  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+  rtx d = dest;
 
-      op_false = force_reg (mode, op_false);
+  if (!vector_operand (op_true, mode))
+    op_true = force_reg (mode, op_true);
 
-      switch (mode)
-        {
-        case E_V4SFmode:
-          if (TARGET_SSE4_1)
-            gen = gen_sse4_1_blendvps;
-          break;
-        case E_V2DFmode:
-          if (TARGET_SSE4_1)
-            gen = gen_sse4_1_blendvpd;
-          break;
-        case E_SFmode:
-          if (TARGET_SSE4_1)
-            {
-              gen = gen_sse4_1_blendvss;
-              op_true = force_reg (mode, op_true);
-            }
-          break;
-        case E_DFmode:
-          if (TARGET_SSE4_1)
-            {
-              gen = gen_sse4_1_blendvsd;
-              op_true = force_reg (mode, op_true);
-            }
-          break;
-        case E_V16QImode:
-        case E_V8HImode:
-        case E_V4SImode:
-        case E_V2DImode:
-          if (TARGET_SSE4_1)
-            {
-              gen = gen_sse4_1_pblendvb;
-              if (mode != V16QImode)
-                d = gen_reg_rtx (V16QImode);
-              op_false = gen_lowpart (V16QImode, op_false);
-              op_true = gen_lowpart (V16QImode, op_true);
-              cmp = gen_lowpart (V16QImode, cmp);
-            }
-          break;
-        case E_V8SFmode:
-          if (TARGET_AVX)
-            gen = gen_avx_blendvps256;
-          break;
-        case E_V4DFmode:
-          if (TARGET_AVX)
-            gen = gen_avx_blendvpd256;
-          break;
-        case E_V32QImode:
-        case E_V16HImode:
-        case E_V8SImode:
-        case E_V4DImode:
-          if (TARGET_AVX2)
-            {
-              gen = gen_avx2_pblendvb;
-              if (mode != V32QImode)
-                d = gen_reg_rtx (V32QImode);
-              op_false = gen_lowpart (V32QImode, op_false);
-              op_true = gen_lowpart (V32QImode, op_true);
-              cmp = gen_lowpart (V32QImode, cmp);
-            }
-          break;
-
-        case E_V64QImode:
-          gen = gen_avx512bw_blendmv64qi;
-          break;
-        case E_V32HImode:
-          gen = gen_avx512bw_blendmv32hi;
-          break;
-        case E_V16SImode:
-          gen = gen_avx512f_blendmv16si;
-          break;
-        case E_V8DImode:
-          gen = gen_avx512f_blendmv8di;
-          break;
-        case E_V8DFmode:
-          gen = gen_avx512f_blendmv8df;
-          break;
-        case E_V16SFmode:
-          gen = gen_avx512f_blendmv16sf;
-          break;
-
-        default:
-          break;
-        }
+  op_false = force_reg (mode, op_false);
 
-      if (gen != NULL)
+  switch (mode)
+    {
+    case E_V4SFmode:
+      if (TARGET_SSE4_1)
+        gen = gen_sse4_1_blendvps;
+      break;
+    case E_V2DFmode:
+      if (TARGET_SSE4_1)
+        gen = gen_sse4_1_blendvpd;
+      break;
+    case E_SFmode:
+      if (TARGET_SSE4_1)
        {
-          emit_insn (gen (d, op_false, op_true, cmp));
-          if (d != dest)
-            emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+          gen = gen_sse4_1_blendvss;
+          op_true = force_reg (mode, op_true);
        }
-      else
+      break;
+    case E_DFmode:
+      if (TARGET_SSE4_1)
        {
+          gen = gen_sse4_1_blendvsd;
          op_true = force_reg (mode, op_true);
+        }
+      break;
+    case E_V16QImode:
+    case E_V8HImode:
+    case E_V4SImode:
+    case E_V2DImode:
+      if (TARGET_SSE4_1)
+        {
+          gen = gen_sse4_1_pblendvb;
+          if (mode != V16QImode)
+            d = gen_reg_rtx (V16QImode);
+          op_false = gen_lowpart (V16QImode, op_false);
+          op_true = gen_lowpart (V16QImode, op_true);
+          cmp = gen_lowpart (V16QImode, cmp);
+        }
+      break;
+    case E_V8SFmode:
+      if (TARGET_AVX)
+        gen = gen_avx_blendvps256;
+      break;
+    case E_V4DFmode:
+      if (TARGET_AVX)
+        gen = gen_avx_blendvpd256;
+      break;
+    case E_V32QImode:
+    case E_V16HImode:
+    case E_V8SImode:
+    case E_V4DImode:
+      if (TARGET_AVX2)
+        {
+          gen = gen_avx2_pblendvb;
+          if (mode != V32QImode)
+            d = gen_reg_rtx (V32QImode);
+          op_false = gen_lowpart (V32QImode, op_false);
+          op_true = gen_lowpart (V32QImode, op_true);
+          cmp = gen_lowpart (V32QImode, cmp);
+        }
+      break;
+
+    case E_V64QImode:
+      gen = gen_avx512bw_blendmv64qi;
+      break;
+    case E_V32HImode:
+      gen = gen_avx512bw_blendmv32hi;
+      break;
+    case E_V16SImode:
+      gen = gen_avx512f_blendmv16si;
+      break;
+    case E_V8DImode:
+      gen = gen_avx512f_blendmv8di;
+      break;
+    case E_V8DFmode:
+      gen = gen_avx512f_blendmv8df;
+      break;
+    case E_V16SFmode:
+      gen = gen_avx512f_blendmv16sf;
+      break;
+
+    default:
+      break;
+    }
+
+  if (gen != NULL)
+    {
+      emit_insn (gen (d, op_false, op_true, cmp));
+      if (d != dest)
+        emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+    }
+  else
+    {
+      op_true = force_reg (mode, op_true);
 
-          t2 = gen_reg_rtx (mode);
-          if (optimize)
-            t3 = gen_reg_rtx (mode);
-          else
-            t3 = dest;
-
-          x = gen_rtx_AND (mode, op_true, cmp);
-          emit_insn (gen_rtx_SET (t2, x));
-
-          x = gen_rtx_NOT (mode, cmp);
-          x = gen_rtx_AND (mode, x, op_false);
-          emit_insn (gen_rtx_SET (t3, x));
+      t2 = gen_reg_rtx (mode);
+      if (optimize)
+        t3 = gen_reg_rtx (mode);
+      else
+        t3 = dest;
 
-          x = gen_rtx_IOR (mode, t3, t2);
-          emit_insn (gen_rtx_SET (dest, x));
-        }
+      x = gen_rtx_AND (mode, op_true, cmp);
+      emit_insn (gen_rtx_SET (t2, x));
+
+      x = gen_rtx_NOT (mode, cmp);
+      x = gen_rtx_AND (mode, x, op_false);
+      emit_insn (gen_rtx_SET (t3, x));
+
+      x = gen_rtx_IOR (mode, t3, t2);
+      emit_insn (gen_rtx_SET (dest, x));
     }
 }
--- gcc/testsuite/gcc.target/i386/pr88547-1.c.jj	2018-12-19 17:20:16.396518647 +0100
+++ gcc/testsuite/gcc.target/i386/pr88547-1.c	2018-12-19 17:28:54.818103169 +0100
@@ -0,0 +1,117 @@
+/* PR target/88547 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512vl -mavx512bw -mavx512dq" } */
+/* { dg-final { scan-assembler-not "vpternlog" } } */
+/* { dg-final { scan-assembler-times "vpmovm2b\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovm2w\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovm2d\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovm2q\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "knotb\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "knotw\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "knotd\[\t ]" 2 } } */
+/* { dg-final { scan-assembler-times "knotq\[\t ]" 2 } } */
+
+typedef signed char v16qi __attribute__((vector_size(64)));
+typedef unsigned char v16uqi __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(64)));
+typedef unsigned short v8uhi __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(64)));
+typedef unsigned v4usi __attribute__((vector_size(64)));
+typedef long long v2di __attribute__((vector_size(64)));
+typedef unsigned long long v2udi __attribute__((vector_size(64)));
+
+v16qi
+f1 (v16qi x, v16qi y)
+{
+  return x <= y;
+}
+
+v16uqi
+f2 (v16uqi x, v16uqi y)
+{
+  return x <= y;
+}
+
+v16qi
+f3 (v16qi x, v16qi y)
+{
+  return x >= y;
+}
+
+v16uqi
+f4 (v16uqi x, v16uqi y)
+{
+  return x >= y;
+}
+
+v8hi
+f5 (v8hi x, v8hi y)
+{
+  return x <= y;
+}
+
+v8uhi
+f6 (v8uhi x, v8uhi y)
+{
+  return x <= y;
+}
+
+v8hi
+f7 (v8hi x, v8hi y)
+{
+  return x >= y;
+}
+
+v8uhi
+f8 (v8uhi x, v8uhi y)
+{
+  return x >= y;
+}
+
+v4si
+f9 (v4si x, v4si y)
+{
+  return x <= y;
+}
+
+v4usi
+f10 (v4usi x, v4usi y)
+{
+  return x <= y;
+}
+
+v4si
+f11 (v4si x, v4si y)
+{
+  return x >= y;
+}
+
+v4usi
+f12 (v4usi x, v4usi y)
+{
+  return x >= y;
+}
+
+v2di
+f13 (v2di x, v2di y)
+{
+  return x <= y;
+}
+
+v2udi
+f14 (v2udi x, v2udi y)
+{
+  return x <= y;
+}
+
+v2di
+f15 (v2di x, v2di y)
+{
+  return x >= y;
+}
+
+v2udi
+f16 (v2udi x, v2udi y)
+{
+  return x >= y;
+}

	Jakub
--- gcc/config/i386/i386.c.jj	2018-12-18 19:40:27.698946295 +0100
+++ gcc/config/i386/i386.c	2018-12-19 17:14:24.948218640 +0100
@@ -23593,33 +23593,117 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       cmp = gen_rtx_SUBREG (mode, cmp, 0);
     }
 
-  if (vector_all_ones_operand (op_true, mode)
-      && rtx_equal_p (op_false, CONST0_RTX (mode))
-      && !maskcmp)
+  if (maskcmp)
+    {
+      rtx (*gen) (rtx, rtx) = NULL;
+      if ((op_true == CONST0_RTX (mode)
+           && vector_all_ones_operand (op_false, mode))
+          || (op_false == CONST0_RTX (mode)
+              && vector_all_ones_operand (op_true, mode)))
+        switch (mode)
+          {
+          case E_V64QImode:
+            if (TARGET_AVX512BW)
+              gen = gen_avx512bw_cvtmask2bv64qi;
+            break;
+          case E_V32QImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2bv32qi;
+            break;
+          case E_V16QImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2bv16qi;
+            break;
+          case E_V32HImode:
+            if (TARGET_AVX512BW)
+              gen = gen_avx512bw_cvtmask2wv32hi;
+            break;
+          case E_V16HImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2wv16hi;
+            break;
+          case E_V8HImode:
+            if (TARGET_AVX512VL && TARGET_AVX512BW)
+              gen = gen_avx512vl_cvtmask2wv8hi;
+            break;
+          case E_V16SImode:
+            if (TARGET_AVX512DQ)
+              gen = gen_avx512f_cvtmask2dv16si;
+            break;
+          case E_V8SImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2dv8si;
+            break;
+          case E_V4SImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2dv4si;
+            break;
+          case E_V8DImode:
+            if (TARGET_AVX512DQ)
+              gen = gen_avx512f_cvtmask2qv8di;
+            break;
+          case E_V4DImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2qv4di;
+            break;
+          case E_V2DImode:
+            if (TARGET_AVX512VL && TARGET_AVX512DQ)
+              gen = gen_avx512vl_cvtmask2qv2di;
+            break;
+          default:
+            break;
+          }
+      if (gen && SCALAR_INT_MODE_P (cmpmode))
+        {
+          cmp = force_reg (cmpmode, cmp);
+          if (op_true == CONST0_RTX (mode))
+            {
+              rtx (*gen_not) (rtx, rtx);
+              switch (cmpmode)
+                {
+                case E_QImode: gen_not = gen_knotqi; break;
+                case E_HImode: gen_not = gen_knothi; break;
+                case E_SImode: gen_not = gen_knotsi; break;
+                case E_DImode: gen_not = gen_knotdi; break;
+                default: gcc_unreachable ();
+                }
+              rtx n = gen_reg_rtx (cmpmode);
+              emit_insn (gen_not (n, cmp));
+              cmp = n;
+            }
+          emit_insn (gen (dest, cmp));
+          return;
+        }
+    }
+  else if (vector_all_ones_operand (op_true, mode)
+           && op_false == CONST0_RTX (mode))
     {
       emit_insn (gen_rtx_SET (dest, cmp));
+      return;
     }
-  else if (op_false == CONST0_RTX (mode) && !maskcmp)
+  else if (op_false == CONST0_RTX (mode))
     {
       op_true = force_reg (mode, op_true);
       x = gen_rtx_AND (mode, cmp, op_true);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (op_true == CONST0_RTX (mode) && !maskcmp)
+  else if (op_true == CONST0_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_NOT (mode, cmp);
       x = gen_rtx_AND (mode, x, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
-           && !maskcmp)
+  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_IOR (mode, cmp, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (TARGET_XOP && !maskcmp)
+  else if (TARGET_XOP)
     {
       op_true = force_reg (mode, op_true);
@@ -23629,9 +23713,9 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       emit_insn (gen_rtx_SET (dest,
                               gen_rtx_IF_THEN_ELSE (mode, cmp,
                                                     op_true, op_false)));
+      return;
     }
-  else
-    {
+
   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
   rtx d = dest;
 
@@ -23750,7 +23834,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       x = gen_rtx_IOR (mode, t3, t2);
       emit_insn (gen_rtx_SET (dest, x));
     }
-    }
 }
 
 /* Expand a floating-point conditional move.  Return true if successful.  */
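
As a sanity check of the transformation's semantics (again not part of
the patch; the blend intrinsic below merely stands in for the
vpxor + vpternlog + blend sequence the old expansion produced), the
masked-blend form and the knot + vpmovm2b form agree bit for bit.
Runnable on AVX512BW hardware, built with -O2 -mavx512bw:

  /* Not part of the patch: verifies that blending all-zeros/all-ones
     under a mask equals vpmovm2b of the inverted mask, the identity the
     new expansion relies on.  */
  #include <immintrin.h>
  #include <assert.h>
  #include <string.h>

  int
  main (void)
  {
    __mmask64 k = 0x123456789abcdef0ULL;
    __m512i ones = _mm512_set1_epi8 (-1);
    __m512i zeros = _mm512_setzero_si512 ();
    /* Blend form: select 0 where the mask bit is set, -1 where it is
       clear (the op_true == 0, op_false == all ones case).  */
    __m512i blend = _mm512_mask_blend_epi8 (k, ones, zeros);
    /* New expansion: knotq + vpmovm2b.  */
    __m512i movm = _mm512_movm_epi8 (_knot_mask64 (k));
    assert (memcmp (&blend, &movm, sizeof (blend)) == 0);
    return 0;
  }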