Hi: This is another patch to optimize vec_perm_expr to match vpmov{wb,dw,qd} under AVX512. For scenarios (like pr101846-2.c) where the upper half is not used, this patch generates better code with only one vpmov{wb,dw,qd} instruction. For scenarios (like pr101846-3.c) where the upper half is actually used, if the src vector length is 256/512 bits, the patch can still generate better code, but for 128 bits, the code generation is worse.
128 bits upper half not used. - vpshufb .LC2(%rip), %xmm0, %xmm0 + vpmovdw %xmm0, %xmm0 128 bits upper half used. - vpshufb .LC2(%rip), %xmm0, %xmm0 + vpmovdw %xmm0, %xmm1 + vmovq %xmm1, %rax + vpinsrq $0, %rax, %xmm0, %xmm0 Maybe expand_vec_perm_trunc_vinsert should only deal with 256/512-bit vectors, but considering that the likelihood of real uses such as the foo_*_128 cases in pr101846-3.c is relatively low, I still keep this part of the code. Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR target/101846 * config/i386/i386-expand.c (expand_vec_perm_trunc_vinsert): New function. (ix86_vectorize_vec_perm_const): Call expand_vec_perm_trunc_vinsert. * config/i386/sse.md (vec_set_lo_v32hi): New define_insn. (vec_set_lo_v64qi): Ditto. (vec_set_lo_<mode><mask_name>): Extend to no-avx512dq. gcc/testsuite/ChangeLog: PR target/101846 * gcc.target/i386/pr101846-2.c: New test. * gcc.target/i386/pr101846-3.c: New test. --- gcc/config/i386/i386-expand.c | 125 +++++++++++++++++++++ gcc/config/i386/sse.md | 60 +++++++++- gcc/testsuite/gcc.target/i386/pr101846-2.c | 81 +++++++++++++ gcc/testsuite/gcc.target/i386/pr101846-3.c | 95 ++++++++++++++++ 4 files changed, 359 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-3.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index bd21efa9530..519caac2e15 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -18317,6 +18317,126 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) return false; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D + in terms of a pair of vpmov{wb,dw,qd} + vinsert{i128,i64x4} instructions. 
*/ +static bool +expand_vec_perm_trunc_vinsert (struct expand_vec_perm_d *d) +{ + unsigned i, nelt = d->nelt, mask = d->nelt - 1; + unsigned half = nelt / 2; + machine_mode half_mode, trunc_mode; + + /* vpmov{wb,dw,qd} only available under AVX512. */ + if (!d->one_operand_p || !TARGET_AVX512F + || (!TARGET_AVX512VL && GET_MODE_SIZE (d->vmode) < 64) + || GET_MODE_SIZE (GET_MODE_INNER (d->vmode)) > 4) + return false; + + /* TARGET_AVX512BW is needed for vpmovwb. */ + if (GET_MODE_INNER (d->vmode) == E_QImode && !TARGET_AVX512BW) + return false; + + for (i = 0; i < nelt; i++) + { + unsigned idx = d->perm[i] & mask; + if (idx != i * 2 && i < half) + return false; + if (idx != i && i >= half) + return false; + } + + rtx (*gen_trunc) (rtx, rtx) = NULL; + rtx (*gen_vec_set_lo) (rtx, rtx, rtx) = NULL; + switch (d->vmode) + { + case E_V16QImode: + gen_trunc = gen_truncv8hiv8qi2; + gen_vec_set_lo = gen_vec_setv2di; + half_mode = V8QImode; + trunc_mode = V8HImode; + break; + case E_V32QImode: + gen_trunc = gen_truncv16hiv16qi2; + gen_vec_set_lo = gen_vec_set_lo_v32qi; + half_mode = V16QImode; + trunc_mode = V16HImode; + break; + case E_V64QImode: + gen_trunc = gen_truncv32hiv32qi2; + gen_vec_set_lo = gen_vec_set_lo_v64qi; + half_mode = V32QImode; + trunc_mode = V32HImode; + break; + case E_V8HImode: + gen_trunc = gen_truncv4siv4hi2; + gen_vec_set_lo = gen_vec_setv2di; + half_mode = V4HImode; + trunc_mode = V4SImode; + break; + case E_V16HImode: + gen_trunc = gen_truncv8siv8hi2; + gen_vec_set_lo = gen_vec_set_lo_v16hi; + half_mode = V8HImode; + trunc_mode = V8SImode; + break; + case E_V32HImode: + gen_trunc = gen_truncv16siv16hi2; + gen_vec_set_lo = gen_vec_set_lo_v32hi; + half_mode = V16HImode; + trunc_mode = V16SImode; + break; + case E_V4SImode: + gen_trunc = gen_truncv2div2si2; + gen_vec_set_lo = gen_vec_setv2di; + half_mode = V2SImode; + trunc_mode = V2DImode; + break; + case E_V8SImode: + gen_trunc = gen_truncv4div4si2; + gen_vec_set_lo = gen_vec_set_lo_v8si; + half_mode 
= V4SImode; + trunc_mode = V4DImode; + break; + case E_V16SImode: + gen_trunc = gen_truncv8div8si2; + gen_vec_set_lo = gen_vec_set_lo_v16si; + half_mode = V8SImode; + trunc_mode = V8DImode; + break; + + default: + break; + } + + if (gen_trunc == NULL) + return false; + + rtx op_half = gen_reg_rtx (half_mode); + rtx op_trunc = d->op0; + if (d->vmode != trunc_mode) + op_trunc = lowpart_subreg (trunc_mode, op_trunc, d->vmode); + emit_insn (gen_trunc (op_half, op_trunc)); + + if (gen_vec_set_lo == gen_vec_setv2di) + { + op_half = lowpart_subreg (DImode, op_half, half_mode); + rtx op_dest = lowpart_subreg (V2DImode, d->op0, d->vmode); + + /* vec_set<mode> require register_operand. */ + if (MEM_P (op_dest)) + op_dest = force_reg (V2DImode, op_dest); + if (MEM_P (op_half)) + op_half = force_reg (DImode, op_half); + + emit_insn (gen_vec_set_lo (op_dest, op_half, GEN_INT(0))); + op_dest = lowpart_subreg (d->vmode, op_dest, V2DImode); + emit_move_insn (d->target, op_dest); + } + else + emit_insn (gen_vec_set_lo (d->target, d->op0, op_half)); + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D in terms of a pair of pshuflw + pshufhw instructions. */ @@ -21028,6 +21148,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, d.op0 = nop0; d.op1 = force_reg (vmode, d.op1); + /* Try to match vpmov{wb,dw,qd}, although vinserti128 will be generated, + it's very likely to be optimized off. So let's put the function here. 
*/ + if (expand_vec_perm_trunc_vinsert (&d)) + return true; + if (ix86_expand_vec_perm_const_1 (&d)) return true; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f631756c829..87e22332c83 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -15162,8 +15162,12 @@ (define_insn "vec_set_lo_<mode><mask_name>" (const_int 10) (const_int 11) (const_int 12) (const_int 13) (const_int 14) (const_int 15)]))))] - "TARGET_AVX512DQ" - "vinsert<shuffletype>32x8\t{$0x0, %2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2, 0x0}" + "TARGET_AVX512F && <mask_avx512dq_condition>" +{ + if (TARGET_AVX512DQ) + return "vinsert<shuffletype>32x8\t{$0x0, %2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2, 0x0}"; + return "vinsert<shuffletype>64x4\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}"; +} [(set_attr "type" "sselog") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -22806,6 +22810,28 @@ (define_insn "vec_set_hi_v16hi" (set_attr "prefix" "vex,evex") (set_attr "mode" "OI")]) +(define_insn "vec_set_lo_v32hi" + [(set (match_operand:V32HI 0 "register_operand" "=v") + (vec_concat:V32HI + (match_operand:V16HI 2 "nonimmediate_operand" "vm") + (vec_select:V16HI + (match_operand:V32HI 1 "register_operand" "v") + (parallel [(const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23) + (const_int 24) (const_int 25) + (const_int 26) (const_int 27) + (const_int 28) (const_int 29) + (const_int 30) (const_int 31)]))))] + "TARGET_AVX512F" + "vinserti64x4\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + (define_insn "vec_set_lo_v32qi" [(set (match_operand:V32QI 0 "register_operand" "=x,v") (vec_concat:V32QI @@ -22854,6 +22880,36 @@ (define_insn "vec_set_hi_v32qi" (set_attr "prefix" "vex,evex") (set_attr "mode" "OI")]) +(define_insn "vec_set_lo_v64qi" + [(set 
(match_operand:V64QI 0 "register_operand" "=v") + (vec_concat:V64QI + (match_operand:V32QI 2 "nonimmediate_operand" "vm") + (vec_select:V32QI + (match_operand:V64QI 1 "register_operand" "v") + (parallel [(const_int 32) (const_int 33) + (const_int 34) (const_int 35) + (const_int 36) (const_int 37) + (const_int 38) (const_int 39) + (const_int 40) (const_int 41) + (const_int 42) (const_int 43) + (const_int 44) (const_int 45) + (const_int 46) (const_int 47) + (const_int 48) (const_int 49) + (const_int 50) (const_int 51) + (const_int 52) (const_int 53) + (const_int 54) (const_int 55) + (const_int 56) (const_int 57) + (const_int 58) (const_int 59) + (const_int 60) (const_int 61) + (const_int 62) (const_int 63)]))))] + "TARGET_AVX512F" + "vinserti64x4\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + (define_insn "<avx_avx2>_maskload<ssemodesuffix><avxsizesuffix>" [(set (match_operand:V48_AVX2 0 "register_operand" "=x") (unspec:V48_AVX2 diff --git a/gcc/testsuite/gcc.target/i386/pr101846-2.c b/gcc/testsuite/gcc.target/i386/pr101846-2.c new file mode 100644 index 00000000000..af4ae8ccdd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101846-2.c @@ -0,0 +1,81 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "vpmovwb" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovqd" "3" } } */ + +typedef short v4hi __attribute__((vector_size (8))); +typedef short v8hi __attribute__((vector_size (16))); +typedef short v16hi __attribute__((vector_size (32))); +typedef short v32hi __attribute__((vector_size (64))); +typedef char v8qi __attribute__((vector_size (8))); +typedef char v16qi __attribute__((vector_size (16))); +typedef char v32qi __attribute__((vector_size (32))); +typedef char v64qi 
__attribute__((vector_size (64))); +typedef int v2si __attribute__((vector_size (8))); +typedef int v4si __attribute__((vector_size (16))); +typedef int v8si __attribute__((vector_size (32))); +typedef int v16si __attribute__((vector_size (64))); + +v16hi +foo_dw_512 (v32hi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); +} + +v8hi +foo_dw_256 (v16hi x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14); +} + +v4hi +foo_dw_128 (v8hi x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6); +} + +v8si +foo_qd_512 (v16si x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14); +} + +v4si +foo_qd_256 (v8si x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6); +} + +v2si +foo_qd_128 (v4si x) +{ + return __builtin_shufflevector (x, x, 0, 2); +} + +v32qi +foo_wb_512 (v64qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62); +} + +v16qi +foo_wb_256 (v32qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); +} + +v8qi +foo_wb_128 (v16qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14); +} diff --git a/gcc/testsuite/gcc.target/i386/pr101846-3.c b/gcc/testsuite/gcc.target/i386/pr101846-3.c new file mode 100644 index 00000000000..380b1220327 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101846-3.c @@ -0,0 +1,95 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "vpmovwb" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovqd" "3" } } */ + +typedef short v4hi __attribute__((vector_size (8))); +typedef short v8hi __attribute__((vector_size (16))); +typedef short v16hi __attribute__((vector_size (32))); +typedef short v32hi 
__attribute__((vector_size (64))); +typedef char v8qi __attribute__((vector_size (8))); +typedef char v16qi __attribute__((vector_size (16))); +typedef char v32qi __attribute__((vector_size (32))); +typedef char v64qi __attribute__((vector_size (64))); +typedef int v2si __attribute__((vector_size (8))); +typedef int v4si __attribute__((vector_size (16))); +typedef int v8si __attribute__((vector_size (32))); +typedef int v16si __attribute__((vector_size (64))); + +v32hi +foo_dw_512 (v32hi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); +} + +v16hi +foo_dw_256 (v16hi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 8, 9, 10, 11, 12, 13, 14, 15); +} + +v8hi +foo_dw_128 (v8hi x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 4, 5, 6, 7); +} + +v16si +foo_qd_512 (v16si x) +{ + return __builtin_shufflevector (x, x, 0, + 2, 4, 6, 8, 10, 12, 14, + 8, 9, 10, 11, 12, 13, 14, 15); +} + +v8si +foo_qd_256 (v8si x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 4, 5, 6, 7); +} + +v4si +foo_qd_128 (v4si x) +{ + return __builtin_shufflevector (x, x, 0, 2, 2, 3); +} + +v64qi +foo_wb_512 (v64qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63); +} + +v32qi +foo_wb_256 (v32qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); +} + +v16qi +foo_wb_128 (v16qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 8, 9, 10, 11, 12, 13, 14, 15); +} + -- 2.27.0