Hi: Add define_insn_and_split patterns to combine avx_vec_concatv16si/2 and avx512f_zero_extendv16hiv16si2_1, since the latter already zero-extends the upper bits; do the same for the other related patterns for pmovzx{bw,wd,dq}.
It will do optimization like - vmovdqa %ymm0, %ymm0 # 7 [c=4 l=6] avx_vec_concatv16si/2 vpmovzxwd %ymm0, %zmm0 # 22 [c=4 l=6] avx512f_zero_extendv16hiv16si2 ret # 25 [c=0 l=1] simple_return_internal Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR target/101846 * config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_2): New post_reload define_insn_and_split. (*avx512bw_zero_extendv32qiv32hi2_2): Ditto. (*sse4_1_zero_extendv8qiv8hi2_4): Ditto. (*avx512f_zero_extendv16hiv16si2_2): Ditto. (*avx2_zero_extendv8hiv8si2_2): Ditto. (*sse4_1_zero_extendv4hiv4si2_4): Ditto. (*avx512f_zero_extendv8siv8di2_2): Ditto. (*avx2_zero_extendv4siv4di2_2): Ditto. (*sse4_1_zero_extendv2siv2di2_4): Ditto. gcc/testsuite/ChangeLog: PR target/101846 * gcc.target/i386/pr101846-1.c: New test. --- gcc/config/i386/sse.md | 220 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr101846-1.c | 95 +++++++++ 2 files changed, 315 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-1.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a46a2373547..6450c058458 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -673,8 +673,14 @@ (define_mode_iterator VI12_128 [V16QI V8HI]) (define_mode_iterator VI14_128 [V16QI V4SI]) (define_mode_iterator VI124_128 [V16QI V8HI V4SI]) (define_mode_iterator VI24_128 [V8HI V4SI]) +(define_mode_iterator VI128_128 [V16QI V8HI V2DI]) (define_mode_iterator VI248_128 [V8HI V4SI V2DI]) +(define_mode_iterator VI248_256 [V16HI V8SI V4DI]) +(define_mode_iterator VI248_512 [V32HI V16SI V8DI]) (define_mode_iterator VI48_128 [V4SI V2DI]) +(define_mode_iterator VI148_512 [V64QI V16SI V8DI]) +(define_mode_iterator VI148_256 [V32QI V8SI V4DI]) +(define_mode_iterator VI148_128 [V16QI V4SI V2DI]) ;; Various 256bit and 512 vector integer mode combinations (define_mode_iterator VI124_256 [V32QI V16HI V8SI]) @@ -18499,6 +18505,26 @@ (define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1" operands[1] = 
lowpart_subreg (V16QImode, operands[1], V32QImode); }) +(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_2" + [(set (match_operand:V32QI 0 "register_operand" "=v") + (vec_select:V32QI + (vec_concat:V64QI + (subreg:V32QI + (vec_concat:VI248_256 + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") + (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) + (match_operand:V32QI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX2" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode); + operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode); +}) + (define_expand "<insn>v16qiv16hi2" [(set (match_operand:V16HI 0 "register_operand") (any_extend:V16HI @@ -18533,6 +18559,26 @@ (define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1" operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode); }) +(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_2" + [(set (match_operand:V64QI 0 "register_operand" "=v") + (vec_select:V64QI + (vec_concat:V128QI + (subreg:V64QI + (vec_concat:VI248_512 + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") + (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) + (match_operand:V64QI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX512BW" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode); + operands[1] = lowpart_subreg (V32QImode, operands[1], <ssehalfvecmode>mode); +}) + (define_expand "<insn>v32qiv32hi2" [(set (match_operand:V32HI 0 "register_operand") (any_extend:V32HI @@ -18619,6 +18665,41 @@ (define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_3" } [(set_attr "isa" "noavx,noavx,avx")]) +(define_insn_and_split 
"*sse4_1_zero_extendv8qiv8hi2_4" + [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,Yw") + (vec_select:V16QI + (vec_concat:V32QI + (subreg:V16QI + (vec_concat:VI248_128 + (match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,Ywm") + (match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0) + (match_operand:V16QI 3 "const0_operand" "C,C,C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V8HI + (vec_select:V8QI + (match_dup 1) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] +{ + operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode); + if (MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8QImode, operands[1], <ssehalfvecmode>mode); + operands[1] = gen_rtx_ZERO_EXTEND (V8HImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } + operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode); +} + [(set_attr "isa" "noavx,noavx,avx")]) + (define_expand "<insn>v8qiv8hi2" [(set (match_operand:V8HI 0 "register_operand") (any_extend:V8HI @@ -18809,6 +18890,26 @@ (define_insn_and_split "avx512f_zero_extendv16hiv16si2_1" operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode); }) +(define_insn_and_split "*avx512f_zero_extendv16hiv16si2_2" + [(set (match_operand:V32HI 0 "register_operand" "=v") + (vec_select:V32HI + (vec_concat:V64HI + (subreg:V32HI + (vec_concat:VI148_512 + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") + (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) + (match_operand:V32HI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX512F" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))] +{ + operands[0] = lowpart_subreg 
(V16SImode, operands[0], V32HImode); + operands[1] = lowpart_subreg (V16HImode, operands[1], <ssehalfvecmode>mode); +}) + (define_insn "avx2_<code>v8hiv8si2<mask_name>" [(set (match_operand:V8SI 0 "register_operand" "=v") (any_extend:V8SI @@ -18843,6 +18944,27 @@ (define_insn_and_split "avx2_zero_extendv8hiv8si2_1" operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode); }) +(define_insn_and_split "*avx2_zero_extendv8hiv8si2_2" + [(set (match_operand:V16HI 0 "register_operand" "=v") + (vec_select:V16HI + (vec_concat:V32HI + (subreg:V16HI + (vec_concat:VI148_256 + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") + (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) + (match_operand:V16HI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX2" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode); + operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode); +}) + + (define_insn "sse4_1_<code>v4hiv4si2<mask_name>" [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") (any_extend:V4SI @@ -18932,6 +19054,39 @@ (define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_3" } [(set_attr "isa" "noavx,noavx,avx")]) +(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_4" + [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v") + (vec_select:V8HI + (vec_concat:V16HI + (subreg:V8HI + (vec_concat:VI148_128 + (match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,vm") + (match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0) + (match_operand:V8HI 3 "const0_operand" "C,C,C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (parallel [(const_int 0) (const_int 1) + (const_int 2) 
(const_int 3)]))))] +{ + operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode); + if (MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V4HImode, operands[1], <ssehalfvecmode>mode); + operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } + operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode); +} + [(set_attr "isa" "noavx,noavx,avx")]) + (define_insn "avx512f_<code>v8qiv8di2<mask_name>" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -19242,6 +19397,24 @@ (define_insn_and_split "*avx512f_zero_extendv8siv8di2_1" operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode); }) +(define_insn_and_split "*avx512f_zero_extendv8siv8di2_2" + [(set (match_operand:V16SI 0 "register_operand" "=v") + (vec_select:V16SI + (vec_concat:V32SI + (vec_concat:V16SI + (match_operand:V8SI 1 "nonimmediate_operand" "vm") + (match_operand:V8SI 2 "const0_operand" "C")) + (match_operand:V16SI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX512F" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode); +}) + (define_expand "<insn>v8siv8di2" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -19276,6 +19449,24 @@ (define_insn_and_split "*avx2_zero_extendv4siv4di2_1" operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode); }) +(define_insn_and_split "*avx2_zero_extendv4siv4di2_2" + [(set (match_operand:V8SI 0 "register_operand" "=v") + (vec_select:V8SI + (vec_concat:V16SI + (vec_concat:V8SI + (match_operand:V4SI 1 "nonimmediate_operand" "vm") + (match_operand:V4SI 2 "const0_operand" "C")) + (match_operand:V8SI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX2" + "#" + "&& 
reload_completed" + [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode); +}) + (define_expand "<insn>v4siv4di2" [(set (match_operand:V4DI 0 "register_operand") (any_extend:V4DI @@ -19352,6 +19543,35 @@ (define_insn_and_split "*sse4_1_zero_extendv2siv2di2_3" } [(set_attr "isa" "noavx,noavx,avx")]) +(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_4" + [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") + (vec_select:V4SI + (vec_concat:V8SI + (vec_concat:V4SI + (match_operand:V2SI 1 "vector_operand" "YrBm, *xBm, vm") + (match_operand:V2SI 2 "const0_operand" "C,C,C")) + (match_operand:V4SI 3 "const0_operand" "C,C,C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V2DI + (vec_select:V2SI (match_dup 1) + (parallel [(const_int 0) (const_int 1)]))))] +{ + operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode); + if (MEM_P (operands[1])) + { + operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } + operands[1] = lowpart_subreg (V4SImode, operands[1], V2SImode); +} + [(set_attr "isa" "noavx,noavx,avx")]) + (define_expand "<insn>v2siv2di2" [(set (match_operand:V2DI 0 "register_operand") (any_extend:V2DI diff --git a/gcc/testsuite/gcc.target/i386/pr101846-1.c b/gcc/testsuite/gcc.target/i386/pr101846-1.c new file mode 100644 index 00000000000..40d95bde6fd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101846-1.c @@ -0,0 +1,95 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-times "vpmovzxbw" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovzxdq" "3" } } */ + +typedef short v4hi 
__attribute__((vector_size (8))); +typedef short v8hi __attribute__((vector_size (16))); +typedef short v16hi __attribute__((vector_size (32))); +typedef short v32hi __attribute__((vector_size (64))); +typedef char v8qi __attribute__((vector_size (8))); +typedef char v16qi __attribute__((vector_size (16))); +typedef char v32qi __attribute__((vector_size (32))); +typedef char v64qi __attribute__((vector_size (64))); +typedef int v2si __attribute__((vector_size (8))); +typedef int v4si __attribute__((vector_size (16))); +typedef int v8si __attribute__((vector_size (32))); +typedef int v16si __attribute__((vector_size (64))); + +v32hi +foo_zxwd_512 (v16hi x) +{ + return __builtin_shufflevector (x, (v16hi) {}, + 0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, + 12, 28, 13, 29, 14, 30, 15, 31); +} + +v16hi +foo_zxwd_256 (v8hi x) +{ + return __builtin_shufflevector (x, (v8hi) {}, + 0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15); +} + +v8hi +foo_zxwd_128 (v4hi x) +{ + return __builtin_shufflevector (x, (v4hi) {}, 0, 4, 1, 5, 2, 6, 3, 7); +} + +v16si +foo_zxdq_512 (v8si x) +{ + return __builtin_shufflevector (x, (v8si) {}, + 0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15); +} + +v8si +foo_zxdq_256 (v4si x) +{ + return __builtin_shufflevector (x, (v4si) {}, 0, 4, 1, 5, 2, 6, 3, 7); +} + +v4si +foo_zxdq_128 (v2si x) +{ + return __builtin_shufflevector (x, (v2si) {}, 0, 2, 1, 3); +} + +v64qi +foo_zxbw_512 (v32qi x) +{ + return __builtin_shufflevector (x, (v32qi) {}, + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47, + 16, 48, 17, 49, 18, 50, 19, 51, + 20, 52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, + 28, 60, 29, 61, 30, 62, 31, 63); +} + +v32qi +foo_zxbw_256 (v16qi x) +{ + return __builtin_shufflevector (x, (v16qi) {}, + 0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, + 12, 28, 13, 29, 
14, 30, 15, 31); +} + +v16qi +foo_zxbw_128 (v8qi x) +{ + return __builtin_shufflevector (x, (v8qi) {}, + 0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15); +} -- 2.27.0