Hello! The attached patch adds the missing vector zero_extend/sign_extend expanders to allow vectorization of operations between different vector sizes.
The patch regresses (progresses?) one test: FAIL: gcc.target/i386/pr92645-4.c scan-tree-dump-times optimized "vec_unpack_lo" 3 but eyeballing the asm code before/after the patch, we get much better code: .L3: - vmovdqu (%rsi,%rax), %xmm6 - vpxor %xmm5, %xmm5, %xmm5 - vmovdqa %ymm5, -32(%rsp) - vmovdqa %xmm6, -32(%rsp) - vpmovzxbw -32(%rsp), %ymm0 + vpmovzxbw (%rsi,%rax), %ymm0 vpmullw %ymm4, %ymm0, %ymm0 vpaddw %ymm2, %ymm0, %ymm0 vpsrlw $8, %ymm0, %ymm0 and even more differences, again toward much better code, in the loop prologue. (Please note the strange double-save to a stack slot in the old code.) Richi, I guess that the testcase you introduced needs some adjustment. As discussed in the PR, there are a couple of XFAILs where the compiler is not able to vectorize the code. The named expanders are there, but for the reason explained in PR comment #8, the middle-end doesn't exercise them. gcc/ChangeLog: 2020-05-19 Uroš Bizjak <ubiz...@gmail.com> PR target/92658 * config/i386/sse.md (<code>v16qiv16hi2): New expander. (<code>v32qiv32hi2): Ditto. (<code>v8qiv8hi2): Ditto. (<code>v16qiv16si2): Ditto. (<code>v8qiv8si2): Ditto. (<code>v4qiv4si2): Ditto. (<code>v16hiv16si2): Ditto. (<code>v8hiv8si2): Ditto. (<code>v4hiv4si2): Ditto. (<code>v8qiv8di2): Ditto. (<code>v4qiv4di2): Ditto. (<code>v2qiv2di2): Ditto. (<code>v8hiv8di2): Ditto. (<code>v4hiv4di2): Ditto. (<code>v2hiv2di2): Ditto. (<code>v8siv8di2): Ditto. (<code>v4siv4di2): Ditto. (<code>v2siv2di2): Ditto. testsuite/ChangeLog: 2020-05-19 Uroš Bizjak <ubiz...@gmail.com> PR target/92658 * gcc.target/i386/pr92658-sse4.c: New test. * gcc.target/i386/pr92658-avx2.c: New test. * gcc.target/i386/pr92658-avx512bw.c: New test. The patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. Uros.
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 153982c9f12..9bf4361384a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17328,6 +17328,12 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) +(define_expand "<code>v16qiv16hi2" + [(set (match_operand:V16HI 0 "register_operand") + (any_extend:V16HI + (match_operand:V16QI 1 "nonimmediate_operand")))] + "TARGET_AVX2") + (define_insn "avx512bw_<code>v32qiv32hi2<mask_name>" [(set (match_operand:V32HI 0 "register_operand" "=v") (any_extend:V32HI @@ -17339,6 +17345,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_expand "<code>v32qiv32hi2" + [(set (match_operand:V32HI 0 "register_operand") + (any_extend:V32HI + (match_operand:V32QI 1 "nonimmediate_operand")))] + "TARGET_AVX512BW") + (define_insn "sse4_1_<code>v8qiv8hi2<mask_name>" [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v") (any_extend:V8HI @@ -17388,6 +17400,20 @@ (any_extend:V8HI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);") +(define_expand "<code>v8qiv8hi2" + [(set (match_operand:V8HI 0 "register_operand") + (any_extend:V8HI + (match_operand:V8QI 1 "nonimmediate_operand")))] + "TARGET_SSE4_1" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V16QImode, operands[1], V8QImode, 0); + emit_insn (gen_sse4_1_<code>v8qiv8hi2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "<mask_codefor>avx512f_<code>v16qiv16si2<mask_name>" [(set (match_operand:V16SI 0 "register_operand" "=v") (any_extend:V16SI @@ -17398,6 +17424,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_expand "<code>v16qiv16si2" + [(set (match_operand:V16SI 0 "register_operand") + (any_extend:V16SI + (match_operand:V16QI 1 "nonimmediate_operand")))] + "TARGET_AVX512F") + (define_insn "avx2_<code>v8qiv8si2<mask_name>" [(set (match_operand:V8SI 0 "register_operand" "=v") (any_extend:V8SI @@ -17445,6 +17477,20 @@ (any_extend:V8SI (match_dup 1)))] "operands[1] 
= adjust_address_nv (operands[1], V8QImode, 0);") +(define_expand "<code>v8qiv8si2" + [(set (match_operand:V8SI 0 "register_operand") + (any_extend:V8SI + (match_operand:V8QI 1 "nonimmediate_operand")))] + "TARGET_AVX2" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V16QImode, operands[1], V8QImode, 0); + emit_insn (gen_avx2_<code>v8qiv8si2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "sse4_1_<code>v4qiv4si2<mask_name>" [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") (any_extend:V4SI @@ -17494,6 +17540,20 @@ (any_extend:V4SI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V4QImode, 0);") +(define_expand "<code>v4qiv4si2" + [(set (match_operand:V4SI 0 "register_operand") + (any_extend:V4SI + (match_operand:V4QI 1 "nonimmediate_operand")))] + "TARGET_SSE4_1" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V16QImode, operands[1], V4QImode, 0); + emit_insn (gen_sse4_1_<code>v4qiv4si2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512f_<code>v16hiv16si2<mask_name>" [(set (match_operand:V16SI 0 "register_operand" "=v") (any_extend:V16SI @@ -17504,6 +17564,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_expand "<code>v16hiv16si2" + [(set (match_operand:V16SI 0 "register_operand") + (any_extend:V16SI + (match_operand:V16HI 1 "nonimmediate_operand")))] + "TARGET_AVX512F") + (define_insn "avx2_<code>v8hiv8si2<mask_name>" [(set (match_operand:V8SI 0 "register_operand" "=v") (any_extend:V8SI @@ -17515,6 +17581,12 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) +(define_expand "<code>v8hiv8si2" + [(set (match_operand:V8SI 0 "register_operand") + (any_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand")))] + "TARGET_AVX2") + (define_insn "sse4_1_<code>v4hiv4si2<mask_name>" [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") (any_extend:V4SI @@ -17560,6 +17632,20 @@ (any_extend:V4SI (match_dup 1)))] "operands[1] = adjust_address_nv 
(operands[1], V4HImode, 0);") +(define_expand "<code>v4hiv4si2" + [(set (match_operand:V4SI 0 "register_operand") + (any_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand")))] + "TARGET_SSE4_1" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V8HImode, operands[1], V4HImode, 0); + emit_insn (gen_sse4_1_<code>v4hiv4si2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512f_<code>v8qiv8di2<mask_name>" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -17604,6 +17690,20 @@ (any_extend:V8DI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);") +(define_expand "<code>v8qiv8di2" + [(set (match_operand:V8DI 0 "register_operand") + (any_extend:V8DI + (match_operand:V8QI 1 "nonimmediate_operand")))] + "TARGET_AVX512F" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V16QImode, operands[1], V8QImode, 0); + emit_insn (gen_avx512f_<code>v8qiv8di2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx2_<code>v4qiv4di2<mask_name>" [(set (match_operand:V4DI 0 "register_operand" "=v") (any_extend:V4DI @@ -17651,6 +17751,20 @@ (any_extend:V4DI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V4QImode, 0);") +(define_expand "<code>v4qiv4di2" + [(set (match_operand:V4DI 0 "register_operand") + (any_extend:V4DI + (match_operand:V4QI 1 "nonimmediate_operand")))] + "TARGET_AVX2" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V16QImode, operands[1], V4QImode, 0); + emit_insn (gen_avx2_<code>v4qiv4di2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "sse4_1_<code>v2qiv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") (any_extend:V2DI @@ -17665,6 +17779,17 @@ (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_expand "<code>v2qiv2di2" + [(set (match_operand:V2DI 0 "register_operand") + (any_extend:V2DI + (match_operand:V2QI 1 "register_operand")))] + "TARGET_SSE4_1" +{ + 
operands[1] = simplify_subreg (V16QImode, operands[1], V2QImode, 0); + emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], operands[1])); + DONE; +}) + (define_insn "avx512f_<code>v8hiv8di2<mask_name>" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -17675,6 +17800,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_expand "<code>v8hiv8di2" + [(set (match_operand:V8DI 0 "register_operand") + (any_extend:V8DI + (match_operand:V8HI 1 "nonimmediate_operand")))] + "TARGET_AVX512F") + (define_insn "avx2_<code>v4hiv4di2<mask_name>" [(set (match_operand:V4DI 0 "register_operand" "=v") (any_extend:V4DI @@ -17718,6 +17849,20 @@ (any_extend:V4DI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V4HImode, 0);") +(define_expand "<code>v4hiv4di2" + [(set (match_operand:V4DI 0 "register_operand") + (any_extend:V4DI + (match_operand:V4HI 1 "nonimmediate_operand")))] + "TARGET_AVX2" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V8HImode, operands[1], V4HImode, 0); + emit_insn (gen_avx2_<code>v4hiv4di2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "sse4_1_<code>v2hiv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") (any_extend:V2DI @@ -17765,6 +17910,20 @@ (any_extend:V2DI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V2HImode, 0);") +(define_expand "<code>v2hiv2di2" + [(set (match_operand:V2DI 0 "register_operand") + (any_extend:V2DI + (match_operand:V2HI 1 "nonimmediate_operand")))] + "TARGET_SSE4_1" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V8HImode, operands[1], V2HImode, 0); + emit_insn (gen_sse4_1_<code>v2hiv2di2 (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512f_<code>v8siv8di2<mask_name>" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -17775,6 +17934,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_expand "<code>v8siv8di2" + [(set (match_operand:V8DI 0 
"register_operand" "=v") + (any_extend:V8DI + (match_operand:V8SI 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512F") + (define_insn "avx2_<code>v4siv4di2<mask_name>" [(set (match_operand:V4DI 0 "register_operand" "=v") (any_extend:V4DI @@ -17786,6 +17951,12 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "OI")]) +(define_expand "<code>v4siv4di2" + [(set (match_operand:V4DI 0 "register_operand" "=v") + (any_extend:V4DI + (match_operand:V4SI 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX2") + (define_insn "sse4_1_<code>v2siv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") (any_extend:V2DI @@ -17829,6 +18000,20 @@ (any_extend:V2DI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V2SImode, 0);") +(define_expand "<code>v2siv2di2" + [(set (match_operand:V2DI 0 "register_operand") + (any_extend:V2DI + (match_operand:V2SI 1 "nonimmediate_operand")))] + "TARGET_SSE4_1" +{ + if (!MEM_P (operands[1])) + { + operands[1] = simplify_subreg (V4SImode, operands[1], V2SImode, 0); + emit_insn (gen_sse4_1_<code>v2siv2di2 (operands[0], operands[1])); + DONE; + } +}) + ;; ptestps/ptestpd are very similar to comiss and ucomiss when ;; setting FLAGS_REG. But it is not a really compare instruction. 
(define_insn "avx_vtest<ssemodesuffix><avxsizesuffix>" diff --git a/gcc/testsuite/gcc.target/i386/pr92658-avx2.c b/gcc/testsuite/gcc.target/i386/pr92658-avx2.c new file mode 100644 index 00000000000..21fa3e5530f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92658-avx2.c @@ -0,0 +1,192 @@ +/* PR target/92658 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mavx2" } */ + +typedef unsigned char v32qi __attribute__((vector_size (32))); +typedef unsigned short v16hi __attribute__((vector_size (32))); +typedef unsigned int v8si __attribute__((vector_size (32))); +typedef unsigned long long v4di __attribute__((vector_size (32))); + +void +foo_u8_u16 (v16hi * dst, v32qi * __restrict src) +{ + unsigned short tem[16]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + tem[8] = (*src)[8]; + tem[9] = (*src)[9]; + tem[10] = (*src)[10]; + tem[11] = (*src)[11]; + tem[12] = (*src)[12]; + tem[13] = (*src)[13]; + tem[14] = (*src)[14]; + tem[15] = (*src)[15]; + dst[0] = *(v16hi *) tem; +} + +void +bar_u8_u16 (v16hi * dst, v32qi src) +{ + unsigned short tem[16]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + tem[8] = src[8]; + tem[9] = src[9]; + tem[10] = src[10]; + tem[11] = src[11]; + tem[12] = src[12]; + tem[13] = src[13]; + tem[14] = src[14]; + tem[15] = src[15]; + dst[0] = *(v16hi *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbw" 2 } } */ + +void +foo_u8_u32 (v8si * dst, v32qi * __restrict src) +{ + unsigned int tem[8]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + dst[0] = *(v8si *) tem; +} + +void +bar_u8_u32 (v8si * dst, v32qi src) +{ + unsigned int tem[8]; + tem[0] = src[0]; + 
tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8si *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbd" 2 } } */ + +void +foo_u8_u64 (v4di * dst, v32qi * __restrict src) +{ + unsigned long long tem[4]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + dst[0] = *(v4di *) tem; +} + +void +bar_u8_u64 (v4di * dst, v32qi src) +{ + unsigned long long tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */ + +void +foo_u16_u32 (v8si * dst, v16hi * __restrict src) +{ + unsigned int tem[8]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + dst[0] = *(v8si *) tem; +} + +void +bar_u16_u32 (v8si * dst, v16hi src) +{ + unsigned int tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8si *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxwd" 2 } } */ + +void +foo_u16_u64 (v4di * dst, v16hi * __restrict src) +{ + unsigned long long tem[4]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + dst[0] = *(v4di *) tem; +} + +void +bar_u16_u64 (v4di * dst, v16hi src) +{ + unsigned long long tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxwq" 2 } } */ + +void +foo_u32_u64 (v4di * dst, v8si * __restrict src) +{ + unsigned long long tem[4]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + dst[0] = *(v4di *) tem; +} + +void +bar_u32_u64 (v4di * dst, v8si src) +{ + unsigned 
long long tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxdq" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92658-avx512bw.c b/gcc/testsuite/gcc.target/i386/pr92658-avx512bw.c new file mode 100644 index 00000000000..b1d54d24a81 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92658-avx512bw.c @@ -0,0 +1,280 @@ +/* PR target/92658 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mavx512bw" } */ + +typedef unsigned char v64qi __attribute__((vector_size (64))); +typedef unsigned short v32hi __attribute__((vector_size (64))); +typedef unsigned int v16si __attribute__((vector_size (64))); +typedef unsigned long long v8di __attribute__((vector_size (64))); + +void +foo_u8_u16 (v32hi * dst, v64qi * __restrict src) +{ + unsigned short tem[32]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + tem[8] = (*src)[8]; + tem[9] = (*src)[9]; + tem[10] = (*src)[10]; + tem[11] = (*src)[11]; + tem[12] = (*src)[12]; + tem[13] = (*src)[13]; + tem[14] = (*src)[14]; + tem[15] = (*src)[15]; + tem[16] = (*src)[16]; + tem[17] = (*src)[17]; + tem[18] = (*src)[18]; + tem[19] = (*src)[19]; + tem[20] = (*src)[20]; + tem[21] = (*src)[21]; + tem[22] = (*src)[22]; + tem[23] = (*src)[23]; + tem[24] = (*src)[24]; + tem[25] = (*src)[25]; + tem[26] = (*src)[26]; + tem[27] = (*src)[27]; + tem[28] = (*src)[28]; + tem[29] = (*src)[29]; + tem[30] = (*src)[30]; + tem[31] = (*src)[31]; + dst[0] = *(v32hi *) tem; +} + +void +bar_u8_u16 (v32hi * dst, v64qi src) +{ + unsigned short tem[32]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + tem[8] = src[8]; + tem[9] = src[9]; + tem[10] = src[10]; + tem[11] = src[11]; + tem[12] = src[12]; + 
tem[13] = src[13]; + tem[14] = src[14]; + tem[15] = src[15]; + tem[16] = src[16]; + tem[17] = src[17]; + tem[18] = src[18]; + tem[19] = src[19]; + tem[20] = src[20]; + tem[21] = src[21]; + tem[22] = src[22]; + tem[23] = src[23]; + tem[24] = src[24]; + tem[25] = src[25]; + tem[26] = src[26]; + tem[27] = src[27]; + tem[28] = src[28]; + tem[29] = src[29]; + tem[30] = src[30]; + tem[31] = src[31]; + dst[0] = *(v32hi *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbw" 2 } } */ + +void +foo_u8_u32 (v16si * dst, v64qi * __restrict src) +{ + unsigned int tem[16]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + tem[8] = (*src)[8]; + tem[9] = (*src)[9]; + tem[10] = (*src)[10]; + tem[11] = (*src)[11]; + tem[12] = (*src)[12]; + tem[13] = (*src)[13]; + tem[14] = (*src)[14]; + tem[15] = (*src)[15]; + dst[0] = *(v16si *) tem; +} + +void +bar_u8_u32 (v16si * dst, v64qi src) +{ + unsigned int tem[16]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + tem[8] = src[8]; + tem[9] = src[9]; + tem[10] = src[10]; + tem[11] = src[11]; + tem[12] = src[12]; + tem[13] = src[13]; + tem[14] = src[14]; + tem[15] = src[15]; + dst[0] = *(v16si *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbd" 2 } } */ + +void +foo_u8_u64 (v8di * dst, v64qi * __restrict src) +{ + unsigned long long tem[8]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + dst[0] = *(v8di *) tem; +} + +void +bar_u8_u64 (v8di * dst, v64qi src) +{ + unsigned long long tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8di *) tem; +} + 
+/* { dg-final { scan-assembler-times "pmovzxbq" 2 } } */ + +void +foo_u16_u32 (v16si * dst, v32hi * __restrict src) +{ + unsigned int tem[16]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + tem[8] = (*src)[8]; + tem[9] = (*src)[9]; + tem[10] = (*src)[10]; + tem[11] = (*src)[11]; + tem[12] = (*src)[12]; + tem[13] = (*src)[13]; + tem[14] = (*src)[14]; + tem[15] = (*src)[15]; + dst[0] = *(v16si *) tem; +} + +void +bar_u16_u32 (v16si * dst, v32hi src) +{ + unsigned int tem[16]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + tem[8] = src[8]; + tem[9] = src[9]; + tem[10] = src[10]; + tem[11] = src[11]; + tem[12] = src[12]; + tem[13] = src[13]; + tem[14] = src[14]; + tem[15] = src[15]; + dst[0] = *(v16si *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxwd" 2 } } */ + +void +foo_u16_u64 (v8di * dst, v32hi * __restrict src) +{ + unsigned long long tem[8]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + dst[0] = *(v8di *) tem; +} + +void +bar_u16_u64 (v8di * dst, v32hi src) +{ + unsigned long long tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxwq" 2 } } */ + +void +foo_u32_u64 (v8di * dst, v16si * __restrict src) +{ + unsigned long long tem[8]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + dst[0] = *(v8di *) tem; +} + +void +bar_u32_u64 (v8di * dst, v16si src) +{ + unsigned long long 
tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxdq" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92658-sse4.c b/gcc/testsuite/gcc.target/i386/pr92658-sse4.c new file mode 100644 index 00000000000..e4626292e95 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92658-sse4.c @@ -0,0 +1,148 @@ +/* PR target/92658 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msse4.1" } */ + +typedef unsigned char v16qi __attribute__((vector_size (16))); +typedef unsigned short v8hi __attribute__((vector_size (16))); +typedef unsigned int v4si __attribute__((vector_size (16))); +typedef unsigned long long v2di __attribute__((vector_size (16))); + +void +foo_u8_u16 (v8hi * dst, v16qi * __restrict src) +{ + unsigned short tem[8]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + dst[0] = *(v8hi *) tem; +} + +void +bar_u8_u16 (v8hi * dst, v16qi src) +{ + unsigned short tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8hi *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbw" 2 } } */ + +void +foo_u8_u32 (v4si * dst, v16qi * __restrict src) +{ + unsigned int tem[4]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + dst[0] = *(v4si *) tem; +} + +void +bar_u8_u32 (v4si * dst, v16qi src) +{ + unsigned int tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4si *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbd" 2 { xfail *-*-* } } } */ + +void +foo_u8_u64 (v2di * dst, v16qi * __restrict src) +{ + unsigned long long tem[2]; + 
tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + dst[0] = *(v2di *) tem; +} + +void +bar_u8_u64 (v2di * dst, v16qi src) +{ + unsigned long long tem[2]; + tem[0] = src[0]; + tem[1] = src[1]; + dst[0] = *(v2di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */ + +void +foo_u16_u32 (v4si * dst, v8hi * __restrict src) +{ + unsigned int tem[4]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + dst[0] = *(v4si *) tem; +} + +void +bar_u16_u32 (v4si * dst, v8hi src) +{ + unsigned int tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4si *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxwd" 2 } } */ + +void +foo_u16_u64 (v2di * dst, v8hi * __restrict src) +{ + unsigned long long tem[2]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + dst[0] = *(v2di *) tem; +} + +void +bar_u16_u64 (v2di * dst, v8hi src) +{ + unsigned long long tem[2]; + tem[0] = src[0]; + tem[1] = src[1]; + dst[0] = *(v2di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxwq" 2 { xfail *-*-* } } } */ + +void +foo_u32_u64 (v2di * dst, v4si * __restrict src) +{ + unsigned long long tem[2]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + dst[0] = *(v2di *) tem; +} + +void +bar_u32_u64 (v2di * dst, v4si src) +{ + unsigned long long tem[2]; + tem[0] = src[0]; + tem[1] = src[1]; + dst[0] = *(v2di *) tem; +} + +/* { dg-final { scan-assembler-times "pmovzxdq" 2 } } */