2021-06-24 Uroš Bizjak <ubiz...@gmail.com> gcc/ PR target/89021 * config/i386/i386-expand.c (ix86_expand_sse_unpack): Handle V8QI and V4HI modes. * config/i386/mmx.md (sse4_1_<any_extend:code>v4qiv4hi2): New insn pattern. (sse4_1_<any_extend:code>v2hiv2si2): Ditto. (mmxpackmode): New mode attribute. (vec_pack_trunc_<mmxpackmode:mode>): New expander. (mmxunpackmode): New mode attribute. (vec_unpacks_lo_<mmxunpackmode:mode>): New expander. (vec_unpacks_hi_<mmxunpackmode:mode>): Ditto. (vec_unpacku_lo_<mmxunpackmode:mode>): Ditto. (vec_unpacku_hi_<mmxunpackmode:mode>): Ditto. * config/i386/sse.md (extsuffix): Move from ... * config/i386/i386.md: ... here.
gcc/testsuite/ PR target/89021 * gcc.target/i386/pr97249-1.c (foo): Add #pragma to avoid loop vectorization. (foo1): Ditto. (foo2): Ditto. Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. There is still one scan-tree-dump-not failure in the generic vectorization testsuite: FAIL: gcc.dg/vect/vect-nb-iter-ub-3.c scan-tree-dump-not cunroll "loop turned into non-loop; it never loops" This probably happens due to the additional epilogue vectorization, but I don't know how to "fix" this failure. Richi, can you perhaps help me here? Uros.
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 2cb939e51c3..e9763eb5b3e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -5161,6 +5161,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_sse4_1_sign_extendv2siv2di2; break; + case E_V8QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv4qiv4hi2; + else + unpack = gen_sse4_1_sign_extendv4qiv4hi2; + break; + case E_V4HImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2hiv2si2; + else + unpack = gen_sse4_1_sign_extendv2hiv2si2; + break; default: gcc_unreachable (); } @@ -5172,10 +5184,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) } else if (high_p) { - /* Shift higher 8 bytes to lower 8 bytes. */ - tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), - GEN_INT (64))); + switch (GET_MODE_SIZE (imode)) + { + case 16: + /* Shift higher 8 bytes to lower 8 bytes. */ + tmp = gen_reg_rtx (V1TImode); + emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), + GEN_INT (64))); + break; + case 8: + /* Shift higher 4 bytes to lower 4 bytes. 
*/ + tmp = gen_reg_rtx (V1DImode); + emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src), + GEN_INT (32))); + break; + default: + gcc_unreachable (); + } + tmp = gen_lowpart (imode, tmp); } else @@ -5207,6 +5233,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_vec_interleave_lowv4si; break; + case E_V8QImode: + if (high_p) + unpack = gen_mmx_punpckhbw; + else + unpack = gen_mmx_punpcklbw; + break; + case E_V4HImode: + if (high_p) + unpack = gen_mmx_punpckhwd; + else + unpack = gen_mmx_punpcklwd; + break; default: gcc_unreachable (); } diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 9043be3105d..9b619e2f78f 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1000,6 +1000,9 @@ (define_code_iterator any_truncate [ss_truncate truncate us_truncate]) (define_code_attr trunsuffix [(ss_truncate "s") (truncate "") (us_truncate "us")]) +;; Instruction suffix for SSE sign and zero extensions. +(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) + ;; Used in signed and unsigned fix. 
(define_code_iterator any_fix [fix unsigned_fix]) (define_code_attr fixsuffix [(fix "") (unsigned_fix "u")]) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 7a827dceb01..e887f03474d 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2639,6 +2639,78 @@ (define_insn_and_split "mmx_punpckldq" (set_attr "type" "mmxcvt,sselog,sselog") (set_attr "mode" "DI,TI,TI")]) +(define_insn "sse4_1_<code>v4qiv4hi2" + [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw") + (any_extend:V4HI + (vec_select:V4QI + (match_operand:V8QI 1 "register_operand" "Yr,*x,Yw") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "%vpmov<extsuffix>bw\t{%1, %0|%0, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_<code>v2hiv2si2" + [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v") + (any_extend:V2SI + (vec_select:V2HI + (match_operand:V4HI 1 "register_operand" "Yr,*x,v") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "%vpmov<extsuffix>wd\t{%1, %0|%0, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +;; Pack/unpack vector modes +(define_mode_attr mmxpackmode + [(V4HI "V8QI") (V2SI "V4HI")]) + +(define_expand "vec_pack_trunc_<mode>" + [(match_operand:<mmxpackmode> 0 "register_operand") + (match_operand:MMXMODE24 1 "register_operand") + (match_operand:MMXMODE24 2 "register_operand")] + "TARGET_MMX_WITH_SSE" +{ + rtx op1 = gen_lowpart (<mmxpackmode>mode, operands[1]); + rtx op2 = gen_lowpart (<mmxpackmode>mode, operands[2]); + ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); + DONE; +}) + +(define_mode_attr mmxunpackmode + [(V8QI "V4HI") (V4HI "V2SI")]) + 
+(define_expand "vec_unpacks_lo_<mode>" + [(match_operand:<mmxunpackmode> 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;") + +(define_expand "vec_unpacks_hi_<mode>" + [(match_operand:<mmxunpackmode> 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;") + +(define_expand "vec_unpacku_lo_<mode>" + [(match_operand:<mmxunpackmode> 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;") + +(define_expand "vec_unpacku_hi_<mode>" + [(match_operand:<mmxunpackmode> 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;") + (define_insn "*mmx_pinsrd" [(set (match_operand:V2SI 0 "register_operand" "=x,Yv") (vec_merge:V2SI diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5bd65dd9312..d718a82cb58 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -976,9 +976,6 @@ (define_mode_attr castmode [(V8SI "si") (V8SF "ps") (V4DF "pd") (V16SI "si") (V16SF "ps") (V8DF "pd")]) -;; Instruction suffix for sign and zero extensions. -(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) - ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise. ;; i64x4 or f64x4 for 512bit modes. 
(define_mode_attr i128 diff --git a/gcc/testsuite/gcc.target/i386/pr97249-1.c b/gcc/testsuite/gcc.target/i386/pr97249-1.c index 4478a34a9f8..e7d1d74a208 100644 --- a/gcc/testsuite/gcc.target/i386/pr97249-1.c +++ b/gcc/testsuite/gcc.target/i386/pr97249-1.c @@ -8,23 +8,26 @@ void foo (unsigned char* p1, unsigned char* p2, short* __restrict p3) { - for (int i = 0 ; i != 8; i++) - p3[i] = p1[i] + p2[i]; - return; + /* Avoid loop vectorization. */ +#pragma GCC unroll 8 + for (int i = 0 ; i != 8; i++) + p3[i] = p1[i] + p2[i]; } void foo1 (unsigned short* p1, unsigned short* p2, int* __restrict p3) { - for (int i = 0 ; i != 4; i++) - p3[i] = p1[i] + p2[i]; - return; + /* Avoid loop vectorization. */ +#pragma GCC unroll 4 + for (int i = 0 ; i != 4; i++) + p3[i] = p1[i] + p2[i]; } void foo2 (unsigned int* p1, unsigned int* p2, long long* __restrict p3) { - for (int i = 0 ; i != 2; i++) - p3[i] = (long long)p1[i] + (long long)p2[i]; - return; + /* Avoid loop vectorization. */ +#pragma GCC unroll 2 + for (int i = 0 ; i != 2; i++) + p3[i] = (long long)p1[i] + (long long)p2[i]; }