On Thu, Feb 6, 2020 at 9:34 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > The following testcase shows that for _mm256_set*_m128i and similar > intrinsics, we sometimes generate bad code. All 4 routines are expressing > the same thing, a 128-bit vector zero padded to 256-bit vector, but only the > 3rd one actually emits the desired vmovdqa %xmm0, %xmm0 insn, the > others vpxor %xmm1, %xmm1, %xmm1; vinserti128 $0x1, %xmm1, %ymm0, > %ymm0 > The problem is that the cast builtins use UNSPEC_CAST which is after reload > simplified using a splitter, but during combine it prevents optimizations. > We do have avx_vec_concat* patterns that generate efficient code, both for > this low part + zero concatenation special case and for other cases too, so > the following define_insn_and_split just recognizes avx_vec_concat made of a > low half of a cast and some other reg. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2020-02-06 Jakub Jelinek <ja...@redhat.com> > > PR target/93594 > * config/i386/predicates.md (avx_identity_operand): New predicate. > * config/i386/sse.md (*avx_vec_concat<mode>_1): New > define_insn_and_split. > > * gcc.target/i386/avx2-pr93594.c: New test.
LGTM. Thanks, Uros. > --- gcc/config/i386/predicates.md.jj 2020-01-12 11:54:36.331414646 +0100 > +++ gcc/config/i386/predicates.md 2020-02-05 17:44:44.663517106 +0100 > @@ -1584,6 +1584,19 @@ (define_predicate "palignr_operand" > return true; > }) > > +;; Return true if OP is a parallel for identity permute. > +(define_predicate "avx_identity_operand" > + (and (match_code "parallel") > + (match_code "const_int" "a")) > +{ > + int i, nelt = XVECLEN (op, 0); > + > + for (i = 0; i < nelt; ++i) > + if (INTVAL (XVECEXP (op, 0, i)) != i) > + return false; > + return true; > +}) > + > ;; Return true if OP is a proper third operand to vpblendw256. > (define_predicate "avx2_pblendw_operand" > (match_code "const_int") > --- gcc/config/i386/sse.md.jj 2020-02-05 15:38:06.636292475 +0100 > +++ gcc/config/i386/sse.md 2020-02-05 17:55:06.696352286 +0100 > @@ -21358,6 +21358,24 @@ (define_insn "avx_vec_concat<mode>" > (set_attr "prefix" "maybe_evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split "*avx_vec_concat<mode>_1" > + [(set (match_operand:V_256_512 0 "register_operand") > + (vec_concat:V_256_512 > + (vec_select:<ssehalfvecmode> > + (unspec:V_256_512 > + [(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand")] > + UNSPEC_CAST) > + (match_parallel 3 "avx_identity_operand" > + [(match_operand 4 "const_int_operand")])) > + (match_operand:<ssehalfvecmode> 2 "nonimm_or_0_operand")))] > + "TARGET_AVX > + && (operands[2] == CONST0_RTX (<ssehalfvecmode>mode) > + || !MEM_P (operands[1])) > + && ix86_pre_reload_split ()" > + "#" > + "&& 1" > + [(set (match_dup 0) (vec_concat:V_256_512 (match_dup 1) (match_dup 2)))]) > + > (define_insn "vcvtph2ps<mask_name>" > [(set (match_operand:V4SF 0 "register_operand" "=v") > (vec_select:V4SF > --- gcc/testsuite/gcc.target/i386/avx2-pr93594.c.jj 2020-02-05 > 17:59:33.470416968 +0100 > +++ gcc/testsuite/gcc.target/i386/avx2-pr93594.c 2020-02-05 > 18:06:20.703403613 +0100 > @@ -0,0 +1,32 @@ > +/* PR target/93594 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2 -masm=att" } */ > +/* { dg-final { scan-assembler-times "vmovdqa\t%xmm0, %xmm0" 4 } } */ > +/* { dg-final { scan-assembler-not "vpxor\t%" } } */ > +/* { dg-final { scan-assembler-not "vinserti128\t\\\$" } } */ > + > +#include <x86intrin.h> > + > +__m256i > +foo (__m128i x) > +{ > + return _mm256_setr_m128i (x, _mm_setzero_si128 ()); > +} > + > +__m256i > +bar (__m128i x) > +{ > + return _mm256_set_m128i (_mm_setzero_si128 (), x); > +} > + > +__m256i > +baz (__m128i x) > +{ > + return _mm256_insertf128_si256 (_mm256_setzero_si256 (), x, 0); > +} > + > +__m256i > +qux (__m128i x) > +{ > + return _mm256_insertf128_si256 (_mm256_castsi128_si256 (x), > _mm_setzero_si128 (), 1); > +} > > Jakub >