On Thu, Jan 11, 2018 at 7:59 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> Hi!
>
> This patch improves insertion of a single scalar into the first element
> of otherwise empty vector for 256-bit and 512-bit vectors.
> As 128-bit vmovd/vmovq/vmovss/vinsertps all clear all the upper bits
> of the target, there is no need to do anything but these instructions
> (or, when tuning for amd or generic, these from memory).
> E.g. given:
> typedef long long v4di __attribute__((vector_size (32)));
> typedef int v8si __attribute__((vector_size (32)));
> typedef double v4df __attribute__((vector_size (32)));
> typedef float v8sf __attribute__((vector_size (32)));
>
> v4di
> f1 (long long x)
> {
>   return (v4di) { x };
> }
>
> v8si
> f2 (int x)
> {
>   return (v8si) { x };
> }
>
> v4df
> f3 (double x)
> {
>   return (v4df) { x };
> }
>
> v8sf
> f4 (float x)
> {
>   return (v8sf) { x };
> }
>
> #ifdef __AVX512F__
> typedef long long v8di __attribute__((vector_size (64)));
> typedef int v16si __attribute__((vector_size (64)));
> typedef double v8df __attribute__((vector_size (64)));
> typedef float v16sf __attribute__((vector_size (64)));
>
> v8di
> f5 (long long x)
> {
>   return (v8di) { x };
> }
>
> v16si
> f6 (int x)
> {
>   return (v16si) { x };
> }
>
> v8df
> f7 (double x)
> {
>   return (v8df) { x };
> }
>
> v16sf
> f8 (float x)
> {
>   return (v16sf) { x };
> }
> #endif
>
> with -O2 -m64 -mavx512{bw,dq,vl} -mtune=intel, the difference with the
> patch is:
> f1:
> 	vmovq	%rdi, %xmm0
> -	vmovdqa	%xmm0, %xmm0
> 	ret
> f2:
> 	vmovd	%edi, %xmm0
> -	vmovdqa	%xmm0, %xmm0
> 	ret
> f3:
> 	vmovq	%xmm0, %xmm0
> -	vmovapd	%xmm0, %xmm0
> 	ret
> f4:
> 	vinsertps	$0xe, %xmm0, %xmm0, %xmm0
> -	vmovaps	%xmm0, %xmm0
> 	ret
> f5:
> 	vmovq	%rdi, %xmm0
> -	vmovdqa	%xmm0, %xmm0
> -	vmovdqa	%ymm0, %ymm0
> 	ret
> f6:
> -	vpxor	%xmm1, %xmm1, %xmm1
> 	vmovd	%edi, %xmm0
> -	vpunpcklqdq	%xmm1, %xmm0, %xmm0
> -	vmovdqa	%xmm0, %xmm0
> -	vmovdqa	%ymm0, %ymm0
> 	ret
> f7:
> 	vmovq	%xmm0, %xmm0
> -	vmovapd	%xmm0, %xmm0
> -	vmovapd	%ymm0, %ymm0
> 	ret
> f8:
> -	pushq	%rbp
> -	vxorps	%xmm1, %xmm1, %xmm1
> -	movq	%rsp, %rbp
> -	andq	$-64, %rsp
> -	vmovss	%xmm0, -4(%rsp)
> -	vmovss	-4(%rsp), %xmm0
> -	vmovlhps	%xmm1, %xmm0, %xmm0
> -	vmovaps	%xmm0, %xmm0
> -	vmovaps	%ymm0, %ymm0
> -	leave
> +	vinsertps	$0xe, %xmm0, %xmm0, %xmm0
> 	ret
>
> Bootstrapped/regtested on x86_64-linux and i686-linux,
> ok for trunk?
>
> 2018-01-11  Jakub Jelinek  <ja...@redhat.com>
>
> 	PR target/83203
> 	* config/i386/i386.c (ix86_expand_vector_init_one_nonzero): If one_var
> 	is 0, for V{8,16}S[IF] and V[48]D[IF]mode use gen_vec_set<mode>_0.
> 	* config/i386/sse.md (VI8_AVX_AVX512F, VI4F_256_512): New mode
> 	iterators.
> 	(ssescalarmodesuffix): Add 512-bit vectors.  Use "d" or "q" for
> 	integral modes instead of "ss" and "sd".
> 	(vec_set<mode>_0): New define_insns for 256-bit and 512-bit
> 	vectors with 32-bit and 64-bit elements.
> 	(vecdupssescalarmodesuffix): New mode attribute.
> 	(vec_dup<mode>): Use it.

OK. Thanks, Uros.

> --- gcc/config/i386/i386.c.jj	2018-01-10 17:08:56.076912734 +0100
> +++ gcc/config/i386/i386.c	2018-01-11 15:32:12.686848932 +0100
> @@ -41762,6 +41762,7 @@ ix86_expand_vector_init_one_nonzero (boo
>    rtx new_target;
>    rtx x, tmp;
>    bool use_vector_set = false;
> +  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
>
>    switch (mode)
>      {
> @@ -41786,14 +41787,41 @@ ix86_expand_vector_init_one_nonzero (boo
>        break;
>      case E_V32QImode:
>      case E_V16HImode:
> +      use_vector_set = TARGET_AVX;
> +      break;
>      case E_V8SImode:
> +      use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv8si_0;
> +      break;
>      case E_V8SFmode:
> +      use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv8sf_0;
> +      break;
>      case E_V4DFmode:
>        use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv4df_0;
>        break;
>      case E_V4DImode:
>        /* Use ix86_expand_vector_set in 64bit mode only.  */
>        use_vector_set = TARGET_AVX && TARGET_64BIT;
> +      gen_vec_set_0 = gen_vec_setv4di_0;
> +      break;
> +    case E_V16SImode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv16si_0;
> +      break;
> +    case E_V16SFmode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv16sf_0;
> +      break;
> +    case E_V8DFmode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv8df_0;
> +      break;
> +    case E_V8DImode:
> +      /* Use ix86_expand_vector_set in 64bit mode only.  */
> +      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv8di_0;
>        break;
>      default:
>        break;
> @@ -41801,6 +41829,12 @@ ix86_expand_vector_init_one_nonzero (boo
>
>    if (use_vector_set)
>      {
> +      if (gen_vec_set_0 && one_var == 0)
> +	{
> +	  var = force_reg (GET_MODE_INNER (mode), var);
> +	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
> +	  return true;
> +	}
>        emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
>        var = force_reg (GET_MODE_INNER (mode), var);
>        ix86_expand_vector_set (mmx_ok, target, var, one_var);
> --- gcc/config/i386/sse.md.jj	2018-01-05 17:39:34.591260408 +0100
> +++ gcc/config/i386/sse.md	2018-01-11 15:52:22.762139416 +0100
> @@ -401,6 +401,9 @@ (define_mode_iterator VI8_AVX2
>  (define_mode_iterator VI8_AVX2_AVX512F
>    [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
>
> +(define_mode_iterator VI8_AVX_AVX512F
> +  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX")])
> +
>  (define_mode_iterator VI4_128_8_256
>    [V4SI V4DI])
>
> @@ -622,6 +625,9 @@ (define_mode_iterator VI4F_128 [V4SI V4S
>  (define_mode_iterator VI8F_128 [V2DI V2DF])
>  (define_mode_iterator VI4F_256 [V8SI V8SF])
>  (define_mode_iterator VI8F_256 [V4DI V4DF])
> +(define_mode_iterator VI4F_256_512
> +  [V8SI V8SF
> +   (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
>  (define_mode_iterator VI48F_256_512
>    [V8SI V8SF
>     (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
> @@ -838,10 +844,12 @@ (define_mode_attr sseintprefix
>  ;; SSE scalar suffix for vector modes
>  (define_mode_attr ssescalarmodesuffix
>    [(SF "ss") (DF "sd")
> +   (V16SF "ss") (V8DF "sd")
>     (V8SF "ss") (V4DF "sd")
>     (V4SF "ss") (V2DF "sd")
> -   (V8SI "ss") (V4DI "sd")
> -   (V4SI "d")])
> +   (V16SI "d") (V8DI "q")
> +   (V8SI "d") (V4DI "q")
> +   (V4SI "d") (V2DI "q")])
>
>  ;; Pack/unpack vector modes
>  (define_mode_attr sseunpackmode
> @@ -7092,6 +7100,26 @@ (define_insn "*vec_setv4sf_sse4_1"
>    (set_attr "prefix" "orig,orig,maybe_evex")
>    (set_attr "mode" "V4SF")])
>
> +;; All of vinsertps, vmovss, vmovd clear also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,Yi")
> +	(vec_merge:VI4F_256_512
> +	  (vec_duplicate:VI4F_256_512
> +	    (match_operand:<ssescalarmode> 2 "general_operand" "v,m,r"))
> +	  (match_operand:VI4F_256_512 1 "const0_operand" "C,C,C")
> +	  (const_int 1)))]
> +  "TARGET_AVX"
> +  "@
> +   vinsertps\t{$0xe, %2, %2, %x0|%x0, %2, %2, 0xe}
> +   vmov<ssescalarmodesuffix>\t{%x2, %x0|%x0, %2}
> +   vmovd\t{%2, %x0|%x0, %2}"
> +  [(set (attr "type")
> +     (if_then_else (eq_attr "alternative" "0")
> +		   (const_string "sselog")
> +		   (const_string "ssemov")))
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "SF,<ssescalarmode>,SI")])
> +
>  (define_insn "sse4_1_insertps"
>    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
>  	(unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> @@ -9220,6 +9248,20 @@ (define_insn "vec_concatv2df"
>  	    (const_string "orig")))
>     (set_attr "mode" "V2DF,V2DF,V2DF, DF, DF, V1DF,V1DF,DF,V4SF,V2SF")])
>
> +;; vmovq clears also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
> +	(vec_merge:VF2_512_256
> +	  (vec_duplicate:VF2_512_256
> +	    (match_operand:<ssescalarmode> 2 "general_operand" "xm"))
> +	  (match_operand:VF2_512_256 1 "const0_operand" "C")
> +	  (const_int 1)))]
> +  "TARGET_AVX"
> +  "vmovq\t{%2, %x0|%x0, %2}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "DF")])
> +
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>  ;;
>  ;; Parallel integer down-conversion operations
> @@ -13993,6 +14035,22 @@ (define_insn "vec_concatv2di"
>  	    (const_string "orig")))
>     (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
>
> +;; vmovq clears also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=Yi,v")
> +	(vec_merge:VI8_AVX_AVX512F
> +	  (vec_duplicate:VI8_AVX_AVX512F
> +	    (match_operand:<ssescalarmode> 2 "general_operand" "r,vm"))
> +	  (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C")
> +	  (const_int 1)))]
> +  "TARGET_AVX"
> +  "vmovq\t{%2, %x0|%x0, %2}"
> +  [(set_attr "isa" "x64,*")
> +   (set_attr "type" "ssemov")
> +   (set_attr "prefix_rex" "1,*")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "TI")])
> +
>  (define_expand "vec_unpacks_lo_<mode>"
>    [(match_operand:<sseunpackmode> 0 "register_operand")
>     (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
> @@ -17743,6 +17801,8 @@ (define_insn "avx2_vbroadcasti128_<mode>
>  ;; Modes handled by AVX vec_dup patterns.
>  (define_mode_iterator AVX_VEC_DUP_MODE
>    [V8SI V8SF V4DI V4DF])
> +(define_mode_attr vecdupssescalarmodesuffix
> +  [(V8SF "ss") (V4DF "sd") (V8SI "ss") (V4DI "sd")])
>  ;; Modes handled by AVX2 vec_dup patterns.
>  (define_mode_iterator AVX2_VEC_DUP_MODE
>    [V32QI V16QI V16HI V8HI V8SI V4SI])
> @@ -17769,7 +17829,7 @@ (define_insn "vec_dup<mode>"
>    "TARGET_AVX"
>    "@
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}
> -   vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1}
> +   vbroadcast<vecdupssescalarmodesuffix>\t{%1, %0|%0, %1}
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %g0|%g0, %x1}
>     #"
>
> 	Jakub