Hi!

This patch improves insertion of a single scalar into the first element
of an otherwise zero vector, for 256-bit and 512-bit vectors.  As the
128-bit vmovd/vmovq/vmovss/vinsertps instructions all clear the upper
bits of the target register, there is no need to emit anything but one
of these instructions (or, when tuning for AMD or generic, their
memory-operand forms).  E.g. given:

typedef long long v4di __attribute__((vector_size (32)));
typedef int v8si __attribute__((vector_size (32)));
typedef double v4df __attribute__((vector_size (32)));
typedef float v8sf __attribute__((vector_size (32)));

v4di f1 (long long x) { return (v4di) { x }; }
v8si f2 (int x) { return (v8si) { x }; }
v4df f3 (double x) { return (v4df) { x }; }
v8sf f4 (float x) { return (v8sf) { x }; }

#ifdef __AVX512F__
typedef long long v8di __attribute__((vector_size (64)));
typedef int v16si __attribute__((vector_size (64)));
typedef double v8df __attribute__((vector_size (64)));
typedef float v16sf __attribute__((vector_size (64)));

v8di f5 (long long x) { return (v8di) { x }; }
v16si f6 (int x) { return (v16si) { x }; }
v8df f7 (double x) { return (v8df) { x }; }
v16sf f8 (float x) { return (v16sf) { x }; }
#endif

with -O2 -m64 -mavx512{bw,dq,vl} -mtune=intel, the difference with the
patch is:

f1:
 	vmovq	%rdi, %xmm0
-	vmovdqa	%xmm0, %xmm0
 	ret
f2:
 	vmovd	%edi, %xmm0
-	vmovdqa	%xmm0, %xmm0
 	ret
f3:
 	vmovq	%xmm0, %xmm0
-	vmovapd	%xmm0, %xmm0
 	ret
f4:
 	vinsertps	$0xe, %xmm0, %xmm0, %xmm0
-	vmovaps	%xmm0, %xmm0
 	ret
f5:
 	vmovq	%rdi, %xmm0
-	vmovdqa	%xmm0, %xmm0
-	vmovdqa	%ymm0, %ymm0
 	ret
f6:
-	vpxor	%xmm1, %xmm1, %xmm1
 	vmovd	%edi, %xmm0
-	vpunpcklqdq	%xmm1, %xmm0, %xmm0
-	vmovdqa	%xmm0, %xmm0
-	vmovdqa	%ymm0, %ymm0
 	ret
f7:
 	vmovq	%xmm0, %xmm0
-	vmovapd	%xmm0, %xmm0
-	vmovapd	%ymm0, %ymm0
 	ret
f8:
-	pushq	%rbp
-	vxorps	%xmm1, %xmm1, %xmm1
-	movq	%rsp, %rbp
-	andq	$-64, %rsp
-	vmovss	%xmm0, -4(%rsp)
-	vmovss	-4(%rsp), %xmm0
-	vmovlhps	%xmm1, %xmm0, %xmm0
-	vmovaps	%xmm0, %xmm0
-	vmovaps	%ymm0, %ymm0
-	leave
+	vinsertps	$0xe, %xmm0, %xmm0, %xmm0
 	ret
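A testcase along the following lines (illustrative sketch only, not
included in this patch; options and scanned mnemonics are just one
possible choice) can make sure the redundant register clears don't
come back:

/* { dg-do compile } */
/* { dg-options "-O2 -mavx2 -mtune=intel" } */

typedef int v8si __attribute__((vector_size (32)));

v8si
f2 (int x)
{
  return (v8si) { x };
}

/* vmovd already zeroes the upper bits, so no vmovdqa should remain.  */
/* { dg-final { scan-assembler "vmovd" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */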
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2018-01-11  Jakub Jelinek  <ja...@redhat.com>

	PR target/83203
	* config/i386/i386.c (ix86_expand_vector_init_one_nonzero): If
	one_var is 0, for V{8,16}S[IF] and V[48]D[IF]mode use
	gen_vec_set<mode>_0.
	* config/i386/sse.md (VI8_AVX_AVX512F, VI4F_256_512): New mode
	iterators.
	(ssescalarmodesuffix): Add 512-bit vectors.  Use "d" or "q" for
	integral modes instead of "ss" and "sd".
	(vec_set<mode>_0): New define_insns for 256-bit and 512-bit
	vectors with 32-bit and 64-bit elements.
	(vecdupssescalarmodesuffix): New mode attribute.
	(vec_dup<mode>): Use it.
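To ease review: the RTL shape the new vec_set<mode>_0 patterns match is
roughly the following (V8SImode shown; pseudo register numbers are made
up for illustration).  Operand 1 must satisfy const0_operand, so the
patterns only apply when the remaining elements are known to be zero:

(set (reg:V8SI 82)
     (vec_merge:V8SI
       (vec_duplicate:V8SI (reg:SI 83))
       (const_vector:V8SI [(const_int 0) (const_int 0) (const_int 0)
			   (const_int 0) (const_int 0) (const_int 0)
			   (const_int 0) (const_int 0)])
       (const_int 1)))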
+(define_insn "vec_set<mode>_0" + [(set (match_operand:VF2_512_256 0 "register_operand" "=v") + (vec_merge:VF2_512_256 + (vec_duplicate:VF2_512_256 + (match_operand:<ssescalarmode> 2 "general_operand" "xm")) + (match_operand:VF2_512_256 1 "const0_operand" "C") + (const_int 1)))] + "TARGET_AVX" + "vmovq\t{%2, %x0|%x0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "DF")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integer down-conversion operations @@ -13993,6 +14035,22 @@ (define_insn "vec_concatv2di" (const_string "orig"))) (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")]) +;; vmovq clears also the higher bits. +(define_insn "vec_set<mode>_0" + [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=Yi,v") + (vec_merge:VI8_AVX_AVX512F + (vec_duplicate:VI8_AVX_AVX512F + (match_operand:<ssescalarmode> 2 "general_operand" "r,vm")) + (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C") + (const_int 1)))] + "TARGET_AVX" + "vmovq\t{%2, %x0|%x0, %2}" + [(set_attr "isa" "x64,*") + (set_attr "type" "ssemov") + (set_attr "prefix_rex" "1,*") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "TI")]) + (define_expand "vec_unpacks_lo_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")] @@ -17743,6 +17801,8 @@ (define_insn "avx2_vbroadcasti128_<mode> ;; Modes handled by AVX vec_dup patterns. (define_mode_iterator AVX_VEC_DUP_MODE [V8SI V8SF V4DI V4DF]) +(define_mode_attr vecdupssescalarmodesuffix + [(V8SF "ss") (V4DF "sd") (V8SI "ss") (V4DI "sd")]) ;; Modes handled by AVX2 vec_dup patterns. (define_mode_iterator AVX2_VEC_DUP_MODE [V32QI V16QI V16HI V8HI V8SI V4SI]) @@ -17769,7 +17829,7 @@ (define_insn "vec_dup<mode>" "TARGET_AVX" "@ v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1} - vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1} + vbroadcast<vecdupssescalarmodesuffix>\t{%1, %0|%0, %1} v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1} v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %g0|%g0, %x1} #" Jakub