On Thu, Jan 11, 2018 at 7:59 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> Hi!
>
> This patch improves insertion of a single scalar into the first element
> of an otherwise empty vector for 256-bit and 512-bit vectors.
> As the 128-bit vmovd/vmovq/vmovss/vinsertps instructions all clear the
> upper bits of the target, there is no need to emit anything beyond one
> of these instructions (or, when tuning for amd or generic, their
> memory-operand forms).
> E.g. given:
> typedef long long v4di __attribute__((vector_size (32)));
> typedef int v8si __attribute__((vector_size (32)));
> typedef double v4df __attribute__((vector_size (32)));
> typedef float v8sf __attribute__((vector_size (32)));
>
> v4di
> f1 (long long x)
> {
>   return (v4di) { x };
> }
>
> v8si
> f2 (int x)
> {
>   return (v8si) { x };
> }
>
> v4df
> f3 (double x)
> {
>   return (v4df) { x };
> }
>
> v8sf
> f4 (float x)
> {
>   return (v8sf) { x };
> }
>
> #ifdef __AVX512F__
> typedef long long v8di __attribute__((vector_size (64)));
> typedef int v16si __attribute__((vector_size (64)));
> typedef double v8df __attribute__((vector_size (64)));
> typedef float v16sf __attribute__((vector_size (64)));
>
> v8di
> f5 (long long x)
> {
>   return (v8di) { x };
> }
>
> v16si
> f6 (int x)
> {
>   return (v16si) { x };
> }
>
> v8df
> f7 (double x)
> {
>   return (v8df) { x };
> }
>
> v16sf
> f8 (float x)
> {
>   return (v16sf) { x };
> }
> #endif
>
> with -O2 -m64 -mavx512{bw,dq,vl} -mtune=intel, the patch changes the
> generated code as follows:
>  f1:
>         vmovq   %rdi, %xmm0
> -       vmovdqa %xmm0, %xmm0
>         ret
>  f2:
>         vmovd   %edi, %xmm0
> -       vmovdqa %xmm0, %xmm0
>         ret
>  f3:
>         vmovq   %xmm0, %xmm0
> -       vmovapd %xmm0, %xmm0
>         ret
>  f4:
>         vinsertps       $0xe, %xmm0, %xmm0, %xmm0
> -       vmovaps %xmm0, %xmm0
>         ret
>  f5:
>         vmovq   %rdi, %xmm0
> -       vmovdqa %xmm0, %xmm0
> -       vmovdqa %ymm0, %ymm0
>         ret
>  f6:
> -       vpxor   %xmm1, %xmm1, %xmm1
>         vmovd   %edi, %xmm0
> -       vpunpcklqdq     %xmm1, %xmm0, %xmm0
> -       vmovdqa %xmm0, %xmm0
> -       vmovdqa %ymm0, %ymm0
>         ret
>  f7:
>         vmovq   %xmm0, %xmm0
> -       vmovapd %xmm0, %xmm0
> -       vmovapd %ymm0, %ymm0
>         ret
>  f8:
> -       pushq   %rbp
> -       vxorps  %xmm1, %xmm1, %xmm1
> -       movq    %rsp, %rbp
> -       andq    $-64, %rsp
> -       vmovss  %xmm0, -4(%rsp)
> -       vmovss  -4(%rsp), %xmm0
> -       vmovlhps        %xmm1, %xmm0, %xmm0
> -       vmovaps %xmm0, %xmm0
> -       vmovaps %ymm0, %ymm0
> -       leave
> +       vinsertps       $0xe, %xmm0, %xmm0, %xmm0
>         ret
>
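[Side note, not part of the patch: the single-instruction form is correct
because the GNU C vector extension zero-initializes any elements the
initializer omits, so inserting the scalar into element 0 while clearing
everything above it yields exactly the requested vector.  A minimal
illustration (g1/g2 are hypothetical names, not from the patch):

typedef long long v4di __attribute__((vector_size (32)));

v4di
g1 (long long x)
{
  return (v4di) { x };          /* Remaining elements are implicitly 0.  */
}

v4di
g2 (long long x)
{
  return (v4di) { x, 0, 0, 0 }; /* Fully explicit form of the same value.  */
}

Both functions compute the same vector, which is why one zero-extending
vmovq suffices for f1 above.]
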
> Bootstrapped/regtested on x86_64-linux and i686-linux,
> ok for trunk?
>
> 2018-01-11  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/83203
>         * config/i386/i386.c (ix86_expand_vector_init_one_nonzero): If one_var
>         is 0, for V{8,16}S[IF] and V[48]D[IF]mode use gen_vec_set<mode>_0.
>         * config/i386/sse.md (VI8_AVX_AVX512F, VI4F_256_512): New mode
>         iterators.
>         (ssescalarmodesuffix): Add 512-bit vectors.  Use "d" or "q" for
>         integral modes instead of "ss" and "sd".
>         (vec_set<mode>_0): New define_insns for 256-bit and 512-bit
>         vectors with 32-bit and 64-bit elements.
>         (vecdupssescalarmodesuffix): New mode attribute.
>         (vec_dup<mode>): Use it.

OK.

Thanks,
Uros.

> --- gcc/config/i386/i386.c.jj   2018-01-10 17:08:56.076912734 +0100
> +++ gcc/config/i386/i386.c      2018-01-11 15:32:12.686848932 +0100
> @@ -41762,6 +41762,7 @@ ix86_expand_vector_init_one_nonzero (boo
>    rtx new_target;
>    rtx x, tmp;
>    bool use_vector_set = false;
> +  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
>
>    switch (mode)
>      {
> @@ -41786,14 +41787,41 @@ ix86_expand_vector_init_one_nonzero (boo
>        break;
>      case E_V32QImode:
>      case E_V16HImode:
> +      use_vector_set = TARGET_AVX;
> +      break;
>      case E_V8SImode:
> +      use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv8si_0;
> +      break;
>      case E_V8SFmode:
> +      use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv8sf_0;
> +      break;
>      case E_V4DFmode:
>        use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv4df_0;
>        break;
>      case E_V4DImode:
>        /* Use ix86_expand_vector_set in 64bit mode only.  */
>        use_vector_set = TARGET_AVX && TARGET_64BIT;
> +      gen_vec_set_0 = gen_vec_setv4di_0;
> +      break;
> +    case E_V16SImode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv16si_0;
> +      break;
> +    case E_V16SFmode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv16sf_0;
> +      break;
> +    case E_V8DFmode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv8df_0;
> +      break;
> +    case E_V8DImode:
> +      /* Use ix86_expand_vector_set in 64bit mode only.  */
> +      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv8di_0;
>        break;
>      default:
>        break;
> @@ -41801,6 +41829,12 @@ ix86_expand_vector_init_one_nonzero (boo
>
>    if (use_vector_set)
>      {
> +      if (gen_vec_set_0 && one_var == 0)
> +       {
> +         var = force_reg (GET_MODE_INNER (mode), var);
> +         emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
> +         return true;
> +       }
>        emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
>        var = force_reg (GET_MODE_INNER (mode), var);
>        ix86_expand_vector_set (mmx_ok, target, var, one_var);
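[For readers skimming the interleaved hunks above: with the patch applied,
the new fast path in ix86_expand_vector_init_one_nonzero is equivalent to
the following condensed sketch (a paraphrase, not the verbatim sources):

  /* If the single nonzero element is element 0 and the mode has a
     dedicated vec_set<mode>_0 pattern, emit that pattern directly: its
     one instruction both inserts the scalar and clears the remaining
     elements, so no separate zeroing of the target is needed.  */
  if (use_vector_set && gen_vec_set_0 && one_var == 0)
    {
      var = force_reg (GET_MODE_INNER (mode), var);
      emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
      return true;
    }

Note that the 512-bit cases set use_vector_set only when one_var == 0, so
they never reach the older zero-then-insert sequence.]
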
> --- gcc/config/i386/sse.md.jj   2018-01-05 17:39:34.591260408 +0100
> +++ gcc/config/i386/sse.md      2018-01-11 15:52:22.762139416 +0100
> @@ -401,6 +401,9 @@ (define_mode_iterator VI8_AVX2
>  (define_mode_iterator VI8_AVX2_AVX512F
>    [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
>
> +(define_mode_iterator VI8_AVX_AVX512F
> +  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX")])
> +
>  (define_mode_iterator VI4_128_8_256
>    [V4SI V4DI])
>
> @@ -622,6 +625,9 @@ (define_mode_iterator VI4F_128 [V4SI V4S
>  (define_mode_iterator VI8F_128 [V2DI V2DF])
>  (define_mode_iterator VI4F_256 [V8SI V8SF])
>  (define_mode_iterator VI8F_256 [V4DI V4DF])
> +(define_mode_iterator VI4F_256_512
> +  [V8SI V8SF
> +   (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
>  (define_mode_iterator VI48F_256_512
>    [V8SI V8SF
>    (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
> @@ -838,10 +844,12 @@ (define_mode_attr sseintprefix
>  ;; SSE scalar suffix for vector modes
>  (define_mode_attr ssescalarmodesuffix
>    [(SF "ss") (DF "sd")
> +   (V16SF "ss") (V8DF "sd")
>     (V8SF "ss") (V4DF "sd")
>     (V4SF "ss") (V2DF "sd")
> -   (V8SI "ss") (V4DI "sd")
> -   (V4SI "d")])
> +   (V16SI "d") (V8DI "q")
> +   (V8SI "d") (V4DI "q")
> +   (V4SI "d") (V2DI "q")])
>
>  ;; Pack/unpack vector modes
>  (define_mode_attr sseunpackmode
> @@ -7092,6 +7100,26 @@ (define_insn "*vec_setv4sf_sse4_1"
>     (set_attr "prefix" "orig,orig,maybe_evex")
>     (set_attr "mode" "V4SF")])
>
> +;; All of vinsertps, vmovss, vmovd clear also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,Yi")
> +       (vec_merge:VI4F_256_512
> +         (vec_duplicate:VI4F_256_512
> +           (match_operand:<ssescalarmode> 2 "general_operand" "v,m,r"))
> +         (match_operand:VI4F_256_512 1 "const0_operand" "C,C,C")
> +         (const_int 1)))]
> +  "TARGET_AVX"
> +  "@
> +   vinsertps\t{$0xe, %2, %2, %x0|%x0, %2, %2, 0xe}
> +   vmov<ssescalarmodesuffix>\t{%x2, %x0|%x0, %2}
> +   vmovd\t{%2, %x0|%x0, %2}"
> +  [(set (attr "type")
> +     (if_then_else (eq_attr "alternative" "0")
> +                  (const_string "sselog")
> +                  (const_string "ssemov")))
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "SF,<ssescalarmode>,SI")])
> +
>  (define_insn "sse4_1_insertps"
>    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
>         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> @@ -9220,6 +9248,20 @@ (define_insn "vec_concatv2df"
>            (const_string "orig")))
>     (set_attr "mode" "V2DF,V2DF,V2DF, DF, DF, V1DF,V1DF,DF,V4SF,V2SF")])
>
> +;; vmovq clears also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
> +       (vec_merge:VF2_512_256
> +         (vec_duplicate:VF2_512_256
> +           (match_operand:<ssescalarmode> 2 "general_operand" "xm"))
> +         (match_operand:VF2_512_256 1 "const0_operand" "C")
> +         (const_int 1)))]
> +  "TARGET_AVX"
> +  "vmovq\t{%2, %x0|%x0, %2}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "DF")])
> +
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>  ;;
>  ;; Parallel integer down-conversion operations
> @@ -13993,6 +14035,22 @@ (define_insn "vec_concatv2di"
>            (const_string "orig")))
>     (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
>
> +;; vmovq clears also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=Yi,v")
> +       (vec_merge:VI8_AVX_AVX512F
> +         (vec_duplicate:VI8_AVX_AVX512F
> +           (match_operand:<ssescalarmode> 2 "general_operand" "r,vm"))
> +         (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C")
> +         (const_int 1)))]
> +  "TARGET_AVX"
> +  "vmovq\t{%2, %x0|%x0, %2}"
> +  [(set_attr "isa" "x64,*")
> +   (set_attr "type" "ssemov")
> +   (set_attr "prefix_rex" "1,*")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "TI")])
> +
>  (define_expand "vec_unpacks_lo_<mode>"
>    [(match_operand:<sseunpackmode> 0 "register_operand")
>     (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
> @@ -17743,6 +17801,8 @@ (define_insn "avx2_vbroadcasti128_<mode>
>  ;; Modes handled by AVX vec_dup patterns.
>  (define_mode_iterator AVX_VEC_DUP_MODE
>    [V8SI V8SF V4DI V4DF])
> +(define_mode_attr vecdupssescalarmodesuffix
> +  [(V8SF "ss") (V4DF "sd") (V8SI "ss") (V4DI "sd")])
>  ;; Modes handled by AVX2 vec_dup patterns.
>  (define_mode_iterator AVX2_VEC_DUP_MODE
>    [V32QI V16QI V16HI V8HI V8SI V4SI])
> @@ -17769,7 +17829,7 @@ (define_insn "vec_dup<mode>"
>    "TARGET_AVX"
>    "@
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}
> -   vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1}
> +   vbroadcast<vecdupssescalarmodesuffix>\t{%1, %0|%0, %1}
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %g0|%g0, %x1}
>     #"
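
[A note on why a separate vecdupssescalarmodesuffix attribute is needed
instead of reusing ssescalarmodesuffix in vec_dup<mode>: the latter now
yields "d"/"q" for the integer vector modes, but alternative 1 of
vec_dup<mode> emits the floating-point broadcast mnemonic, which only
exists as vbroadcastss/vbroadcastsd; there is no vbroadcastd/vbroadcastq
(the integer broadcasts are spelled vpbroadcastd/vpbroadcastq and are
handled by the other alternatives).  The new attribute preserves the old
"ss"/"sd" expansion for this one pattern.]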
>
>         Jakub
