packusdw with UNSPEC_US_TRUNCATE instead of original us_truncate.

Uros Bizjak via Gcc-patches Fri, 16 Jun 2023 00:33:11 -0700

On Fri, Jun 16, 2023 at 4:12 AM liuhongt <hongtao....@intel.com> wrote:
>
> packuswb/packusdw does unsigned saturation for signed source, but rtl
> us_truncate means does unsigned saturation for unsigned source.
> So for value -1, packuswb will produce 0, but us_truncate produces
> 255. The patch reimplement those related patterns and functions with
> UNSPEC_US_TRUNCATE instead of us_truncate.
>
> The patch will fix below testcase which failed after
> g:921b841350c4fc298d09f6c5674663e0f4208610 added constant-folding for 
> US_TRUNCATE
>
> FAIL: gcc.target/i386/avx-vpackuswb-1.c execution test
> FAIL: gcc.target/i386/avx2-vpackusdw-2.c execution test
> FAIL: gcc.target/i386/avx2-vpackuswb-2.c execution test
> FAIL: gcc.target/i386/sse2-packuswb-1.c execution test
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu.
> Ok for trunk?


Please proofread the ChangeLog entries and comments and fix confusion
with truncation / saturation in comments.

OK with the above change.

Thanks,
Uros.

>
> gcc/ChangeLog:
>
>         PR target/110235
>         * config/i386/i386-expand.cc (ix86_split_mmx_pack): Use
>         UNSPEC_US_TRUNCATE instead of original us_truncate for
>         packusdw/packuswb.
>         * config/i386/mmx.md (mmx_pack<s_trunsuffix>swb): Splitted to
>         below 2 new patterns.

Just say: ...: Substitute with ...

>         (mmx_packsswb): New reload_completed define_insn_and_split.

...: ... this and ...
>         (mmx_packuswb): Ditto.
...: ... this.

>         (mmx_packusdw): Use UNSPEC_US_TRUNCATE instead of original
>         us_truncate.
>         (s_trunsuffix): Removed.

...: Remove code iterator.

>         (any_s_truncate): Removed.

...: Ditto.

>         * config/i386/sse.md (<sse2_avx2>_packuswb<mask_name>): Use
>         UNSPEC_US_TRUNCATE instead of original us_truncate.
>         (<sse4_1_avx2>_packusdw<mask_name>): Ditto.
>         * config/i386/i386.md (UNSPEC_US_TRUNCATE): New unspec_c_enum.
> ---
>  gcc/config/i386/i386-expand.cc | 20 ++++++++++++----
>  gcc/config/i386/i386.md        |  4 ++++
>  gcc/config/i386/mmx.md         | 43 ++++++++++++++++++++++------------
>  gcc/config/i386/sse.md         | 20 ++++++++--------
>  4 files changed, 57 insertions(+), 30 deletions(-)
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index def060ab562..35e2740f9b6 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -1019,6 +1019,7 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
>    rtx op0 = operands[0];
>    rtx op1 = operands[1];
>    rtx op2 = operands[2];
> +  rtx src;
>
>    machine_mode dmode = GET_MODE (op0);
>    machine_mode smode = GET_MODE (op1);
> @@ -1042,11 +1043,20 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code 
> code)
>    op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
>    op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
>
> -  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
> -  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
> -  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
> -                                                   op1, op2));
> -  emit_insn (insn);
> +  /* For packusdw/packuswb, it does unsigned saturation for
> +     signed source which is different for rtl US_TRUNCATE.  */

paskusdw/packuswb does unsigned saturation of a signed source which is
different from generic us_truncate RTX.

> +  if (code == US_TRUNCATE)
> +    src = gen_rtx_UNSPEC (sse_dmode,
> +                         gen_rtvec (2, op1, op2),
> +                         UNSPEC_US_TRUNCATE);
> +  else
> +    {
> +      op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
> +      op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
> +      src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
> +    }
> +
> +  emit_move_insn (dest, src);
>
>    ix86_move_vector_high_sse_to_mmx (op0);
>  }
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 0929115ed4d..070a84d8af9 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -129,6 +129,10 @@ (define_c_enum "unspec" [
>    UNSPEC_RSQRT
>    UNSPEC_PSADBW
>
> +  ;; US_TRUNCATE this is different from rtl us_truncate,
> +  ;; it does unsigned truncation for signed source.

Different from generic us_truncate RTX as it does unsigned saturation
of signed source.

> +  UNSPEC_US_TRUNCATE
> +
>    ;; For AVX/AVX512F support
>    UNSPEC_SCALEF
>    UNSPEC_PCMP
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 6fbe3909c8b..315eb4193c4 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -3337,27 +3337,41 @@ (define_split
>  ;;
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>
> -;; Used in signed and unsigned truncations with saturation.
> -(define_code_iterator any_s_truncate [ss_truncate us_truncate])
> -;; Instruction suffix for truncations with saturation.
> -(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")])
> -
> -(define_insn_and_split "mmx_pack<s_trunsuffix>swb"
> +(define_insn_and_split "mmx_packsswb"
>    [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
>         (vec_concat:V8QI
> -         (any_s_truncate:V4QI
> +         (ss_truncate:V4QI
>             (match_operand:V4HI 1 "register_operand" "0,0,Yw"))
> -         (any_s_truncate:V4QI
> +         (ss_truncate:V4QI
>             (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw"))))]
>    "TARGET_MMX || TARGET_MMX_WITH_SSE"
>    "@
> -   pack<s_trunsuffix>swb\t{%2, %0|%0, %2}
> +   packsswb\t{%2, %0|%0, %2}
> +   #
> +   #"
> +  "&& reload_completed
> +   && SSE_REGNO_P (REGNO (operands[0]))"
> +  [(const_int 0)]
> +  "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;"
> +  [(set_attr "mmx_isa" "native,sse_noavx,avx")
> +   (set_attr "type" "mmxshft,sselog,sselog")
> +   (set_attr "mode" "DI,TI,TI")])
> +

This instruction does unsigned saturation of signed source and is
different from generic us_truncate RTX.

> +(define_insn_and_split "mmx_packuswb"
> +  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
> +       (unspec:V8QI
> +         [(match_operand:V4HI 1 "register_operand" "0,0,Yw")
> +          (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw")]
> +         UNSPEC_US_TRUNCATE))]
> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"
> +  "@
> +   packuswb\t{%2, %0|%0, %2}
>     #
>     #"
>    "&& reload_completed
>     && SSE_REGNO_P (REGNO (operands[0]))"
>    [(const_int 0)]
> -  "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>); DONE;"
> +  "ix86_split_mmx_pack (operands, US_TRUNCATE); DONE;"
>    [(set_attr "mmx_isa" "native,sse_noavx,avx")
>     (set_attr "type" "mmxshft,sselog,sselog")
>     (set_attr "mode" "DI,TI,TI")])
> @@ -3384,11 +3398,10 @@ (define_insn_and_split "mmx_packssdw"
>
>  (define_insn_and_split "mmx_packusdw"
>    [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw")
> -       (vec_concat:V4HI
> -         (us_truncate:V2HI
> -           (match_operand:V2SI 1 "register_operand" "0,0,Yw"))
> -         (us_truncate:V2HI
> -           (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw"))))]
> +       (unspec:V4HI
> +         [(match_operand:V2SI 1 "register_operand" "0,0,Yw")
> +          (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw")]
> +          UNSPEC_US_TRUNCATE))]
>    "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
>    "#"
>    "&& reload_completed"
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 7d4b4ec8df5..83e3f534fd2 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17796,13 +17796,14 @@ (define_insn "<sse2_avx2>_packssdw<mask_name>"
>     (set_attr "prefix" "orig,<mask_prefix>")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +;; This is different from rtl unsigned saturation, the instruction does
> +;; unsigned saturation for signed value.

This instruction does unsigned saturation of signed source and is
different from generic us_truncate RTX.

>  (define_insn "<sse2_avx2>_packuswb<mask_name>"
>    [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,<v_Yw>")
> -       (vec_concat:VI1_AVX512
> -         (us_truncate:<ssehalfvecmode>
> -           (match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>"))
> -         (us_truncate:<ssehalfvecmode>
> -           (match_operand:<sseunpackmode> 2 "vector_operand" 
> "xBm,<v_Yw>m"))))]
> +       (unspec:VI1_AVX512
> +         [(match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>")
> +          (match_operand:<sseunpackmode> 2 "vector_operand" "xBm,<v_Yw>m")]
> +         UNSPEC_US_TRUNCATE))]
>    "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
>    "@
>     packuswb\t{%2, %0|%0, %2}
> @@ -21889,11 +21890,10 @@ (define_insn "<sse4_1_avx2>_mpsadbw"
>
>  (define_insn "<sse4_1_avx2>_packusdw<mask_name>"
>    [(set (match_operand:VI2_AVX2 0 "register_operand" "=Yr,*x,<v_Yw>")
> -       (vec_concat:VI2_AVX2
> -         (us_truncate:<ssehalfvecmode>
> -           (match_operand:<sseunpackmode> 1 "register_operand" "0,0,<v_Yw>"))
> -         (us_truncate:<ssehalfvecmode>
> -           (match_operand:<sseunpackmode> 2 "vector_operand" 
> "YrBm,*xBm,<v_Yw>m"))))]
> +       (unspec:VI2_AVX2
> +         [(match_operand:<sseunpackmode> 1 "register_operand" "0,0,<v_Yw>")
> +          (match_operand:<sseunpackmode> 2 "vector_operand" 
> "YrBm,*xBm,<v_Yw>m")]
> +          UNSPEC_US_TRUNCATE))]
>    "TARGET_SSE4_1 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
>    "@
>     packusdw\t{%2, %0|%0, %2}
> --
> 2.39.1.388.g2fc9e9ca3c
>

Re: [PATCH 1/2] Reimplement packuswb/packusdw with UNSPEC_US_TRUNCATE instead of original us_truncate.

Reply via email to