On Fri, Jun 16, 2023 at 4:12 AM liuhongt <hongtao....@intel.com> wrote: > > packuswb/packusdw does unsigned saturation for signed source, but rtl > us_truncate means does unsigned saturation for unsigned source. > So for value -1, packuswb will produce 0, but us_truncate produces > 255. The patch reimplement those related patterns and functions with > UNSPEC_US_TRUNCATE instead of us_truncate. > > The patch will fix below testcase which failed after > g:921b841350c4fc298d09f6c5674663e0f4208610 added constant-folding for > US_TRUNCATE > > FAIL: gcc.target/i386/avx-vpackuswb-1.c execution test > FAIL: gcc.target/i386/avx2-vpackusdw-2.c execution test > FAIL: gcc.target/i386/avx2-vpackuswb-2.c execution test > FAIL: gcc.target/i386/sse2-packuswb-1.c execution test > > Bootstrapped and regtested on x86_64-pc-linux-gnu. > Ok for trunk?
Please proofread the ChangeLog entries and comments and fix confusion with truncation / saturation in comments. OK with the above change. Thanks, Uros. > > gcc/ChangeLog: > > PR target/110235 > * config/i386/i386-expand.cc (ix86_split_mmx_pack): Use > UNSPEC_US_TRUNCATE instead of original us_truncate for > packusdw/packuswb. > * config/i386/mmx.md (mmx_pack<s_trunsuffix>swb): Splitted to > below 2 new patterns. Just say: ...: Substitute with ... > (mmx_packsswb): New reload_completed define_insn_and_split. ...: ... this and ... > (mmx_packuswb): Ditto. ...: ... this. > (mmx_packusdw): Use UNSPEC_US_TRUNCATE instead of original > us_truncate. > (s_trunsuffix): Removed. ...: Remove code iterator. > (any_s_truncate): Removed. ...: Ditto. > * config/i386/sse.md (<sse2_avx2>_packuswb<mask_name>): Use > UNSPEC_US_TRUNCATE instead of original us_truncate. > (<sse4_1_avx2>_packusdw<mask_name>): Ditto. > * config/i386/i386.md (UNSPEC_US_TRUNCATE): New unspec_c_enum. > --- > gcc/config/i386/i386-expand.cc | 20 ++++++++++++---- > gcc/config/i386/i386.md | 4 ++++ > gcc/config/i386/mmx.md | 43 ++++++++++++++++++++++------------ > gcc/config/i386/sse.md | 20 ++++++++-------- > 4 files changed, 57 insertions(+), 30 deletions(-) > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index def060ab562..35e2740f9b6 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -1019,6 +1019,7 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code) > rtx op0 = operands[0]; > rtx op1 = operands[1]; > rtx op2 = operands[2]; > + rtx src; > > machine_mode dmode = GET_MODE (op0); > machine_mode smode = GET_MODE (op1); > @@ -1042,11 +1043,20 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code > code) > op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1)); > op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2)); > > - op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1); > - op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2); > - rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode, > - op1, op2)); > - emit_insn (insn); > + /* For packusdw/packuswb, it does unsigned saturation for > + signed source which is different for rtl US_TRUNCATE. */ paskusdw/packuswb does unsigned saturation of a signed source which is different from generic us_truncate RTX. > + if (code == US_TRUNCATE) > + src = gen_rtx_UNSPEC (sse_dmode, > + gen_rtvec (2, op1, op2), > + UNSPEC_US_TRUNCATE); > + else > + { > + op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1); > + op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2); > + src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2); > + } > + > + emit_move_insn (dest, src); > > ix86_move_vector_high_sse_to_mmx (op0); > } > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 0929115ed4d..070a84d8af9 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -129,6 +129,10 @@ (define_c_enum "unspec" [ > UNSPEC_RSQRT > UNSPEC_PSADBW > > + ;; US_TRUNCATE this is different from rtl us_truncate, > + ;; it does unsigned truncation for signed source. Different from generic us_truncate RTX as it does unsigned saturation of signed source. > + UNSPEC_US_TRUNCATE > + > ;; For AVX/AVX512F support > UNSPEC_SCALEF > UNSPEC_PCMP > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 6fbe3909c8b..315eb4193c4 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -3337,27 +3337,41 @@ (define_split > ;; > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > -;; Used in signed and unsigned truncations with saturation. > -(define_code_iterator any_s_truncate [ss_truncate us_truncate]) > -;; Instruction suffix for truncations with saturation. > -(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")]) > - > -(define_insn_and_split "mmx_pack<s_trunsuffix>swb" > +(define_insn_and_split "mmx_packsswb" > [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw") > (vec_concat:V8QI > - (any_s_truncate:V4QI > + (ss_truncate:V4QI > (match_operand:V4HI 1 "register_operand" "0,0,Yw")) > - (any_s_truncate:V4QI > + (ss_truncate:V4QI > (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw"))))] > "TARGET_MMX || TARGET_MMX_WITH_SSE" > "@ > - pack<s_trunsuffix>swb\t{%2, %0|%0, %2} > + packsswb\t{%2, %0|%0, %2} > + # > + #" > + "&& reload_completed > + && SSE_REGNO_P (REGNO (operands[0]))" > + [(const_int 0)] > + "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;" > + [(set_attr "mmx_isa" "native,sse_noavx,avx") > + (set_attr "type" "mmxshft,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > + This instruction does unsigned saturation of signed source and is different from generic us_truncate RTX. > +(define_insn_and_split "mmx_packuswb" > + [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw") > + (unspec:V8QI > + [(match_operand:V4HI 1 "register_operand" "0,0,Yw") > + (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw")] > + UNSPEC_US_TRUNCATE))] > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + packuswb\t{%2, %0|%0, %2} > # > #" > "&& reload_completed > && SSE_REGNO_P (REGNO (operands[0]))" > [(const_int 0)] > - "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>); DONE;" > + "ix86_split_mmx_pack (operands, US_TRUNCATE); DONE;" > [(set_attr "mmx_isa" "native,sse_noavx,avx") > (set_attr "type" "mmxshft,sselog,sselog") > (set_attr "mode" "DI,TI,TI")]) > @@ -3384,11 +3398,10 @@ (define_insn_and_split "mmx_packssdw" > > (define_insn_and_split "mmx_packusdw" > [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw") > - (vec_concat:V4HI > - (us_truncate:V2HI > - (match_operand:V2SI 1 "register_operand" "0,0,Yw")) > - (us_truncate:V2HI > - (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw"))))] > + (unspec:V4HI > + [(match_operand:V2SI 1 "register_operand" "0,0,Yw") > + (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw")] > + UNSPEC_US_TRUNCATE))] > "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" > "#" > "&& reload_completed" > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 7d4b4ec8df5..83e3f534fd2 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -17796,13 +17796,14 @@ (define_insn "<sse2_avx2>_packssdw<mask_name>" > (set_attr "prefix" "orig,<mask_prefix>") > (set_attr "mode" "<sseinsnmode>")]) > > +;; This is different from rtl unsigned saturation, the instruction does > +;; unsigned saturation for signed value. This instruction does unsigned saturation of signed source and is different from generic us_truncate RTX. > (define_insn "<sse2_avx2>_packuswb<mask_name>" > [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,<v_Yw>") > - (vec_concat:VI1_AVX512 > - (us_truncate:<ssehalfvecmode> > - (match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>")) > - (us_truncate:<ssehalfvecmode> > - (match_operand:<sseunpackmode> 2 "vector_operand" > "xBm,<v_Yw>m"))))] > + (unspec:VI1_AVX512 > + [(match_operand:<sseunpackmode> 1 "register_operand" "0,<v_Yw>") > + (match_operand:<sseunpackmode> 2 "vector_operand" "xBm,<v_Yw>m")] > + UNSPEC_US_TRUNCATE))] > "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" > "@ > packuswb\t{%2, %0|%0, %2} > @@ -21889,11 +21890,10 @@ (define_insn "<sse4_1_avx2>_mpsadbw" > > (define_insn "<sse4_1_avx2>_packusdw<mask_name>" > [(set (match_operand:VI2_AVX2 0 "register_operand" "=Yr,*x,<v_Yw>") > - (vec_concat:VI2_AVX2 > - (us_truncate:<ssehalfvecmode> > - (match_operand:<sseunpackmode> 1 "register_operand" "0,0,<v_Yw>")) > - (us_truncate:<ssehalfvecmode> > - (match_operand:<sseunpackmode> 2 "vector_operand" > "YrBm,*xBm,<v_Yw>m"))))] > + (unspec:VI2_AVX2 > + [(match_operand:<sseunpackmode> 1 "register_operand" "0,0,<v_Yw>") > + (match_operand:<sseunpackmode> 2 "vector_operand" > "YrBm,*xBm,<v_Yw>m")] > + UNSPEC_US_TRUNCATE))] > "TARGET_SSE4_1 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" > "@ > packusdw\t{%2, %0|%0, %2} > -- > 2.39.1.388.g2fc9e9ca3c >