This patch to the i386 backend's ix86_expand_v1ti_ashiftrt provides
improved (shorter) implementations of V1TI mode arithmetic right shifts
for constant amounts between 111 and 126 bits.  The significance of
this range is that this functionality is useful for (eventually)
providing sign extension from HImode and QImode to V1TImode.

For example, x>>112 (to sign extend a 16-bit value), was previously
generated as a four operation sequence:

        movdqa  %xmm0, %xmm1            // word    7 6 5 4 3 2 1 0
        psrad   $31, %xmm0              // V8HI = [S,S,?,?,?,?,?,?]
        psrad   $16, %xmm1              // V8HI = [S,X,?,?,?,?,?,?]
        punpckhqdq      %xmm0, %xmm1    // V8HI = [S,S,?,?,S,X,?,?]
        pshufd  $253, %xmm1, %xmm0      // V8HI = [S,S,S,S,S,S,S,X]

with this patch, we now generates a three operation sequence:

        psrad   $16, %xmm0              // V8HI = [S,X,?,?,?,?,?,?]
        pshufhw $254, %xmm0, %xmm0      // V8HI = [S,S,S,X,?,?,?,?]
        pshufd  $254, %xmm0, %xmm0      // V8HI = [S,S,S,S,S,S,S,X]

The correctness of generated code is confirmed by the existing
run-time test gcc.target/i386/sse2-v1ti-ashiftrt-1.c in the testsuite.
This idiom is safe to use for shifts by 127, but that case gets handled
by a two operation sequence earlier in this function.


This patch has been tested on x86_64-pc-linux-gnu with a make bootstrap
and make -k check with no new failures.  OK for mainline?


2022-01-11  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386-expand.c (ix86_expand_v1ti_ashiftrt): Provide
        new three operation implementations for shifts by 111..126 bits.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 95bba25..3138b5f 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -6470,12 +6470,12 @@ ix86_expand_v1ti_ashiftrt (rtx operands[])
   if (bits == 96)
     {
       /* Three operations.  */
-      rtx tmp3 = gen_reg_rtx (V2DImode);
       rtx tmp1 = gen_reg_rtx (V4SImode);
       rtx tmp2 = gen_reg_rtx (V4SImode);
       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
 
+      rtx tmp3 = gen_reg_rtx (V2DImode);
       rtx tmp4 = gen_reg_rtx (V2DImode);
       rtx tmp5 = gen_reg_rtx (V2DImode);
       emit_move_insn (tmp3, gen_lowpart (V2DImode, tmp1));
@@ -6493,6 +6493,30 @@ ix86_expand_v1ti_ashiftrt (rtx operands[])
       return;
     }
 
+  if (bits >= 111)
+    {
+      /* Three operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
+
+      rtx tmp3 = gen_reg_rtx (V8HImode);
+      rtx tmp4 = gen_reg_rtx (V8HImode);
+      emit_move_insn (tmp3, gen_lowpart (V8HImode, tmp2));
+      emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
+
+      rtx tmp5 = gen_reg_rtx (V4SImode);
+      rtx tmp6 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp5, gen_lowpart (V4SImode, tmp4));
+      emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
+
+      rtx tmp7 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp6));
+      emit_move_insn (operands[0], tmp7);
+      return;
+    }
+
   if (TARGET_AVX2 || TARGET_SSE4_1)
     {
       /* Three operations.  */

Reply via email to