On Tue, 9 Feb 2021, Jakub Jelinek wrote:

> Hi!
> 
> As mentioned in the PR, we don't support V2DImode or V4DImode arithmetic
> right shifts on x86 without -mavx512vl or -mxop.  The ISAs indeed don't have
> {,v}psraq instructions until AVX512VL, but we actually can emulate it quite
> easily.
> One case is arithmetic >> 63: for SSE4.2+ we can just emit {,v}pxor;
> {,v}pcmpgtq, and for SSE2 psrad $31; pshufd $0xf5.
> Then arithmetic >> by constant > 32, that can be done with {,v}psrad $31
> and {,v}psrad $(cst-32) and two operand permutation,
> arithmetic >> 32 can be done as {,v}psrad $31 and permutation of that
> and the original operand.  Arithmetic >> by constant < 32 can be done
> as {,v}psrad $cst and {,v}psrlq $cst and two operand permutation.
> And arithmetic >> by a variable scalar amount can be done as
> arithmetic >> 63, logical >> by the amount, << of the >> 63 result by
> (64 - amount) (note that a vector << 64 results in 0), and oring together.
> 
> I had to improve the permutation generation so that it actually handles
> the needed permutations (or handles them better).
> 
> Richard, does this actually improve the benchmark that regressed?

No, it doesn't improve the benchmark which, even when optimally
vectorized, is slower than scalar.  Note there's more fundamental
SLP support missing to actually get at vectorizing in this case
since on one path we have a "missing" * 1 operation.

> If not, I guess this is GCC 12 material.

Yeah, it does look useful in the end.  Note that you might want
to adjust ix86_add_stmt_cost (or ix86_shift_rotate_cost, that is)
to reflect the complex expansion.

Richard.

> Bootstrapped/regtested on x86_64-linux and i686-linux.
> 
> 2021-02-09  Jakub Jelinek  <ja...@redhat.com>
> 
>       PR tree-optimization/98856
>       * config/i386/i386.c (ix86_shift_rotate_cost): Add CODE argument.
>       Expect V2DI and V4DI arithmetic right shifts to be emulated.
>       (ix86_rtx_costs, ix86_add_stmt_cost): Adjust ix86_shift_rotate_cost
>       callers.
>       * config/i386/i386-expand.c (expand_vec_perm_2perm_interleave,
>       expand_vec_perm_2perm_pblendv): New functions.
>       (ix86_expand_vec_perm_const_1): Use them.
>       * config/i386/sse.md (ashr<mode>3<mask_name>): Rename to ...
>       (<mask_codefor>ashr<mode>3<mask_name>): ... this.
>       (ashr<mode>3): New define_expand with VI248_AVX512BW iterator.
>       (ashrv4di3): New define_expand.
>       (ashrv2di3): Change condition to TARGET_SSE2, handle !TARGET_XOP
>       and !TARGET_AVX512VL expansion.
> 
>       * gcc.target/i386/sse2-psraq-1.c: New test.
>       * gcc.target/i386/sse4_2-psraq-1.c: New test.
>       * gcc.target/i386/avx-psraq-1.c: New test.
>       * gcc.target/i386/avx2-psraq-1.c: New test.
>       * gcc.target/i386/avx-pr82370.c: Adjust expected number of vpsrad
>       instructions.
>       * gcc.target/i386/avx2-pr82370.c: Likewise.
>       * gcc.target/i386/avx512f-pr82370.c: Likewise.
>       * gcc.target/i386/avx512bw-pr82370.c: Likewise.
>       * gcc.dg/torture/vshuf-4.inc: Add two further permutations.
>       * gcc.dg/torture/vshuf-8.inc: Likewise.
> 
> --- gcc/config/i386/i386.c.jj 2021-02-08 19:07:21.869292064 +0100
> +++ gcc/config/i386/i386.c    2021-02-09 10:12:30.934084039 +0100
> @@ -19689,6 +19689,7 @@ ix86_division_cost (const struct process
>  
>  static int
>  ix86_shift_rotate_cost (const struct processor_costs *cost,
> +                     enum rtx_code code,
>                       enum machine_mode mode, bool constant_op1,
>                       HOST_WIDE_INT op1_val,
>                       bool speed,
> @@ -19727,6 +19728,19 @@ ix86_shift_rotate_cost (const struct pro
>           count = 7;
>         return ix86_vec_cost (mode, cost->sse_op * count);
>       }
> +      /* V*DImode arithmetic right shift is emulated.  */
> +      else if (code == ASHIFTRT
> +            && (mode == V2DImode || mode == V4DImode)
> +            && !TARGET_XOP
> +            && !TARGET_AVX512VL)
> +     {
> +       int count = 4;
> +       if (constant_op1 && op1_val == 63 && TARGET_SSE4_2)
> +         count = 2;
> +       else if (constant_op1)
> +         count = 3;
> +       return ix86_vec_cost (mode, cost->sse_op * count);
> +     }
>        else
>       return ix86_vec_cost (mode, cost->sse_op);
>      }
> @@ -19896,13 +19910,15 @@ ix86_rtx_costs (rtx x, machine_mode mode
>      case LSHIFTRT:
>      case ROTATERT:
>        bool skip_op0, skip_op1;
> -      *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
> +      *total = ix86_shift_rotate_cost (cost, code, mode,
> +                                    CONSTANT_P (XEXP (x, 1)),
>                                      CONST_INT_P (XEXP (x, 1))
>                                        ? INTVAL (XEXP (x, 1)) : -1,
>                                      speed,
>                                      GET_CODE (XEXP (x, 1)) == AND,
>                                      SUBREG_P (XEXP (x, 1))
> -                                    && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
> +                                    && GET_CODE (XEXP (XEXP (x, 1),
> +                                                       0)) == AND,
>                                      &skip_op0, &skip_op1);
>        if (skip_op0 || skip_op1)
>       {
> @@ -22335,11 +22351,16 @@ ix86_add_stmt_cost (class vec_info *vinf
>       case LROTATE_EXPR:
>       case RROTATE_EXPR:
>         {
> +         tree op1 = gimple_assign_rhs1 (stmt_info->stmt);
>           tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
>           stmt_cost = ix86_shift_rotate_cost
> -                        (ix86_cost, mode,
> +                        (ix86_cost,
> +                         (subcode == RSHIFT_EXPR
> +                          && !TYPE_UNSIGNED (TREE_TYPE (op1)))
> +                         ? ASHIFTRT : LSHIFTRT, mode,
>                           TREE_CODE (op2) == INTEGER_CST,
> -                         cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
> +                         cst_and_fits_in_hwi (op2)
> +                         ? int_cst_value (op2) : -1,
>                           true, false, false, NULL, NULL);
>         }
>         break;
> --- gcc/config/i386/i386-expand.c.jj  2021-02-08 19:07:21.852292253 +0100
> +++ gcc/config/i386/i386-expand.c     2021-02-09 10:12:30.949083871 +0100
> @@ -18582,6 +18582,242 @@ expand_vec_perm_vperm2f128_vblend (struc
>    return true;
>  }
>  
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
> +   a two vector permutation using two single vector permutations and
> +   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
> +   of dfirst or dsecond is identity permutation.  */
> +
> +static bool
> +expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
> +{
> +  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
> +  struct expand_vec_perm_d dfirst, dsecond, dfinal;
> +  bool ident1 = true, ident2 = true;
> +
> +  if (d->one_operand_p)
> +    return false;
> +
> +  if (GET_MODE_SIZE (d->vmode) == 16)
> +    {
> +      if (!TARGET_SSE)
> +     return false;
> +      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
> +     return false;
> +    }
> +  else if (GET_MODE_SIZE (d->vmode) == 32)
> +    {
> +      if (!TARGET_AVX)
> +     return false;
> +      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
> +     return false;
> +      lane = nelt2;
> +    }
> +  else
> +    return false;
> +
> +  for (i = 1; i < nelt; i++)
> +    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
> +      return false;
> +
> +  dfirst = *d;
> +  dsecond = *d;
> +  dfinal = *d;
> +  dfirst.op1 = dfirst.op0;
> +  dfirst.one_operand_p = true;
> +  dsecond.op0 = dsecond.op1;
> +  dsecond.one_operand_p = true;
> +
> +  for (i = 0; i < nelt; i++)
> +    if (d->perm[i] >= nelt)
> +      {
> +     dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
> +     if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
> +       ident2 = false;
> +     dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
> +       = d->perm[i] - nelt;
> +      }
> +    else
> +      {
> +     dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
> +     if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
> +       ident1 = false;
> +     dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
> +      }
> +
> +  if (two_insn && !ident1 && !ident2)
> +    return false;
> +
> +  if (!d->testing_p)
> +    {
> +      if (!ident1)
> +     dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
> +      if (!ident2)
> +     dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
> +      if (d->perm[0] >= nelt)
> +     std::swap (dfinal.op0, dfinal.op1);
> +    }
> +
> +  bool ok;
> +  rtx_insn *seq1 = NULL, *seq2 = NULL;
> +
> +  if (!ident1)
> +    {
> +      start_sequence ();
> +      ok = expand_vec_perm_1 (&dfirst);
> +      seq1 = get_insns ();
> +      end_sequence ();
> +
> +      if (!ok)
> +     return false;
> +    }
> +
> +  if (!ident2)
> +    {
> +      start_sequence ();
> +      ok = expand_vec_perm_1 (&dsecond);
> +      seq2 = get_insns ();
> +      end_sequence ();
> +
> +      if (!ok)
> +     return false;
> +    }
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  for (i = 0; i < nelt; i++)
> +    {
> +      dfinal.perm[i] = i / 2;
> +      if (i >= lane)
> +     dfinal.perm[i] += lane / 2;
> +      if ((i & 1) != 0)
> +     dfinal.perm[i] += nelt;
> +    }
> +  emit_insn (seq1);
> +  emit_insn (seq2);
> +  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
> +                            dfinal.perm, dfinal.nelt, false);
> +  gcc_assert (ok);
> +  return true;
> +}
> +
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
> +   the permutation using two single vector permutations and the SSE4_1 pblendv
> +   instruction.  If two_insn, succeed only if one of dfirst or dsecond is
> +   identity permutation.  */
> +
> +static bool
> +expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
> +{
> +  unsigned i, nelt = d->nelt;
> +  struct expand_vec_perm_d dfirst, dsecond, dfinal;
> +  machine_mode vmode = d->vmode;
> +  bool ident1 = true, ident2 = true;
> +
> +  /* Use the same checks as in expand_vec_perm_blend.  */
> +  if (d->one_operand_p)
> +    return false;
> +  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
> +    ;
> +  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
> +    ;
> +  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
> +    ;
> +  else
> +    return false;
> +
> +  dfirst = *d;
> +  dsecond = *d;
> +  dfinal = *d;
> +  dfirst.op1 = dfirst.op0;
> +  dfirst.one_operand_p = true;
> +  dsecond.op0 = dsecond.op1;
> +  dsecond.one_operand_p = true;
> +
> +  for (i = 0; i < nelt; ++i)
> +    if (d->perm[i] >= nelt)
> +      {
> +     dfirst.perm[i] = 0xff;
> +     dsecond.perm[i] = d->perm[i] - nelt;
> +     if (d->perm[i] != i + nelt)
> +       ident2 = false;
> +      }
> +    else
> +      {
> +     dsecond.perm[i] = 0xff;
> +     dfirst.perm[i] = d->perm[i];
> +     if (d->perm[i] != i)
> +       ident1 = false;
> +      }
> +
> +  if (two_insn && !ident1 && !ident2)
> +    return false;
> +
> +  /* For now.  Ideally treat 0xff as a wildcard.  */
> +  for (i = 0; i < nelt; ++i)
> +    if (dfirst.perm[i] == 0xff)
> +      {
> +     if (GET_MODE_SIZE (vmode) == 32
> +         && dfirst.perm[i ^ (nelt / 2)] != 0xff)
> +       dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
> +     else
> +       dfirst.perm[i] = i;
> +      }
> +    else
> +      {
> +     if (GET_MODE_SIZE (vmode) == 32
> +         && dsecond.perm[i ^ (nelt / 2)] != 0xff)
> +       dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
> +     else
> +       dsecond.perm[i] = i;
> +      }
> +
> +  if (!d->testing_p)
> +    {
> +      if (!ident1)
> +     dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
> +      if (!ident2)
> +     dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
> +    }
> +
> +  bool ok;
> +  rtx_insn *seq1 = NULL, *seq2 = NULL;
> +
> +  if (!ident1)
> +    {
> +      start_sequence ();
> +      ok = expand_vec_perm_1 (&dfirst);
> +      seq1 = get_insns ();
> +      end_sequence ();
> +
> +      if (!ok)
> +     return false;
> +    }
> +
> +  if (!ident2)
> +    {
> +      start_sequence ();
> +      ok = expand_vec_perm_1 (&dsecond);
> +      seq2 = get_insns ();
> +      end_sequence ();
> +
> +      if (!ok)
> +     return false;
> +    }
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  for (i = 0; i < nelt; ++i)
> +    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
> +
> +  emit_insn (seq1);
> +  emit_insn (seq2);
> +  ok = expand_vec_perm_blend (&dfinal);
> +  gcc_assert (ok);
> +  return true;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
>     permutation using two vperm2f128, followed by a vshufpd insn blending
>     the two vectors together.  */
> @@ -19693,6 +19929,12 @@ ix86_expand_vec_perm_const_1 (struct exp
>    if (expand_vec_perm_pblendv (d))
>      return true;
>  
> +  if (expand_vec_perm_2perm_interleave (d, true))
> +    return true;
> +
> +  if (expand_vec_perm_2perm_pblendv (d, true))
> +    return true;
> +
>    /* Try sequences of three instructions.  */
>  
>    if (expand_vec_perm_even_odd_pack (d))
> @@ -19710,6 +19952,12 @@ ix86_expand_vec_perm_const_1 (struct exp
>    if (expand_vec_perm_vperm2f128_vblend (d))
>      return true;
>  
> +  if (expand_vec_perm_2perm_interleave (d, false))
> +    return true;
> +
> +  if (expand_vec_perm_2perm_pblendv (d, false))
> +    return true;
> +
>    /* Try sequences of four instructions.  */
>  
>    if (expand_vec_perm_even_odd_trunc (d))
> --- gcc/config/i386/sse.md.jj 2021-02-08 19:07:21.871292042 +0100
> +++ gcc/config/i386/sse.md    2021-02-09 11:43:21.487646494 +0100
> @@ -12458,7 +12458,7 @@ (define_insn "ashr<mode>3"
>     (set_attr "prefix" "orig,vex")
>     (set_attr "mode" "<sseinsnmode>")])
>  
> -(define_insn "ashr<mode>3<mask_name>"
> +(define_insn "<mask_codefor>ashr<mode>3<mask_name>"
>    [(set (match_operand:VI248_AVX512BW_AVX512VL 0 "register_operand" "=v,v")
>       (ashiftrt:VI248_AVX512BW_AVX512VL
>         (match_operand:VI248_AVX512BW_AVX512VL 1 "nonimmediate_operand" "v,vm")
> @@ -12472,6 +12472,126 @@ (define_insn "ashr<mode>3<mask_name>"
>         (const_string "0")))
>     (set_attr "mode" "<sseinsnmode>")])
>  
> +(define_expand "ashr<mode>3"
> +  [(set (match_operand:VI248_AVX512BW 0 "register_operand")
> +     (ashiftrt:VI248_AVX512BW
> +       (match_operand:VI248_AVX512BW 1 "nonimmediate_operand")
> +       (match_operand:DI 2 "nonmemory_operand")))]
> +  "TARGET_AVX512F")
> +
> +(define_expand "ashrv4di3"
> +  [(set (match_operand:V4DI 0 "register_operand")
> +     (ashiftrt:V4DI
> +       (match_operand:V4DI 1 "nonimmediate_operand")
> +       (match_operand:DI 2 "nonmemory_operand")))]
> +  "TARGET_AVX2"
> +{
> +  if (!TARGET_AVX512VL)
> +    {
> +      if (CONST_INT_P (operands[2]) && UINTVAL (operands[2]) >= 63)
> +     {
> +       rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode));
> +       emit_insn (gen_avx2_gtv4di3 (operands[0], zero, operands[1]));
> +       DONE;
> +     }
> +      if (operands[2] == const0_rtx)
> +     {
> +       emit_move_insn (operands[0], operands[1]);
> +       DONE;
> +     }
> +      operands[1] = force_reg (V4DImode, operands[1]);
> +      if (CONST_INT_P (operands[2]))
> +     {
> +       vec_perm_builder sel (8, 8, 1);
> +       sel.quick_grow (8);
> +       rtx arg0, arg1;
> +       rtx op1 = lowpart_subreg (V8SImode, operands[1], V4DImode);
> +       rtx target = gen_reg_rtx (V8SImode);
> +       if (INTVAL (operands[2]) > 32)
> +         {
> +           arg0 = gen_reg_rtx (V8SImode);
> +           arg1 = gen_reg_rtx (V8SImode);
> +           emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31)));
> +           emit_insn (gen_ashrv8si3 (arg0, op1,
> +                                     GEN_INT (INTVAL (operands[2]) - 32)));
> +           sel[0] = 1;
> +           sel[1] = 9;
> +           sel[2] = 3;
> +           sel[3] = 11;
> +           sel[4] = 5;
> +           sel[5] = 13;
> +           sel[6] = 7;
> +           sel[7] = 15;
> +         }
> +       else if (INTVAL (operands[2]) == 32)
> +         {
> +           arg0 = op1;
> +           arg1 = gen_reg_rtx (V8SImode);
> +           emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31)));
> +           sel[0] = 1;
> +           sel[1] = 9;
> +           sel[2] = 3;
> +           sel[3] = 11;
> +           sel[4] = 5;
> +           sel[5] = 13;
> +           sel[6] = 7;
> +           sel[7] = 15;
> +         }
> +       else
> +         {
> +           arg0 = gen_reg_rtx (V4DImode);
> +           arg1 = gen_reg_rtx (V8SImode);
> +           emit_insn (gen_lshrv4di3 (arg0, operands[1], operands[2]));
> +           emit_insn (gen_ashrv8si3 (arg1, op1, operands[2]));
> +           arg0 = lowpart_subreg (V8SImode, arg0, V4DImode);
> +           sel[0] = 0;
> +           sel[1] = 9;
> +           sel[2] = 2;
> +           sel[3] = 11;
> +           sel[4] = 4;
> +           sel[5] = 13;
> +           sel[6] = 6;
> +           sel[7] = 15;
> +         }
> +       vec_perm_indices indices (sel, 2, 8);
> +       bool ok = targetm.vectorize.vec_perm_const (V8SImode, target,
> +                                                   arg0, arg1, indices);
> +       gcc_assert (ok);
> +       emit_move_insn (operands[0],
> +                       lowpart_subreg (V4DImode, target, V8SImode));
> +       DONE;
> +     }
> +
> +      rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode));
> +      rtx zero_or_all_ones = gen_reg_rtx (V4DImode);
> +      emit_insn (gen_avx2_gtv4di3 (zero_or_all_ones, zero, operands[1]));
> +      rtx lshr_res = gen_reg_rtx (V4DImode);
> +      emit_insn (gen_lshrv4di3 (lshr_res, operands[1], operands[2]));
> +      rtx ashl_res = gen_reg_rtx (V4DImode);
> +      rtx amount;
> +      if (TARGET_64BIT)
> +     {
> +       amount = gen_reg_rtx (DImode);
> +       emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
> +                              operands[2]));
> +     }
> +      else
> +     {
> +       rtx temp = gen_reg_rtx (SImode);
> +       emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
> +                              lowpart_subreg (SImode, operands[2],
> +                                              DImode)));
> +       amount = gen_reg_rtx (V4SImode);
> +       emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
> +                                     temp));
> +     }
> +      amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
> +      emit_insn (gen_ashlv4di3 (ashl_res, zero_or_all_ones, amount));
> +      emit_insn (gen_iorv4di3 (operands[0], lshr_res, ashl_res));
> +      DONE;
> +    }
> +})
> +
>  (define_insn "<mask_codefor><insn><mode>3<mask_name>"
>    [(set (match_operand:VI248_AVX512BW_2 0 "register_operand" "=v,v")
>       (any_lshift:VI248_AVX512BW_2
> @@ -20313,10 +20433,132 @@ (define_expand "ashrv2di3"
>       (ashiftrt:V2DI
>         (match_operand:V2DI 1 "register_operand")
>         (match_operand:DI 2 "nonmemory_operand")))]
> -  "TARGET_XOP || TARGET_AVX512VL"
> +  "TARGET_SSE2"
>  {
>    if (!TARGET_AVX512VL)
>      {
> +      if (TARGET_SSE4_2
> +       && CONST_INT_P (operands[2])
> +       && UINTVAL (operands[2]) >= 63)
> +     {
> +       rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
> +       emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
> +       DONE;
> +     }
> +      if (operands[2] == const0_rtx)
> +     {
> +       emit_move_insn (operands[0], operands[1]);
> +       DONE;
> +     }
> +      if (CONST_INT_P (operands[2])
> +       && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
> +     {
> +       vec_perm_builder sel (4, 4, 1);
> +       sel.quick_grow (4);
> +       rtx arg0, arg1;
> +       rtx op1 = lowpart_subreg (V4SImode, operands[1], V2DImode);
> +       rtx target = gen_reg_rtx (V4SImode);
> +       if (UINTVAL (operands[2]) >= 63)
> +         {
> +           arg0 = arg1 = gen_reg_rtx (V4SImode);
> +           emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
> +           sel[0] = 1;
> +           sel[1] = 1;
> +           sel[2] = 3;
> +           sel[3] = 3;
> +         }
> +       else if (INTVAL (operands[2]) > 32)
> +         {
> +           arg0 = gen_reg_rtx (V4SImode);
> +           arg1 = gen_reg_rtx (V4SImode);
> +           emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
> +           emit_insn (gen_ashrv4si3 (arg0, op1,
> +                                     GEN_INT (INTVAL (operands[2]) - 32)));
> +           sel[0] = 1;
> +           sel[1] = 5;
> +           sel[2] = 3;
> +           sel[3] = 7;
> +         }
> +       else if (INTVAL (operands[2]) == 32)
> +         {
> +           arg0 = op1;
> +           arg1 = gen_reg_rtx (V4SImode);
> +           emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
> +           sel[0] = 1;
> +           sel[1] = 5;
> +           sel[2] = 3;
> +           sel[3] = 7;
> +         }
> +       else
> +         {
> +           arg0 = gen_reg_rtx (V2DImode);
> +           arg1 = gen_reg_rtx (V4SImode);
> +           emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
> +           emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
> +           arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
> +           sel[0] = 0;
> +           sel[1] = 5;
> +           sel[2] = 2;
> +           sel[3] = 7;
> +         }
> +       vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
> +       bool ok = targetm.vectorize.vec_perm_const (V4SImode, target,
> +                                                   arg0, arg1, indices);
> +       gcc_assert (ok);
> +       emit_move_insn (operands[0],
> +                       lowpart_subreg (V2DImode, target, V4SImode));
> +       DONE;
> +     }
> +      if (!TARGET_XOP)
> +     {
> +       rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
> +       rtx zero_or_all_ones;
> +       if (TARGET_SSE4_2)
> +         {
> +           zero_or_all_ones = gen_reg_rtx (V2DImode);
> +           emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
> +                                          operands[1]));
> +         }
> +       else
> +         {
> +           rtx temp = gen_reg_rtx (V4SImode);
> +           emit_insn (gen_ashrv4si3 (temp, lowpart_subreg (V4SImode,
> +                                                           operands[1],
> +                                                           V2DImode),
> +                                     GEN_INT (31)));
> +           zero_or_all_ones = gen_reg_rtx (V4SImode);
> +           emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
> +                                         const1_rtx, const1_rtx,
> +                                         GEN_INT (3), GEN_INT (3)));
> +           zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
> +                                              V4SImode);
> +         }
> +       rtx lshr_res = gen_reg_rtx (V2DImode);
> +       emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
> +       rtx ashl_res = gen_reg_rtx (V2DImode);
> +       rtx amount;
> +       if (TARGET_64BIT)
> +         {
> +           amount = gen_reg_rtx (DImode);
> +           emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
> +                                  operands[2]));
> +         }
> +       else
> +         {
> +           rtx temp = gen_reg_rtx (SImode);
> +           emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
> +                                  lowpart_subreg (SImode, operands[2],
> +                                                  DImode)));
> +           amount = gen_reg_rtx (V4SImode);
> +           emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
> +                                         temp));
> +         }
> +       amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
> +       emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
> +       emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
> +       DONE;
> +     }
> +
>        rtx reg = gen_reg_rtx (V2DImode);
>        rtx par;
>        bool negate = false;
> --- gcc/testsuite/gcc.target/i386/sse2-psraq-1.c.jj   2021-02-09 10:12:30.968083657 +0100
> +++ gcc/testsuite/gcc.target/i386/sse2-psraq-1.c      2021-02-09 10:12:30.968083657 +0100
> @@ -0,0 +1,53 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -msse2 -mno-sse3" } */
> +/* { dg-require-effective-target sse2 } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "sse2-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST sse2_test
> +#endif
> +
> +#include CHECK_H
> +
> +typedef long long V __attribute__((vector_size (16)));
> +
> +#define TESTN(N) \
> +static V                     \
> +__attribute__((noipa))               \
> +test##N (V x)                        \
> +{                            \
> +  return x >> N;             \
> +}
> +
> +#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18)
> +TESTS
> +
> +struct
> +{
> +  int n;
> +  V (*fn) (V);
> +} tests[] = {
> +#undef TESTN
> +#define TESTN(N) { N, test##N },
> +  TESTS
> +};
> +
> +static void
> +TEST (void)
> +{
> +  V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL };
> +  V b = (V) { 0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL };
> +  int i;
> +  for (i = 0; tests[i].n; i++)
> +    {
> +      V c = tests[i].fn (a);
> +      if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n)
> +     abort ();
> +      c = tests[i].fn (b);
> +      if (c[0] != b[0] >> tests[i].n || c[1] != b[1] >> tests[i].n)
> +     abort ();
> +    }
> +}
> --- gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c.jj 2021-02-09 10:12:30.968083657 +0100
> +++ gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c    2021-02-09 10:12:30.968083657 +0100
> @@ -0,0 +1,13 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -msse4.2 -mno-avx" } */
> +/* { dg-require-effective-target sse4 } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "sse4_2-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST sse4_2_test
> +#endif
> +
> +#include "sse2-psraq-1.c"
> --- gcc/testsuite/gcc.target/i386/avx-psraq-1.c.jj    2021-02-09 10:12:30.968083657 +0100
> +++ gcc/testsuite/gcc.target/i386/avx-psraq-1.c       2021-02-09 10:12:30.968083657 +0100
> @@ -0,0 +1,13 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -mno-avx2" } */
> +/* { dg-require-effective-target avx } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "avx-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST avx_test
> +#endif
> +
> +#include "sse2-psraq-1.c"
> --- gcc/testsuite/gcc.target/i386/avx2-psraq-1.c.jj   2021-02-09 10:12:30.968083657 +0100
> +++ gcc/testsuite/gcc.target/i386/avx2-psraq-1.c      2021-02-09 10:12:30.968083657 +0100
> @@ -0,0 +1,51 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "avx2-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST avx2_test
> +#endif
> +
> +#include CHECK_H
> +
> +typedef long long V __attribute__((vector_size (32)));
> +
> +#define TESTN(N) \
> +static V                     \
> +__attribute__((noipa))               \
> +test##N (V x)                        \
> +{                            \
> +  return x >> N;             \
> +}
> +
> +#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18)
> +TESTS
> +
> +struct
> +{
> +  int n;
> +  V (*fn) (V);
> +} tests[] = {
> +#undef TESTN
> +#define TESTN(N) { N, test##N },
> +  TESTS
> +};
> +
> +static void
> +TEST (void)
> +{
> +  V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL,
> +           0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL };
> +  int i;
> +  for (i = 0; tests[i].n; i++)
> +    {
> +      V c = tests[i].fn (a);
> +      if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n
> +       || c[2] != a[2] >> tests[i].n || c[3] != a[3] >> tests[i].n)
> +     abort ();
> +    }
> +}
> --- gcc/testsuite/gcc.target/i386/avx-pr82370.c.jj    2020-01-12 11:54:37.872391397 +0100
> +++ gcc/testsuite/gcc.target/i386/avx-pr82370.c       2021-02-09 11:47:34.274794698 +0100
> @@ -4,7 +4,7 @@
>  /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
>  /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
>  /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
> -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
> +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 6 } } */
>  /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
>  /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
>  /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
> --- gcc/testsuite/gcc.target/i386/avx2-pr82370.c.jj   2020-01-12 11:54:37.877391321 +0100
> +++ gcc/testsuite/gcc.target/i386/avx2-pr82370.c      2021-02-09 11:52:00.563790584 +0100
> @@ -4,7 +4,7 @@
>  /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
> -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
> +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
>  /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
>  /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
> @@ -13,7 +13,7 @@
>  /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
> -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
> +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
>  /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */
>  /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
> --- gcc/testsuite/gcc.target/i386/avx512f-pr82370.c.jj        2020-01-12 
> 11:54:37.901390959 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr82370.c   2021-02-09 
> 11:53:09.626011462 +0100
> @@ -4,7 +4,7 @@
>  /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
> -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
> +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 2 } } */
>  /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 0 } } */
>  /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
> @@ -13,7 +13,7 @@
>  /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 3 } } */
> -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
> +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
>  /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 0 } } */
>  /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 3 } } */
>  /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
> --- gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c.jj       2020-01-12 11:54:37.887391170 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512bw-pr82370.c  2021-02-09 11:52:34.963402512 +0100
> @@ -4,7 +4,7 @@
>  /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
> -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
> +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
>  /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 0 } } */
>  /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, 
> %xmm\[0-9]\+" 1 } } */
> @@ -13,7 +13,7 @@
>  /* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
> -/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
> +/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
>  /* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 0 } } */
>  /* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, 
> %ymm\[0-9]\+" 1 } } */
> --- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj       2021-02-08 19:07:21.906291652 +0100
> +++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc  2021-02-09 10:12:30.982083499 +0100
> @@ -25,7 +25,9 @@ T (21,      2, 6, 3, 7) \
>  T (22,       1, 2, 3, 0) \
>  T (23,       2, 1, 0, 3) \
>  T (24,       2, 5, 6, 3) \
> -T (25,       0, 1, 4, 5)
> +T (25,       0, 1, 4, 5) \
> +T (26,       1, 5, 3, 7) \
> +T (27,       0, 5, 2, 7)
>  #define EXPTESTS \
>  T (116,      1, 2, 4, 3) \
>  T (117,      7, 3, 3, 0) \
> --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj       2021-02-08 19:07:21.907291641 +0100
> +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc  2021-02-09 10:12:30.989083421 +0100
> @@ -27,7 +27,9 @@ T (23,      6, 5, 4, 3, 2, 1, 0, 7) \
>  T (24,       0, 1, 2, 3, 8, 9, 10, 11) \
>  T (25,       0, 1, 2, 3, 12, 13, 14, 15) \
>  T (26,       0, 1, 8, 9, 10, 11, 12, 13) \
> -T (27,       0, 8, 9, 10, 11, 12, 13, 14)
> +T (27,       0, 8, 9, 10, 11, 12, 13, 14) \
> +T (28,       1, 9, 3, 11, 5, 13, 7, 15) \
> +T (29,       0, 9, 2, 11, 4, 13, 6, 15)
>  #define EXPTESTS \
>  T (116,      9, 3, 9, 4, 7, 0, 0, 6) \
>  T (117,      4, 14, 12, 8, 9, 6, 0, 10) \
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

Reply via email to