On Tue, Aug 26, 2025 at 6:40 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The vgf2p8affineqb_<mode><mask_name> pattern uses the "register_operand"
> predicate for its first input operand, so using "general_operand"
> for the rotate operand passed to it leads to ICEs, and so does
> the "nonimmediate_operand" predicate in the <insn>v16qi3 define_expand.
> The following patch fixes it by using "register_operand" in the former
> case (that pattern is TARGET_GFNI only) and using force_reg in
> the latter case (the pattern is TARGET_XOP || TARGET_GFNI and for XOP
> we can handle a MEM operand).
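>
> For reference, the trigger is exactly what the new test below does: a
> constant byte rotate on a vector that lives in memory, e.g.
>
>   __attribute__((__vector_size__(64))) unsigned char v;
>
>   void
>   foo (void)
>   {
>     /* Combined into a rotate of v by 7; v is a global, so the rotate
>        expander sees a MEM operand and hands it to a pattern whose
>        predicate requires a REG.  */
>     v = (v << 7) | (v >> 1);
>   }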
>
> The rest of the changes are small formatting tweaks or use of const0_rtx
> instead of GEN_INT (0).
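>
> (GEN_INT (0) returns the shared const0_rtx object anyway, so that part
> is purely cosmetic.)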
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Ok.
>
> 2025-08-25  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/121658
>         * config/i386/sse.md (<insn><mode>3 any_shift): Use const0_rtx
>         instead of GEN_INT (0).
>         (cond_<insn><mode> any_shift): Likewise.  Formatting fix.
>         (<insn><mode>3 any_rotate): Use register_operand predicate instead of
>         general_operand for match_operand 1.  Use const0_rtx instead of
>         GEN_INT (0).
>         (<insn>v16qi3 any_rotate): Use force_reg on operands[1].  Formatting
>         fix.
>         * config/i386/i386.cc (ix86_shift_rotate_cost): Comment formatting
>         fixes.
>
>         * gcc.target/i386/pr121658.c: New test.
>
> --- gcc/config/i386/sse.md.jj   2025-08-25 10:52:21.365762677 +0200
> +++ gcc/config/i386/sse.md      2025-08-25 17:16:29.956103443 +0200
> @@ -26994,7 +26994,7 @@ (define_expand "<insn><mode>3"
>        rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
>                                                    <CODE>);
>        emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
> -                                           GEN_INT (0)));
> +                                           const0_rtx));
>      }
>    else
>      ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
> @@ -27014,20 +27014,21 @@ (define_expand "cond_<insn><mode>"
>  {
>    rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
>    emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[1], matrix,
> -               GEN_INT (0), operands[4], operands[1]));
> +                                            const0_rtx, operands[4],
> +                                            operands[1]));
>    DONE;
>  })
>
>  (define_expand "<insn><mode>3"
>    [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
>         (any_rotate:VI1_AVX512_3264
> -         (match_operand:VI1_AVX512_3264 1 "general_operand")
> +         (match_operand:VI1_AVX512_3264 1 "register_operand")
>           (match_operand:SI 2 "const_int_operand")))]
>    "TARGET_GFNI"
>  {
>    rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
>    emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
> -             GEN_INT (0)));
> +             const0_rtx));
>    DONE;
>  })
>
> @@ -27073,8 +27074,9 @@ (define_expand "<insn>v16qi3"
>    else if (TARGET_GFNI && CONST_INT_P (operands[2]))
>      {
>       rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
> -      emit_insn (gen_vgf2p8affineqb_v16qi (operands[0], operands[1], matrix,
> -                 GEN_INT (0)));
> +      emit_insn (gen_vgf2p8affineqb_v16qi (operands[0],
> +                                          force_reg (V16QImode, operands[1]),
> +                                          matrix, const0_rtx));
>        DONE;
>      }
>    else
> --- gcc/config/i386/i386.cc.jj  2025-08-25 16:23:03.782043558 +0200
> +++ gcc/config/i386/i386.cc     2025-08-25 17:18:11.323759376 +0200
> @@ -22104,9 +22104,9 @@ ix86_shift_rotate_cost (const struct pro
>         case V32QImode:
>           if (TARGET_GFNI && constant_op1)
>             {
> -             /* Use vgf2p8affine. One extra load for the mask, but in a loop
> -                with enough registers it will be moved out. So for now don't
> -                account the constant mask load. This is not quite right
> +             /* Use vgf2p8affine.  One extra load for the mask, but in a loop
> +                with enough registers it will be moved out.  So for now don't
> +                account the constant mask load.  This is not quite right
>                  for non loop vectorization.  */
>               extra = 0;
>               return ix86_vec_cost (mode, cost->sse_op) + extra;
> --- gcc/testsuite/gcc.target/i386/pr121658.c.jj 2025-08-25 17:43:20.773614433 +0200
> +++ gcc/testsuite/gcc.target/i386/pr121658.c    2025-08-25 17:40:32.226871078 +0200
> @@ -0,0 +1,11 @@
> +/* PR target/121658 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f -mgfni" } */
> +
> +__attribute__((__vector_size__(64))) unsigned char v;
> +
> +void
> +foo (void)
> +{
> +  v = (v << 7) | (v >> 1);
> +}
>
>         Jakub
>


-- 
BR,
Hongtao
