Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> Hi Richard,
>
>> The idea was that, if we did the split during expand, the movsf/df
>> define_insns would then only accept the immediates that their
>> constraints can handle.
>
> Right, always disallowing these immediates works fine too (it seems
> reload doesn't require all immediates to be valid), and then the split is
> redundant. I've updated the patch:
>
>
> v2: split during expand, remove movsf/df splitter
>
> The IRA combine_and_move pass runs if the scheduler is disabled and
> aggressively combines moves.  The movsf/df patterns allow all FP
> immediates since they rely on a split pattern.  However, splits do not
> happen during IRA, so the result is extra literal loads.  To avoid this,
> split early during expand and block creation of FP immediates that need
> this split.
>
> double f(void) { return 128.0; }
>
> -O2 -fno-schedule-insns gives:
>
>       adrp    x0, .LC0
>       ldr     d0, [x0, #:lo12:.LC0]
>       ret
>
> After patch:
>
>       mov     x0, 4638707616191610880
>       fmov    d0, x0
>       ret
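>
> (For reference, 4638707616191610880 is 0x4060000000000000, the IEEE-754
> bit pattern of 128.0.  A minimal C sketch of the reinterpretation, just
> to illustrate the technique, not part of the patch:)
>
>   #include <stdint.h>
>   #include <string.h>
>
>   /* Copy the bits of a double into a 64-bit integer with no value
>      conversion; roughly what aarch64_reinterpret_float_as_int
>      computes for a CONST_DOUBLE.  */
>   static uint64_t
>   double_bits (double d)
>   {
>     uint64_t i;
>     memcpy (&i, &d, sizeof i);
>     return i;
>   }
>
>   /* double_bits (128.0) == 0x4060000000000000 == 4638707616191610880,
>      hence the mov/fmov pair above.  */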
>
> Passes bootstrap & regress, OK for commit?
>
> gcc/ChangeLog:
>         * config/aarch64/aarch64.md (movhf_aarch64): Use
>         aarch64_valid_fp_move.
>         (movsf_aarch64): Likewise.
>         (movdf_aarch64): Likewise.
>         * config/aarch64/aarch64.cc (aarch64_valid_fp_move): New function.
>         * config/aarch64/aarch64-protos.h (aarch64_valid_fp_move): Likewise.

Thanks, this mostly looks good, but...

> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 05d3258abf7b43342d9058dd1b365a1a0870cdc2..6da81556110c978a9de6f6fad5775c9d77771b10 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -843,6 +843,7 @@ bool aarch64_advsimd_struct_mode_p (machine_mode mode);
>  opt_machine_mode aarch64_vq_mode (scalar_mode);
>  opt_machine_mode aarch64_full_sve_mode (scalar_mode);
>  bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
> +bool aarch64_valid_fp_move (rtx, rtx, machine_mode);
>  bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
>  bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
>                                           HOST_WIDE_INT);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 00bcf18ae97cea94227c00798b7951daa255d360..ec2d391d1e3eb9bd28f66fb6ee85311b4ced4c94 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -11175,6 +11175,36 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
>    return aarch64_simd_valid_mov_imm (v_op);
>  }
>  
> +/* Return TRUE if moving SRC to DST with mode MODE is a valid fp move.  */
> +bool
> +aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
> +{
> +  if (!TARGET_FLOAT)
> +    return false;
> +
> +  if (aarch64_reg_or_fp_zero (src, mode))
> +    return true;
> +
> +  if (!register_operand (dst, mode))
> +    return false;
> +
> +  if (MEM_P (src))
> +    return true;
> +
> +  if (!DECIMAL_FLOAT_MODE_P (mode))
> +    {
> +      if (aarch64_can_const_movi_rtx_p (src, mode)
> +       || aarch64_float_const_representable_p (src)
> +       || aarch64_float_const_zero_rtx_p (src))
> +     return true;
> +
> +      /* Block FP immediates which are split during expand.  */
> +      if (aarch64_float_const_rtx_p (src))
> +     return false;
> +    }
> +
> +  return can_create_pseudo_p ();

...I still think we should avoid testing can_create_pseudo_p.
Does it work with the last part replaced by:

  if (!DECIMAL_FLOAT_MODE_P (mode))
    {
      if (aarch64_can_const_movi_rtx_p (src, mode)
          || aarch64_float_const_representable_p (src)
          || aarch64_float_const_zero_rtx_p (src))
        return true;
    }

  return false;

?  If so, the patch is OK with that change from my POV, but please wait
till Thursday morning for others' opinions.
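
For clarity, the predicate with that change applied would read in full
(just the two fragments above spliced together, untested):

  bool
  aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
  {
    if (!TARGET_FLOAT)
      return false;

    if (aarch64_reg_or_fp_zero (src, mode))
      return true;

    if (!register_operand (dst, mode))
      return false;

    if (MEM_P (src))
      return true;

    /* Accept only immediates the constraints can handle; FP immediates
       that needed the old split are now rejected outright.  */
    if (!DECIMAL_FLOAT_MODE_P (mode))
      {
        if (aarch64_can_const_movi_rtx_p (src, mode)
            || aarch64_float_const_representable_p (src)
            || aarch64_float_const_zero_rtx_p (src))
          return true;
      }

    return false;
  }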

Richard


> +}
>  
>  /* Return the fixed registers used for condition codes.  */
>  
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 8d10197c9e8dd66b7a30a1034b629297b9992661..b865ae2ff5e23edc4d8990e1efd4a51dd195f41f 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1636,14 +1636,33 @@ (define_expand "mov<mode>"
>          && ! (GET_CODE (operands[1]) == CONST_DOUBLE
>             && aarch64_float_const_zero_rtx_p (operands[1])))
>        operands[1] = force_reg (<MODE>mode, operands[1]);
> +
> +    if (!DECIMAL_FLOAT_MODE_P (<MODE>mode)
> +     && GET_CODE (operands[1]) == CONST_DOUBLE
> +     && can_create_pseudo_p ()
> +     && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
> +     && !aarch64_float_const_representable_p (operands[1])
> +     && !aarch64_float_const_zero_rtx_p (operands[1])
> +     &&  aarch64_float_const_rtx_p (operands[1]))
> +      {
> +     unsigned HOST_WIDE_INT ival;
> +     bool res = aarch64_reinterpret_float_as_int (operands[1], &ival);
> +     gcc_assert (res);
> +
> +     machine_mode intmode
> +       = int_mode_for_size (GET_MODE_BITSIZE (<MODE>mode), 0).require ();
> +     rtx tmp = gen_reg_rtx (intmode);
> +     emit_move_insn (tmp, gen_int_mode (ival, intmode));
> +     emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
> +     DONE;
> +      }
>    }
>  )
>  
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:HFBF 0 "nonimmediate_operand")
>       (match_operand:HFBF 1 "general_operand"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> -    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> +  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
>    {@ [ cons: =0 , 1   ; attrs: type , arch  ]
>       [ w        , Y   ; neon_move   , simd  ] movi\t%0.4h, #0
>       [ w        , ?rY ; f_mcr       , fp16  ] fmov\t%h0, %w1
> @@ -1666,8 +1685,7 @@ (define_insn "*mov<mode>_aarch64"
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:SFD 0 "nonimmediate_operand")
>       (match_operand:SFD 1 "general_operand"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> -    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> +  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
>    {@ [ cons: =0 , 1   ; attrs: type , arch  ]
>       [ w        , Y   ; neon_move   , simd  ] movi\t%0.2s, #0
>       [ w        , ?rY ; f_mcr       , *     ] fmov\t%s0, %w1
> @@ -1687,8 +1705,7 @@ (define_insn "*mov<mode>_aarch64"
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:DFD 0 "nonimmediate_operand")
>       (match_operand:DFD 1 "general_operand"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> -    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> +  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
>    {@ [ cons: =0 , 1   ; attrs: type , arch  ]
>       [ w        , Y   ; neon_move   , simd  ] movi\t%d0, #0
>       [ w        , ?rY ; f_mcr       , *     ] fmov\t%d0, %x1
> @@ -1705,27 +1722,6 @@ (define_insn "*mov<mode>_aarch64"
>    }
>  )
>  
> -(define_split
> -  [(set (match_operand:GPF_HF 0 "nonimmediate_operand")
> -     (match_operand:GPF_HF 1 "const_double_operand"))]
> -  "can_create_pseudo_p ()
> -   && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
> -   && !aarch64_float_const_representable_p (operands[1])
> -   && !aarch64_float_const_zero_rtx_p (operands[1])
> -   &&  aarch64_float_const_rtx_p (operands[1])"
> -  [(const_int 0)]
> -  {
> -    unsigned HOST_WIDE_INT ival;
> -    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
> -      FAIL;
> -
> -    rtx tmp = gen_reg_rtx (<FCVT_TARGET>mode);
> -    emit_move_insn (tmp, gen_int_mode (ival, <FCVT_TARGET>mode));
> -    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
> -    DONE;
> -  }
> -)
> -
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:TFD 0
>        "nonimmediate_operand" "=w,w,?r ,w ,?r,w,?w,w,m,?r,m ,m")
