This patch is pretty huge; are there any opportunities to split it further to aid review?
I have some comments in line.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index a069427f576f6bd7336bbe4497249773bd33d138..2ab2d96e40e80a79b5648046ca2d6e202d3939a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -147,6 +147,8 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>                                                           const_tree type,
>                                                           int misalignment,
>                                                           bool is_packed);
> +static machine_mode
> +aarch64_simd_container_mode (machine_mode mode, unsigned width);
>
>  /* Major revision number of the ARM Architecture implemented by the target.  */
>  unsigned aarch64_architecture_version;
> @@ -4613,6 +4615,66 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
>    return true;
>  }
>
> +/* Return the binary representation of floating point constant VALUE in INTVAL.
> +   If the value cannot be converted, return false without setting INTVAL.
> +   The conversion is done in the given MODE.  */
> +bool
> +aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
> +{
> +  machine_mode mode = GET_MODE (value);
> +  if (GET_CODE (value) != CONST_DOUBLE
> +      || !SCALAR_FLOAT_MODE_P (mode)
> +      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ival = 0;
> +
> +  /* Only support up to DF mode.  */
> +  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
> +  int needed = GET_MODE_BITSIZE (mode) == 64 ? 2 : 1;
> +
> +  long res[2];
> +  real_to_target (res,
> +                  CONST_DOUBLE_REAL_VALUE (value),
> +                  REAL_MODE_FORMAT (mode));
> +
> +  ival = zext_hwi (res[needed - 1], 32);
> +  for (int i = needed - 2; i >= 0; i--)
> +    {
> +      ival <<= 32;
> +      ival |= zext_hwi (res[i], 32);
> +    }
> +
> +  *intval = ival;

Two cases here: needed is 2 if GET_MODE_BITSIZE (mode) == 64, or 1
otherwise, so i starts at either -1 or 0 and this for loop runs either
zero times or once.  What am I missing?  I'm sure this is all an indirect
way of writing:

  *intval = 0;
  if (GET_MODE_BITSIZE (mode) == 64)
    *intval = zext_hwi (res[1], 32) << 32;
  *intval |= zext_hwi (res[0], 32);

> +  return true;
> +}
> +
> +/* Return TRUE if rtx X is an immediate constant that can be moved using a
> +   single MOV(+MOVK) followed by an FMOV.  */
> +bool
> +aarch64_float_const_rtx_p (rtx x)
> +{
> +  machine_mode mode = GET_MODE (x);
> +  if (mode == VOIDmode)
> +    return false;
> +
> +  /* Determine whether it's cheaper to write float constants as
> +     mov/movk pairs over ldr/adrp pairs.  */
> +  unsigned HOST_WIDE_INT ival;
> +
> +  if (GET_CODE (x) == CONST_DOUBLE
> +      && SCALAR_FLOAT_MODE_P (mode)
> +      && aarch64_reinterpret_float_as_int (x, &ival))
> +    {
> +      machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> +      int num_instr = aarch64_internal_mov_immediate
> +                        (NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +      return num_instr < 3;

Should this cost model be based on a static magic number?  Is it not the
case that the decision should be based on the relative speeds of a memory
access compared with mov/movk/fmov?
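Completely untested, but the rough shape I have in mind is something like
the sketch below, keyed off the per-core tuning tables.  The loadd/loadf
costs are the ones this patch already uses in aarch64_rtx_costs; modeling
the mov/movk/fmov sequence as plain single-issue instructions is a
simplification, since we have no dedicated tuning fields for those:

  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;

  /* NUM_INSTR integer moves, plus the final FMOV into the FP register.  */
  int seq_cost = COSTS_N_INSNS (num_instr + 1);

  /* ADRP plus a load from the literal pool.  */
  int load_cost = COSTS_N_INSNS (2)
                  + (mode == DFmode ? extra_cost->ldst.loadd
                                    : extra_cost->ldst.loadf);

  return seq_cost < load_cost;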
> +    }
> +
> +  return false;
> +}
> +
>  /* Return TRUE if rtx X is immediate constant 0.0 */
>  bool
>  aarch64_float_const_zero_rtx_p (rtx x)
> @@ -4625,6 +4687,46 @@ aarch64_float_const_zero_rtx_p (rtx x)
>    return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
>  }
>
> +/* Return TRUE if rtx X is immediate constant that fits in a single
> +   MOVI immediate operation.  */
> +bool
> +aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
> +{
> +  if (!TARGET_SIMD)
> +    return false;
> +
> +  machine_mode vmode, imode;
> +  unsigned HOST_WIDE_INT ival;
> +
> +  /* Don't write float constants out to memory.  */
> +  if (GET_CODE (x) == CONST_DOUBLE
> +      && SCALAR_FLOAT_MODE_P (mode))
> +    {
> +      if (!aarch64_reinterpret_float_as_int (x, &ival))
> +        return false;
> +
> +      imode = int_mode_for_mode (mode);
> +    }
> +  else if (GET_CODE (x) == CONST_INT
> +           && SCALAR_INT_MODE_P (mode))
> +    {
> +      imode = mode;
> +      ival = INTVAL (x);
> +    }
> +  else
> +    return false;
> +
> +  unsigned width = GET_MODE_BITSIZE (mode) * 2;

Why * 2?  It isn't obvious to me, from my understanding of movi, why that
would be better than just clamping to 64 bits.
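To make the question concrete, here is my reading of what the doubling
selects compared with a straight clamp to 64 (please correct me if I have
misread aarch64_simd_container_mode):

  mode  imode  *2 then clamp  container   clamp to 64  container
  HF    HI     64             V4HI        64           V4HI
  SF    SI     64             V2SI        64           V2SI
  DF    DI     128            V2DI        64           none (word_mode)

So as far as I can see the doubling only changes the DFmode case, where
there is no 64-bit vector container for DImode.  If that is the intent,
a comment saying so would help.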
> +  if (width < GET_MODE_BITSIZE (DFmode))
> +    width = GET_MODE_BITSIZE (DFmode);
> +
> +  vmode = aarch64_simd_container_mode (imode, width);
> +  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
> +
> +  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
> +}
> +
> +
>  /* Return the fixed registers used for condition codes.  */
>
>  static bool
> @@ -5758,12 +5860,6 @@ aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
>        return NO_REGS;
>      }
>
> -  /* If it's an integer immediate that MOVI can't handle, then
> -     FP_REGS is not an option, so we return NO_REGS instead.  */
> -  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> -      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> -    return NO_REGS;
> -
>    /* Register eliminiation can result in a request for
>       SP+constant->FP_REGS.  We cannot support such operations which
>       use SP as source and an FP_REG as destination, so reject out
> @@ -6674,26 +6770,44 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>        return true;
>
>      case CONST_DOUBLE:
> +
> +      /* First determine number of instructions to do the move
> +         as an integer constant.  */
> +      if (!aarch64_float_const_representable_p (x)
> +          && !aarch64_can_const_movi_rtx_p (x, mode)
> +          && aarch64_float_const_rtx_p (x))
> +        {
> +          unsigned HOST_WIDE_INT ival;
> +          bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
> +          gcc_assert (succeed);

Just:

  gcc_assert (aarch64_reinterpret_float_as_int (x, &ival));

There's not much extra information in the name "succeed", so no extra
value in the variable assignment.

> +
> +          machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> +          int ncost = aarch64_internal_mov_immediate
> +                        (NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +          *cost += COSTS_N_INSNS (ncost);
> +          return true;
> +        }
> +
>        if (speed)
>          {
> -          /* mov[df,sf]_aarch64.  */
> -          if (aarch64_float_const_representable_p (x))
> -            /* FMOV (scalar immediate).  */
> -            *cost += extra_cost->fp[mode == DFmode].fpconst;
> -          else if (!aarch64_float_const_zero_rtx_p (x))
> -            {
> -              /* This will be a load from memory.  */
> -              if (mode == DFmode)
> +          /* mov[df,sf]_aarch64.  */
> +          if (aarch64_float_const_representable_p (x))
> +            /* FMOV (scalar immediate).  */
> +            *cost += extra_cost->fp[mode == DFmode].fpconst;
> +          else if (!aarch64_float_const_zero_rtx_p (x))
> +            {
> +              /* This will be a load from memory.  */
> +              if (mode == DFmode)
>                  *cost += extra_cost->ldst.loadd;
> -              else
> +              else
>                  *cost += extra_cost->ldst.loadf;
> -            }
> -          else
> -            /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> -               or MOV v0.s[0], wzr - neither of which are modeled by the
> -               cost tables.  Just use the default cost.  */
> -            {
> -            }
> +            }
> +          else
> +            /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> +               or MOV v0.s[0], wzr - neither of which are modeled by the
> +               cost tables.  Just use the default cost.  */
> +            {
> +            }
>          }
>
>        return true;
> @@ -12599,15 +12698,28 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
>  }
>
>  char*
> -aarch64_output_scalar_simd_mov_immediate (rtx immediate,
> -                                          machine_mode mode)
> +aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
>  {
> +
> +  /* If a floating point number was passed and we desire to use it in an
> +     integer mode do the conversion to integer.  */
> +  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
> +    {
> +      unsigned HOST_WIDE_INT ival;
> +      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
> +        gcc_unreachable ();
> +      immediate = gen_int_mode (ival, mode);
> +    }
> +
>    machine_mode vmode;
> +  int width = GET_MODE_BITSIZE (mode) * 2;

Dubious * 2 again!

> +  if (width < 64)
> +    width = 64;
>
>    gcc_assert (!VECTOR_MODE_P (mode));
> -  vmode = aarch64_simd_container_mode (mode, 64);
> +  vmode = aarch64_simd_container_mode (mode, width);
>    rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
> -  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
> +  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
>  }
>
>  /* Split operands into moves from op[1] + op[2] into op[0].  */

Thanks,
James