This patch is pretty huge; are there any opportunities to split it further to aid review?
I have some comments in line.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index a069427f576f6bd7336bbe4497249773bd33d138..2ab2d96e40e80a79b5648046ca2d6e202d3939a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -147,6 +147,8 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>                                                           const_tree type,
>                                                           int misalignment,
>                                                           bool is_packed);
> +static machine_mode
> +aarch64_simd_container_mode (machine_mode mode, unsigned width);
>
>  /* Major revision number of the ARM Architecture implemented by the target.  */
>  unsigned aarch64_architecture_version;
> @@ -4613,6 +4615,66 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
>    return true;
>  }
>
> +/* Return the binary representation of floating point constant VALUE in INTVAL.
> +   If the value cannot be converted, return false without setting INTVAL.
> +   The conversion is done in the given MODE.  */
> +bool
> +aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
> +{
> +  machine_mode mode = GET_MODE (value);
> +  if (GET_CODE (value) != CONST_DOUBLE
> +      || !SCALAR_FLOAT_MODE_P (mode)
> +      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ival = 0;
> +
> +  /* Only support up to DF mode.  */
> +  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
> +  int needed = GET_MODE_BITSIZE (mode) == 64 ? 2 : 1;
> +
> +  long res[2];
> +  real_to_target (res,
> +                  CONST_DOUBLE_REAL_VALUE (value),
> +                  REAL_MODE_FORMAT (mode));
> +
> +  ival = zext_hwi (res[needed - 1], 32);
> +  for (int i = needed - 2; i >= 0; i--)
> +    {
> +      ival <<= 32;
> +      ival |= zext_hwi (res[i], 32);
> +    }
> +
> +  *intval = ival;

Two cases here: needed is 2 if GET_MODE_BITSIZE (mode) == 64, or 1
otherwise, so i starts at either -1 or 0 and this for loop runs either
zero times or once.  What am I missing?  I'm sure this is all an indirect
way of writing:

  *intval = 0;
  if (GET_MODE_BITSIZE (mode) == 64)
    *intval = zext_hwi (res[1], 32) << 32;
  *intval |= zext_hwi (res[0], 32);

> +  return true;
> +}
> +
> +/* Return TRUE if rtx X is an immediate constant that can be moved using a
> +   single MOV(+MOVK) followed by an FMOV.  */
> +bool
> +aarch64_float_const_rtx_p (rtx x)
> +{
> +  machine_mode mode = GET_MODE (x);
> +  if (mode == VOIDmode)
> +    return false;
> +
> +  /* Determine whether it's cheaper to write float constants as
> +     mov/movk pairs over ldr/adrp pairs.  */
> +  unsigned HOST_WIDE_INT ival;
> +
> +  if (GET_CODE (x) == CONST_DOUBLE
> +      && SCALAR_FLOAT_MODE_P (mode)
> +      && aarch64_reinterpret_float_as_int (x, &ival))
> +    {
> +      machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> +      int num_instr = aarch64_internal_mov_immediate
> +                        (NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +      return num_instr < 3;

Should this cost model be based on a static magic number?  Is it not the
case that the decision should be based on the relative speeds of a memory
access compared with mov/movk/fmov?
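Completely untested, but the rough shape I have in mind is something like
the sketch below, keyed off the per-core tuning tables.  The loadd/loadf
costs are the ones this patch already uses in aarch64_rtx_costs; modeling
the mov/movk/fmov sequence as plain single-issue instructions is a
simplification, since we have no dedicated tuning fields for those:

  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;

  /* NUM_INSTR integer moves, plus the final FMOV into the FP register.  */
  int seq_cost = COSTS_N_INSNS (num_instr + 1);

  /* ADRP plus a load from the literal pool.  */
  int load_cost = COSTS_N_INSNS (2)
                  + (mode == DFmode ? extra_cost->ldst.loadd
                                    : extra_cost->ldst.loadf);

  return seq_cost < load_cost;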
> +    }
> +
> +  return false;
> +}
> +
>  /* Return TRUE if rtx X is immediate constant 0.0 */
>  bool
>  aarch64_float_const_zero_rtx_p (rtx x)
> @@ -4625,6 +4687,46 @@ aarch64_float_const_zero_rtx_p (rtx x)
>    return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
>  }
>
> +/* Return TRUE if rtx X is immediate constant that fits in a single
> +   MOVI immediate operation.  */
> +bool
> +aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
> +{
> +  if (!TARGET_SIMD)
> +    return false;
> +
> +  machine_mode vmode, imode;
> +  unsigned HOST_WIDE_INT ival;
> +
> +  /* Don't write float constants out to memory.  */
> +  if (GET_CODE (x) == CONST_DOUBLE
> +      && SCALAR_FLOAT_MODE_P (mode))
> +    {
> +      if (!aarch64_reinterpret_float_as_int (x, &ival))
> +        return false;
> +
> +      imode = int_mode_for_mode (mode);
> +    }
> +  else if (GET_CODE (x) == CONST_INT
> +           && SCALAR_INT_MODE_P (mode))
> +    {
> +      imode = mode;
> +      ival = INTVAL (x);
> +    }
> +  else
> +    return false;
> +
> +  unsigned width = GET_MODE_BITSIZE (mode) * 2;

Why * 2?  It isn't obvious to me, from my understanding of movi, why that
would be better than just clamping to 64 bits.
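To make the question concrete, here is my reading of what the doubling
selects compared with a straight clamp to 64 (please correct me if I have
misread aarch64_simd_container_mode):

  mode  imode  *2 then clamp  container   clamp to 64  container
  HF    HI     64             V4HI        64           V4HI
  SF    SI     64             V2SI        64           V2SI
  DF    DI     128            V2DI        64           none (word_mode)

So as far as I can see the doubling only changes the DFmode case, where
there is no 64-bit vector container for DImode.  If that is the intent,
a comment saying so would help.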
> +  if (width < GET_MODE_BITSIZE (DFmode))
> +    width = GET_MODE_BITSIZE (DFmode);
> +
> +  vmode = aarch64_simd_container_mode (imode, width);
> +  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
> +
> +  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
> +}
> +
> +
>  /* Return the fixed registers used for condition codes.  */
>
>  static bool
> @@ -5758,12 +5860,6 @@ aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
>        return NO_REGS;
>      }
>
> -  /* If it's an integer immediate that MOVI can't handle, then
> -     FP_REGS is not an option, so we return NO_REGS instead.  */
> -  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> -      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> -    return NO_REGS;
> -
>    /* Register eliminiation can result in a request for
>       SP+constant->FP_REGS.  We cannot support such operations which
>       use SP as source and an FP_REG as destination, so reject out
> @@ -6674,26 +6770,44 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>        return true;
>
>      case CONST_DOUBLE:
> +
> +      /* First determine number of instructions to do the move
> +         as an integer constant.  */
> +      if (!aarch64_float_const_representable_p (x)
> +          && !aarch64_can_const_movi_rtx_p (x, mode)
> +          && aarch64_float_const_rtx_p (x))
> +        {
> +          unsigned HOST_WIDE_INT ival;
> +          bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
> +          gcc_assert (succeed);

Just:

  gcc_assert (aarch64_reinterpret_float_as_int (x, &ival));

There's not much extra information in the name "succeed", so no extra
value in the variable assignment.

> +
> +          machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> +          int ncost = aarch64_internal_mov_immediate
> +                        (NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +          *cost += COSTS_N_INSNS (ncost);
> +          return true;
> +        }
> +
>        if (speed)
>          {
> -          /* mov[df,sf]_aarch64.  */
> -          if (aarch64_float_const_representable_p (x))
> -            /* FMOV (scalar immediate).  */
> -            *cost += extra_cost->fp[mode == DFmode].fpconst;
> -          else if (!aarch64_float_const_zero_rtx_p (x))
> -            {
> -              /* This will be a load from memory.  */
> -              if (mode == DFmode)
> +          /* mov[df,sf]_aarch64.  */
> +          if (aarch64_float_const_representable_p (x))
> +            /* FMOV (scalar immediate).  */
> +            *cost += extra_cost->fp[mode == DFmode].fpconst;
> +          else if (!aarch64_float_const_zero_rtx_p (x))
> +            {
> +              /* This will be a load from memory.  */
> +              if (mode == DFmode)
>                  *cost += extra_cost->ldst.loadd;
> -              else
> +              else
>                  *cost += extra_cost->ldst.loadf;
> -            }
> -          else
> -            /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> -               or MOV v0.s[0], wzr - neither of which are modeled by the
> -               cost tables.  Just use the default cost.  */
> -            {
> -            }
> +            }
> +          else
> +            /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> +               or MOV v0.s[0], wzr - neither of which are modeled by the
> +               cost tables.  Just use the default cost.  */
> +            {
> +            }
>          }
>
>        return true;
> @@ -12599,15 +12698,28 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
>  }
>
>  char*
> -aarch64_output_scalar_simd_mov_immediate (rtx immediate,
> -                                          machine_mode mode)
> +aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
>  {
> +
> +  /* If a floating point number was passed and we desire to use it in an
> +     integer mode do the conversion to integer.  */
> +  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
> +    {
> +      unsigned HOST_WIDE_INT ival;
> +      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
> +        gcc_unreachable ();
> +      immediate = gen_int_mode (ival, mode);
> +    }
> +
>    machine_mode vmode;
> +  int width = GET_MODE_BITSIZE (mode) * 2;

Dubious * 2 again!

> +  if (width < 64)
> +    width = 64;
>
>    gcc_assert (!VECTOR_MODE_P (mode));
> -  vmode = aarch64_simd_container_mode (mode, 64);
> +  vmode = aarch64_simd_container_mode (mode, width);
>    rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
> -  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
> +  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
>  }
>
>  /* Split operands into moves from op[1] + op[2] into op[0].  */

Thanks,
James