Sudakshina Das <sudi....@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 00b5f8438863bb52c348cfafd5d4db478fe248a7..bcb654809c9662db0f51fc1368e37e42969efd29 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1024,16 +1024,18 @@ typedef struct
>  #define MOVE_RATIO(speed) \
>    (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
>  
> -/* For CLEAR_RATIO, when optimizing for size, give a better estimate
> -   of the length of a memset call, but use the default otherwise.  */
> +/* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
> +   we would use more than 3 scalar instructions.
> +   Otherwise follow a sensible default: when optimizing for size, give a better
> +   estimate of the length of a memset call, but use the default otherwise.  */
>  #define CLEAR_RATIO(speed) \
> -  ((speed) ? 15 : AARCH64_CALL_RATIO)
> +  (!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
>  
>  /* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant, so when
>    optimizing for size adjust the ratio to account for the overhead of loading
>    the constant.  */
>  #define SET_RATIO(speed) \
> -  ((speed) ? 15 : AARCH64_CALL_RATIO - 2)
> +  (!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)

Think it would help to adjust the SET_RATIO comment too, otherwise it's
not obvious why its !STRICT_ALIGNMENT value is 0.
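
E.g. something along these lines (just a sketch, to be reworded as you
see fit):

  /* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant.
     Without -mstrict-align, make decisions in "setmem".  Otherwise,
     when optimizing for size, adjust the ratio to account for the
     overhead of loading the constant.  */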

>  
>  /* Disable auto-increment in move_by_pieces et al.  Use of auto-increment is
>     rarely a good idea in straight-line code since it adds an extra address
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index a8cc545c37044345c3f1d3bf09151c8a9578a032..16ac0c076adcc82627af43473a938e78d3a7ecdc 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -7058,6 +7058,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
>      case E_V4SImode:
>        return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
>  
> +    case E_V16QImode:
> +      return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
> +
>      default:
>        gcc_unreachable ();
>      }
> @@ -21373,6 +21376,134 @@ aarch64_expand_cpymem (rtx *operands)
>    return true;
>  }
>  
> +/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
> +   *src is a register we have created with the duplicated value to be set.  */

AIUI, *SRC doesn't accumulate across calls in the way that it does for
aarch64_copy_one_block_and_progress_pointers, so it might be better to
pass an rtx rather than an “rtx *”.
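
I.e. something like:

  static void
  aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
					      machine_mode mode)

with the body then using SRC directly instead of *SRC.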

> +static void
> +aarch64_set_one_block_and_progress_pointer (rtx *src, rtx *dst,
> +                                         machine_mode mode)
> +{
> +  /* If we are copying 128bits or 256bits, we can do that straight from
> +      the SIMD register we prepared.  */

Nit: excess space before “the”.

> +  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> +    {
> +      mode =  GET_MODE (*src);

Excess space before “GET_MODE”.

> +      /* "Cast" the *dst to the correct mode.  */
> +      *dst = adjust_address (*dst, mode, 0);
> +      /* Emit the memset.  */
> +      emit_insn (aarch64_gen_store_pair (mode, *dst, *src,
> +                                      aarch64_progress_pointer (*dst), *src));
> +
> +      /* Move the pointers forward.  */
> +      *dst = aarch64_move_pointer (*dst, 32);
> +      return;
> +    }
> +  else if (known_eq (GET_MODE_BITSIZE (mode), 128))

Nit: more usual in GCC not to have an “else” after an early return.

> +    {
> +      /* "Cast" the *dst to the correct mode.  */
> +      *dst = adjust_address (*dst, GET_MODE (*src), 0);
> +      /* Emit the memset.  */
> +      emit_move_insn (*dst, *src);
> +      /* Move the pointers forward.  */
> +      *dst = aarch64_move_pointer (*dst, 16);
> +      return;
> +    }
> +  /* For copying less, we have to extract the right amount from *src.  */
> +  machine_mode vq_mode = aarch64_vq_mode (GET_MODE_INNER(mode)).require ();

Nit: should be a space before “(mode)”.

> +  *src = convert_to_mode (vq_mode, *src, 0);
> +  rtx reg = simplify_gen_subreg (mode, *src, vq_mode, 0);

I was surprised that this needed a two-step conversion.  Does a direct
subreg of the original V16QI src blow up for some modes?  If so, it might
be worth a comment.

Even if we need two steps, it would be good if the first one was also
a subreg.  convert_to_mode is normally for arithmetic conversions rather
than bitcasts.

I think we need to use lowpart_subreg instead of simplify_gen_subreg
in order to get the right SUBREG_BYTE for big-endian.  It would be
good to have some big-endian tests just to make sure. :-)
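
I.e. perhaps something like (untested sketch):

  rtx vq_src = lowpart_subreg (vq_mode, *src, V16QImode);
  rtx reg = lowpart_subreg (mode, vq_src, vq_mode);

so that both steps are plain bitcasts and the lowpart is selected
correctly on big-endian.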

> +
> +  /* "Cast" the *dst to the correct mode.  */
> +  *dst = adjust_address (*dst, mode, 0);
> +  /* Emit the memset.  */
> +  emit_move_insn (*dst, reg);
> +  /* Move the pointer forward.  */
> +  *dst = aarch64_progress_pointer (*dst);
> +}
> +
> +/* Expand setmem, as if from a __builtin_memset.  Return true if
> +   we succeed, otherwise return false.  */
> +
> +bool
> +aarch64_expand_setmem (rtx *operands)
> +{
> +  int n, mode_bits;
> +  unsigned HOST_WIDE_INT len;
> +  rtx dst = operands[0];
> +  rtx val = operands[2], src;
> +  rtx base;
> +  machine_mode cur_mode = BLKmode, next_mode;
> +  bool speed_p = !optimize_function_for_size_p (cfun);
> +  unsigned max_set_size = speed_p ? 256 : 128;

What's the basis for the size value?  AIUI (and I've probably got
this wrong), that effectively means a worst case of 3+2 stores
(3 STP Qs and 2 mop-up stores).  Then we need one instruction
to set up the constant.  So if that's right, it looks like the
worst-case size is 6 instructions.

AARCH64_CALL_RATIO has a value of 8, but I'm not sure how that
relates to the number of instructions in a call.  I guess the
best case is 4 (3 instructions for the parameters and one for
the call itself).

> +
> +  /* We can't do anything smart if the amount to copy is not constant.  */
> +  if (!CONST_INT_P (operands[1]))
> +    return false;
> +
> +  len = INTVAL (operands[1]);
> +
> +  /* Upper bound check.  */
> +  if (len > max_set_size)
> +    return false;
> +
> +  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> +  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> +
> +  /* Prepare the val using a DUP v0.16B, val.  */
> +  if (CONST_INT_P (val))
> +    {
> +      val = force_reg (QImode, val);
> +    }
> +  src = gen_reg_rtx (V16QImode);
> +  emit_insn (gen_aarch64_simd_dupv16qi(src, val));

I think we should use:

  src = expand_vector_broadcast (V16QImode, val);

here (without the CONST_INT_P check), so that for constants we just move
a constant directly into a register.
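
AFAIK expand_vector_broadcast returns a CONST_VECTOR duplicate when the
element is a valid vector-constant element and only falls back to the
vec_duplicate pattern otherwise, so the CONST_INT_P special case
shouldn't be needed.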

> +
> +  /* Convert len to bits to make the rest of the code simpler.  */
> +  n = len * BITS_PER_UNIT;
> +
> +  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> +     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  setmem expand
> +     pattern is only turned on for TARGET_SIMD.  */
> +  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
> +                        & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +                      ? GET_MODE_BITSIZE (TImode) : 256;
> +

Perhaps we should override this for !speed, since I guess the limits are
based on using STP Q in that case.  There again, we don't do that for the
memcpy code, so it's just a suggestion.
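
I.e., if that reasoning is right, something like (untested):

  const int copy_limit = ((speed_p
			   && (aarch64_tune_params.extra_tuning_flags
			       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
			  ? GET_MODE_BITSIZE (TImode) : 256);

so that the tuning flag only limits the speed case.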

> +  while (n > 0)
> +    {
> +      /* Find the largest mode in which to do the copy in without over reading
> +	 or writing.  */

Over-reading isn't a problem here, just over-writing.

> +      opt_scalar_int_mode mode_iter;
> +      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> +     if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +       cur_mode = mode_iter.require ();
> +
> +      gcc_assert (cur_mode != BLKmode);
> +
> +      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> +      aarch64_set_one_block_and_progress_pointer (&src, &dst, cur_mode);
> +
> +      n -= mode_bits;
> +
> +      /* Do certain trailing copies as overlapping if it's going to be
> +      cheaper.  i.e. less instructions to do so.  For instance doing a 15
> +      byte copy it's more efficient to do two overlapping 8 byte copies than
> +      8 + 6 + 1.  */

Realise this just comes from the existing code, but how would we do
a 6-byte copy?  Does it just mean 8 + 4 + 2 + 1?

> +      if (n > 0 && n < copy_limit / 2)
> +     {
> +       next_mode = smallest_mode_for_size (n, MODE_INT);
> +       /* Last 1-byte causes the compiler to optimize to STRB when it should
> +          use STR Bx, [mem] since we already used SIMD registers.
> +          So force it to HImode.  */
> +       if (next_mode == QImode)
> +         next_mode = HImode;

Is this always better?  E.g. for variable inputs and zero it seems
quite natural to store the original scalar GPR.

If we do do this, I think we should assert before the loop that n > 1.

Also, it would be good to cover this case in the tests.
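
E.g. something like (hypothetical testcases, with the size chosen so
that a 1-byte remainder is left over):

  /* { dg-do compile } */
  /* { dg-options "-O2" } */

  void
  set129 (char *p, int c)
  {
    __builtin_memset (p, c, 129);
  }

  void
  zero129 (char *p)
  {
    __builtin_memset (p, 0, 129);
  }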

Thanks,
Richard
