Hi All,

Sorry, I just realized I never actually sent the updated patch...

So here it is :)

Regards,
Tamar
________________________________________
From: gcc-patches-ow...@gcc.gnu.org <gcc-patches-ow...@gcc.gnu.org> on behalf 
of Tamar Christina <tamar.christ...@arm.com>
Sent: Friday, June 9, 2017 4:51:52 PM
To: Richard Sandiford
Cc: GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
Subject: RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned 
memory access

> > +  /* Optimize routines for MEM to REG copies.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
>
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case 
> but
> allow e.g. base+offset?

This function is (as far as I could tell) only called with two kinds of 
destination:
a location on the stack or a plain register.

When the destination is a register, as in

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}
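
For illustration, assume struct3 is a small structure with no tail padding, 
along these lines (my reconstruction, not the actual testcase):

struct struct3 { char a, b, c; };  /* sizeof == 3, no padding */
struct struct3 L3;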

You run into the issue you pointed out below, where we might write too much.
Ideally the constraint I would like is that the destination is either a fresh 
register (containing no live data) or that the structure has tail padding, so 
the extra bytes written are harmless.

I couldn't figure out how to check for that, and the gains over the existing 
code for this case were quite small, so I just disallowed it and left it to 
the existing code. That isn't so bad: only one extra instruction.

>
> > +   {
> > +     unsigned int max_align = UINTVAL (operands[2]);
> > +     max_align = n < max_align ? max_align : n;
>
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:

Correct; a previous version of this patch allowed n < 16, and these lines are 
leftovers from the cleanup.
I'll correct it.

>
> > +     result = gen_rtx_IOR (dest_mode, reg, result);
> > +      }
> > +
> > +    dst = adjust_address (dst, dest_mode, 0);
> > +    emit_insn (gen_move_insn (dst, result));
>
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?

Yes, see my answer above. For the other case, when we write to a location on 
the stack, this is fine
due to the alignment: the extra bytes fall into the slot's padding.
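
To make that concrete, here is a standalone sketch of what the expansion does 
for n == 3, where smallest_mode_for_size picks the 4-byte SImode (plain C, 
hypothetical function name; the casts stand in for the RTL moves):

void
copy3 (struct struct3 *dstp, const struct struct3 *srcp)
{
  unsigned int tmp = 0;                 /* 'result', zero-initialized */
  __builtin_memcpy (&tmp, srcp, 3);     /* the 2-byte + 1-byte chunk loads */
  *(unsigned int *) dstp = tmp;         /* one SImode store: writes 4 bytes */
}

The fourth byte written is harmless when dstp is an aligned stack slot, but it 
would clobber live data after a destination like L3 above, hence the REG_P 
check.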

>
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
> >
> >    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >    dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
> > +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
>
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)

Ah, yes, that makes sense. I'll update the patch.

The new patch is validating now; I'll submit it soon.
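
In case it's useful for review: the inner loop in the new code is just a 
greedy power-of-two decomposition of the remaining byte count, so e.g. a 
7-byte copy becomes a 4-byte, a 2-byte and a 1-byte load, each zero-extended, 
shifted into place and OR'd into the result. A standalone sketch of that 
chunking logic (plain C, names are mine):

#include <stdio.h>

/* Pick the largest power-of-two chunk that fits in the remaining
   bytes, then advance past it -- mirroring the expansion loop.  */
static void
print_chunks (unsigned int n)
{
  unsigned int shift_cnt = 0;
  while (shift_cnt < n)
    {
      unsigned int nearest = 1;
      for (unsigned int max = 1; max <= n - shift_cnt; max *= 2)
	nearest = max;
      printf ("%u-byte load at offset %u\n", nearest, shift_cnt);
      shift_cnt += nearest;
    }
}

int
main (void)
{
  print_chunks (7);   /* 4-byte, 2-byte, 1-byte */
  return 0;
}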

>
> Thanks,
> Richard
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ab1bdc0233afe7a3c41501cb724a5c4c719123b8..1cf6c4b891571745f740d7dbd03754bbba9264a6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13466,7 +13466,6 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 
 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
    we succeed, otherwise return false.  */
-
 bool
 aarch64_expand_movmem (rtx *operands)
 {
@@ -13498,16 +13497,55 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.
+     This expansion is only used for two kinds of destination:
+     1) a regular register and 2) a location on the stack.
+     When writing to a regular register we may end up writing too much
+     in cases where the register already contains a live value or where
+     the structure has no padding, so we disallow that case.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+    {
+      machine_mode mov_mode;
+      machine_mode dest_mode
+	= smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
+      rtx result = gen_reg_rtx (dest_mode);
+      emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+      unsigned int shift_cnt = 0;
+      for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+	{
+	  /* Find the largest power-of-two chunk that fits in the
+	     remaining bytes.  */
+	  unsigned int nearest = 1;
+	  for (unsigned int max = 1; max <= (n - shift_cnt); max *= 2)
+	    nearest = max;
+
+	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
+	  rtx reg = gen_reg_rtx (mov_mode);
+
+	  src = adjust_address (src, mov_mode, 0);
+	  emit_insn (gen_move_insn (reg, src));
+	  src = aarch64_progress_pointer (src);
+
+	  if (mov_mode == dest_mode)
+	    /* A single chunk covers the whole copy; no extend or shift is
+	       needed (and a same-mode ZERO_EXTEND would be invalid RTL).  */
+	    result = reg;
+	  else
+	    {
+	      reg = gen_rtx_ZERO_EXTEND (dest_mode, reg);
+	      reg = gen_rtx_ASHIFT (dest_mode, reg,
+				    GEN_INT (shift_cnt * BITS_PER_UNIT));
+	      result = gen_rtx_IOR (dest_mode, reg, result);
+	    }
+	}
+
+      dst = adjust_address (dst, dest_mode, 0);
+      emit_insn (gen_move_insn (dst, result));
+      return true;
+    }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
      1-byte chunk.  */
   if (n < 4)
     {
       if (n >= 2)
 	{
 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
 	  n -= 2;
 	}
 
       if (n == 1)
 	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
 
diff --git a/gcc/expr.c b/gcc/expr.c
index 2d8868e52cefdfdc2d81135811cbf1cb3e6ff82b..7eca579e769acb131bd0db344815e344fdc948f2 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
-  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  bitsize = BITS_PER_WORD;
+  if (SLOW_UNALIGNED_ACCESS (word_mode, TYPE_ALIGN (TREE_TYPE (src))))
+    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
 
   /* Copy the structure BITSIZE bits at a time.  */
   for (bitpos = 0, xbitpos = padding_correction;
