On Thu, Sep 25, 2014 at 03:57:36PM +0100, James Greenhalgh wrote:
> Hi,
>
> This patch wires up our new target hooks for AArch64. This also means
> we can bring back the two failing SRA tests.
>
> Bootstrapped on AArch64 with no issues.
>
> OK for trunk?
No way! This patch is nonsense as it stands!

I'd like to withdraw this for now while I have a think about what has
gone wrong!

Thanks,
James

> Thanks,
> James
>
> ---
> gcc/
>
> 2014-09-25  James Greenhalgh  <james.greenha...@arm.com>
>
>	* config/aarch64/aarch64.c
>	(aarch64_estimate_movmem_ninsns): New.
>	(aarch64_expand_movmem): Refactor old move costs.
>	(aarch64_move_by_pieces_profitable_p): New.
>	(aarch64_estimate_block_copy_ninsns): Likewise.
>	(aarch64_max_scalarization_size): Likewise.
>	(TARGET_MAX_SCALARIZATION_SIZE): Likewise.
>	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
>	* config/aarch64/aarch64.h (AARCH64_MOVE_RATIO): New.
>	(MOVE_RATIO): Delete.
>
> gcc/testsuite/
>
> 2014-09-25  James Greenhalgh  <james.greenha...@arm.com>
>
>	* gcc.dg/tree-ssa/pr42585.c: Bring back for AArch64.
>	* gcc.dg/tree-ssa/sra-12.c: Likewise.
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 3483081..d8b5a4a 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9616,6 +9616,34 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
>    return false;
>  }
>
> +static unsigned int
> +aarch64_estimate_movmem_ninsns (HOST_WIDE_INT size)
> +{
> +  HOST_WIDE_INT chunks = 0;
> +  int n = size;
> +
> +  /* 3 bytes is a 2-byte then a 1-byte copy.  */
> +  if (n == 3)
> +    return 2;
> +
> +  /* 5, 6, 7 bytes need an extra copy.  */
> +  if (n > 4 && n < 8)
> +    chunks++;
> +
> +  /* If n was greater than 8, it is dealt with in 8/16-byte chunks
> +     first.  */
> +  chunks += n / 16;
> +  n %= 16;
> +  chunks += n / 8;
> +  n %= 8;
> +
> +  /* Anything left is dealt with in one instruction.  */
> +  if (n != 0)
> +    chunks++;
> +
> +  return chunks;
> +}
> +
>  /* Return a new RTX holding the result of moving POINTER forward by
>     AMOUNT bytes.  */
>
> @@ -9673,7 +9701,7 @@ aarch64_expand_movmem (rtx *operands)
>
>    /* When optimizing for size, give a better estimate of the length of a
>       memcpy call, but use the default otherwise.  */
> -  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
> +  unsigned int max_instructions = AARCH64_MOVE_RATIO (speed_p);
>
>    /* We can't do anything smart if the amount to copy is not constant.  */
>    if (!CONST_INT_P (operands[2]))
>      return false;
>
>    n = UINTVAL (operands[2]);
>
> -  /* Try to keep the number of instructions low.  For cases below 16 bytes we
> -     need to make at most two moves.  For cases above 16 bytes it will be one
> -     move for each 16 byte chunk, then at most two additional moves.  */
> -  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
> +  /* Try to keep the number of instructions we emit low; if we can't,
> +     fail the expansion and leave it to memcpy.  */
> +  if (aarch64_estimate_movmem_ninsns (n) > max_instructions)
>      return false;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> @@ -9774,6 +9801,57 @@ aarch64_expand_movmem (rtx *operands)
>    return true;
>  }
>
> +/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
> +
> +bool
> +aarch64_move_by_pieces_profitable_p (unsigned int size,
> +                                     unsigned int align,
> +                                     bool speed_p)
> +{
> +  /* For strict alignment we don't want to use our unaligned
> +     movmem implementation.  */
> +  if (STRICT_ALIGNMENT)
> +    return (AARCH64_MOVE_RATIO (speed_p)
> +            < move_by_pieces_ninsns (size, align, speed_p));
> +
> +  /* If we have an overhang of 3, 6 or 7 bytes, we would emit an unaligned
> +     load to cover it; if that is likely to be slow, we would do better
> +     going through move_by_pieces.  */
> +  if (size % 8 > 5)
> +    return SLOW_UNALIGNED_ACCESS (DImode, 1);
> +  else if (size % 4 == 3)
> +    return SLOW_UNALIGNED_ACCESS (SImode, 1);
> +
> +  /* We can likely do a better job than the move_by_pieces infrastructure
> +     can.  */
> +  return false;
> +}
> +
> +/* Implement TARGET_ESTIMATE_BLOCK_COPY_NINSNS.  */
> +
> +unsigned int
> +aarch64_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
> +{
> +  if (aarch64_move_by_pieces_profitable_p (size, 8, speed_p))
> +    return move_by_pieces_ninsns (size, 8, MOVE_MAX_PIECES);
> +  else if (aarch64_estimate_movmem_ninsns (size)
> +           < AARCH64_MOVE_RATIO (speed_p))
> +    return aarch64_estimate_movmem_ninsns (size);
> +  else
> +    /* memcpy.  Set up 3 arguments and make a call.  */
> +    return 4;
> +}
> +
> +/* Implement TARGET_MAX_SCALARIZATION_SIZE.  */
> +
> +unsigned int
> +aarch64_max_scalarization_size (bool speed_p)
> +{
> +  /* The maximum number of instructions we are willing to use * the
> +     maximum size we can move in one instruction (LDP/STP).  */
> +  return AARCH64_MOVE_RATIO (speed_p) * 16;
> +}
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>
> @@ -9843,6 +9921,10 @@ aarch64_expand_movmem (rtx *operands)
>  #undef TARGET_BUILTIN_DECL
>  #define TARGET_BUILTIN_DECL aarch64_builtin_decl
>
> +#undef TARGET_ESTIMATE_BLOCK_COPY_NINSNS
> +#define TARGET_ESTIMATE_BLOCK_COPY_NINSNS \
> +  aarch64_estimate_block_copy_ninsns
> +
>  #undef TARGET_EXPAND_BUILTIN
>  #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
>
> @@ -9897,9 +9979,17 @@ aarch64_expand_movmem (rtx *operands)
>  #undef TARGET_MANGLE_TYPE
>  #define TARGET_MANGLE_TYPE aarch64_mangle_type
>
> +#undef TARGET_MAX_SCALARIZATION_SIZE
> +#define TARGET_MAX_SCALARIZATION_SIZE \
> +  aarch64_max_scalarization_size
> +
>  #undef TARGET_MEMORY_MOVE_COST
>  #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
>
> +#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
> +#define TARGET_MOVE_BY_PIECES_PROFITABLE_P \
> +  aarch64_move_by_pieces_profitable_p
> +
>  #undef TARGET_MUST_PASS_IN_STACK
>  #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index db950da..5c8d37d 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -678,17 +678,10 @@ do {					\
>  /* Maximum bytes moved by a single instruction (load/store pair).  */
>  #define MOVE_MAX (UNITS_PER_WORD * 2)
>
> -/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
> +/* The base cost overhead of a memcpy call, for CLEAR_RATIO and friends.  */
>  #define AARCH64_CALL_RATIO 8
>
> -/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
> -   move_by_pieces will continually copy the largest safe chunks.  So a
> -   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
> -   for both size and speed of copy, so we will instead use the "movmem"
> -   standard name to implement the copy.  This logic does not apply when
> -   targeting -mstrict-align, so keep a sensible default in that case.  */
> -#define MOVE_RATIO(speed) \
> -  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
> +#define AARCH64_MOVE_RATIO(speed) (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
>
>  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
>     of the length of a memset call, but use the default otherwise.  */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> index 07f575d..a970c85 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
>  /* Whether the structs are totally scalarized or not depends on the
>     MOVE_RATIO macro definition in the back end.  The scalarization will
>     not take place when using small values for MOVE_RATIO.  */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> index 45aa963..59e5e6a 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> @@ -21,5 +21,5 @@ int foo (struct S *p)
>    *p = l;
>  }
>
> -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "release_ssa" } } */
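
For anyone wanting to sanity-check the instruction-count estimates in the
quoted patch: below is a minimal standalone sketch, not part of the patch,
that restates the counting logic of aarch64_estimate_movmem_ninsns in plain
C (long stands in for HOST_WIDE_INT; estimate_movmem_ninsns and the driver
in main are local illustrations, not GCC functions).

/* Standalone sketch of the chunk-counting logic, for checking the
   estimates outside GCC.  */
#include <stdio.h>

static unsigned int
estimate_movmem_ninsns (long size)
{
  long chunks = 0;
  long n = size;

  /* 3 bytes is a 2-byte then a 1-byte copy.  */
  if (n == 3)
    return 2;

  /* 5, 6, 7 bytes need an extra copy.  */
  if (n > 4 && n < 8)
    chunks++;

  /* Larger sizes are dealt with in 16-byte, then 8-byte chunks.  */
  chunks += n / 16;
  n %= 16;
  chunks += n / 8;
  n %= 8;

  /* Anything left is one (possibly overlapping) access.  */
  if (n != 0)
    chunks++;

  return chunks;
}

int
main (void)
{
  /* E.g. 23 bytes: one 16-byte chunk, then the 7-byte overhang counts
     as a single overlapping access, giving an estimate of 2.  */
  long sizes[] = { 3, 7, 16, 23, 64, 100 };
  unsigned int i;

  for (i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    printf ("%ld bytes -> %u insns\n", sizes[i],
            estimate_movmem_ninsns (sizes[i]));
  return 0;
}

Note also that AARCH64_MOVE_RATIO (speed) evaluates to 15 / 2 = 7 when
optimizing for speed and to AARCH64_CALL_RATIO / 2 = 4 when optimizing for
size, so with this patch aarch64_expand_movmem gives up and calls memcpy
for copies estimated at more than 7 (respectively 4) instructions.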