Unfortunately, this doesn't yield a performance improvement for coremark itself, but the pattern occurs a few times in newlib, just enough to affect coremark by 0.01% in size (4 bytes) and by three cycles; __fwalk_sglue and __vfiprintf_r each shrink by two bytes.
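
To illustrate (example mine, not taken from the patch; the exact pre-split mnemonic and register are assumptions), an addition of 100 that would otherwise be emitted as one non-quick add, roughly:

    adds.b 100,$r10     ; illustration only: one insn, cannot fill a delay-slot

can now be split by the peephole into two quick additions:

    addq 63,$r10
    addq 37,$r10        ; 63 + 37 == 100

either of which the delayed-branch pass may then move into a delay-slot, as in the ret sequences checked by the new test.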
gcc:
        * config/cris/cris.md (splitop): Add PLUS.
        * config/cris/cris.cc (cris_split_constant): Also handle PLUS
        when a split into two insns may be useful.

gcc/testsuite:
        * gcc.target/cris/peep2-addsplit1.c: New test.
---
 gcc/config/cris/cris.cc                       | 25 +++++++-
 gcc/config/cris/cris.md                       |  6 +-
 .../gcc.target/cris/peep2-addsplit1.c         | 59 +++++++++++++++++++
 3 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/cris/peep2-addsplit1.c

diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc
index 331f5908a538..561ca1b3fa92 100644
--- a/gcc/config/cris/cris.cc
+++ b/gcc/config/cris/cris.cc
@@ -2642,7 +2642,30 @@ cris_split_constant (HOST_WIDE_INT wval, enum rtx_code code,
   int32_t ival = (int32_t) wval;
   uint32_t uval = (uint32_t) wval;
 
-  if (code != AND || IN_RANGE(ival, -32, 31)
+  /* Can we do with two addq or two subq, improving chances of filling a
+     delay-slot?  At worst, we break even, both performance and
+     size-wise.  */
+  if (code == PLUS
+      && (IN_RANGE (ival, -63 * 2, -63 - 1)
+          || IN_RANGE (ival, 63 + 1, 63 * 2)))
+    {
+      if (generate)
+        {
+          int sign = ival < 0 ? -1 : 1;
+          int aval = abs (ival);
+
+          if (mode != SImode)
+            {
+              dest = gen_rtx_REG (SImode, REGNO (dest));
+              op = gen_rtx_REG (SImode, REGNO (op));
+            }
+          emit_insn (gen_addsi3 (dest, op, GEN_INT (63 * sign)));
+          emit_insn (gen_addsi3 (dest, op, GEN_INT ((aval - 63) * sign)));
+        }
+      return 2;
+    }
+
+  if (code != AND || IN_RANGE (ival, -32, 31)
       /* Implemented using movu.[bw] elsewhere.  */
       || ival == 255 || ival == 65535
       /* Implemented using clear.[bw] elsewhere.  */
diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md
index 53fc2f2de4af..243d47748b78 100644
--- a/gcc/config/cris/cris.md
+++ b/gcc/config/cris/cris.md
@@ -209,7 +209,7 @@ (define_code_iterator plusminusumin [plus minus umin])
 (define_code_iterator plusumin [plus umin])
 
 ;; For opsplit1.
-(define_code_iterator splitop [and])
+(define_code_iterator splitop [and plus])
 
 ;; The addsubbo and nd code-attributes form a hack.  We need to output
 ;; "addu.b", "subu.b" but "bound.b" (no "u"-suffix) which means we'd
@@ -2984,6 +2984,10 @@ (define_peephole2 ; movandsplit1
 
 ;; Large (read: non-quick) numbers can sometimes be AND:ed by other means.
 ;; Testcase: gcc.target/cris/peep2-andsplit1.c
+;;
+;; Another case is add<ext> N,rx with -126..-64,64..126: it has the same
+;; size and execution time as two addq or subq, but addq and subq can fill
+;; a delay-slot.
 (define_peephole2 ; opsplit1
   [(parallel
     [(set (match_operand 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/cris/peep2-addsplit1.c b/gcc/testsuite/gcc.target/cris/peep2-addsplit1.c
new file mode 100644
index 000000000000..7dff1d8c77c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/cris/peep2-addsplit1.c
@@ -0,0 +1,52 @@
+/* Check that "opsplit1" with PLUS does its job.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-leading-underscore" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+int addsi (int x)
+{
+  return x + 64;
+}
+
+char addqi (char x)
+{
+  return x + 126;
+}
+
+short addhi (short x)
+{
+  return x - 64;
+}
+
+unsigned short addhi2 (short x)
+{
+  return x - 126;
+}
+
+/*
+** addsi:
+** addq 63,.r10
+** ret
+** addq 1,.r10
+*/
+
+/*
+** addqi:
+** addq 63,.r10
+** ret
+** addq 63,.r10
+*/
+
+/*
+** addhi:
+** subq 63,.r10
+** ret
+** subq 1,.r10
+*/
+
+/*
+** addhi2:
+** subq 63,.r10
+** ret
+** subq 63,.r10
+*/
-- 
2.30.2