On Mon, Sep 21, 2020 at 06:51:00PM +0200, Andreas Krebbel wrote: > On 18.09.20 13:10, Stefan Schulze Frielinghaus wrote: > > This patch enables a peephole2 optimization which transforms a load of > > constant zero into a temporary register which is then finally used to > > compare against a floating-point register of interest into a single load > > and test instruction. However, the optimization is only applied if both > > registers are dead afterwards and if we test for (in)equality only. > > This is relaxed in case of fast math. > > > > This is a follow up to PR88856. > > > > Bootstrapped and regtested on IBM Z. > > > > gcc/ChangeLog: > > > > * config/s390/s390.md ("*cmp<mode>_ccs_0", "*cmp<mode>_ccz_0", > > "*cmp<mode>_ccs_0_fastmath"): Basically change "*cmp<mode>_ccs_0" into > > "*cmp<mode>_ccz_0" and for fast math add "*cmp<mode>_ccs_0_fastmath". > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/s390/load-and-test-fp-1.c: Change test to include all > > possible combinations of dead/live registers and comparisons (equality, > > relational). > > * gcc.target/s390/load-and-test-fp-2.c: Same as load-and-test-fp-1.c > > but for fast math. > > * gcc.target/s390/load-and-test-fp.h: New test included by > > load-and-test-fp-{1,2}.c. > > Ok for mainline. Please see below for some comments.
Pushed with the mentioned changes in commit 1a84651d164. Thanks for the review! Cheers, Stefan > > Thanks! > > Andreas > > > --- > > gcc/config/s390/s390.md | 54 +++++++++++++++---- > > .../gcc.target/s390/load-and-test-fp-1.c | 19 +++---- > > .../gcc.target/s390/load-and-test-fp-2.c | 17 ++---- > > .../gcc.target/s390/load-and-test-fp.h | 12 +++++ > > 4 files changed, 67 insertions(+), 35 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/s390/load-and-test-fp.h > > > > diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md > > index 4c3e5400a2b..e591aa7c324 100644 > > --- a/gcc/config/s390/s390.md > > +++ b/gcc/config/s390/s390.md > > @@ -1391,23 +1391,55 @@ > > ; (TF|DF|SF|TD|DD|SD) instructions > > > > > > -; FIXME: load and test instructions turn SNaN into QNaN what is not > > -; acceptable if the target will be used afterwards. On the other hand > > -; they are quite convenient for implementing comparisons with 0.0. So > > -; try to enable them via splitter/peephole if the value isn't needed > > anymore. > > -; See testcases: load-and-test-fp-1.c and load-and-test-fp-2.c > > +; load and test instructions turn a signaling NaN into a quiet NaN. Thus > > they > > +; may only be used if the target register is dead afterwards or if fast > > math > > +; is enabled. The former is done via a peephole optimization. Note, load > > and > > +; test instructions may only be used for (in)equality comparisons because > > +; relational comparisons must treat a quiet NaN like a signaling NaN which > > is > > +; not the case for load and test instructions. For fast math insn > > +; "cmp<mode>_ccs_0_fastmath" applies. 
> > +; See testcases load-and-test-fp-{1,2}.c > > + > > +(define_peephole2 > > + [(set (match_operand:FP 0 "register_operand") > > + (match_operand:FP 1 "const0_operand")) > > + (set (reg:CCZ CC_REGNUM) > > + (compare:CCZ (match_operand:FP 2 "register_operand") > > + (match_operand:FP 3 "register_operand")))] > > + "TARGET_HARD_FLOAT > > + && FP_REG_P (operands[2]) > > + && REGNO (operands[0]) == REGNO (operands[3]) > > + && peep2_reg_dead_p (2, operands[0]) > > + && peep2_reg_dead_p (2, operands[2])" > > + [(parallel > > + [(set (reg:CCZ CC_REGNUM) > > + (match_op_dup 4 [(match_dup 2) (match_dup 1)])) > > + (clobber (match_dup 2))])] > > + "operands[4] = gen_rtx_COMPARE (CCZmode, operands[2], operands[1]);") > > Couldn't this be written as: > > [(parallel > [(set (reg:CCZ CC_REGNUM) > (compare:CCZ (match_dup 2) (match_dup 1))) > (clobber (match_dup 2))])]) > > > > > ; ltxbr, ltdbr, ltebr, ltxtr, ltdtr > > -(define_insn "*cmp<mode>_ccs_0" > > - [(set (reg CC_REGNUM) > > - (compare (match_operand:FP 0 "register_operand" "f") > > - (match_operand:FP 1 "const0_operand" ""))) > > - (clobber (match_operand:FP 2 "register_operand" "=0"))] > > - "s390_match_ccmode(insn, CCSmode) && TARGET_HARD_FLOAT" > > +(define_insn "*cmp<mode>_ccz_0" > > + [(set (reg:CCZ CC_REGNUM) > > + (compare:CCZ (match_operand:FP 0 "register_operand" "f") > > + (match_operand:FP 1 "const0_operand"))) > > + (clobber (match_operand:FP 2 "register_operand" "=0"))] > > + "TARGET_HARD_FLOAT" > > "lt<xde><bt>r\t%0,%0" > > [(set_attr "op_type" "RRE") > > (set_attr "type" "fsimp<mode>")]) > > > > +(define_insn "*cmp<mode>_ccs_0_fastmath" > > + [(set (reg CC_REGNUM) > > + (compare (match_operand:FP 0 "register_operand" "f") > > + (match_operand:FP 1 "const0_operand")))] > > + "s390_match_ccmode (insn, CCSmode) > > + && TARGET_HARD_FLOAT > > + && !flag_trapping_math > > + && !flag_signaling_nans" > > + "lt<xde><bt>r\t%0,%0" > > + [(set_attr "op_type" "RRE") > > + (set_attr "type" "fsimp<mode>")]) > > + > > 
; VX: TFmode in FPR pairs: use cxbr instead of wfcxb > > ; cxtr, cdtr, cxbr, cdbr, cebr, cdb, ceb, wfcsb, wfcdb > > (define_insn "*cmp<mode>_ccs" > > diff --git a/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c > > b/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c > > index 2a7e88c0f1b..ebb8a88c574 100644 > > --- a/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c > > +++ b/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c > > @@ -1,17 +1,12 @@ > > /* { dg-do compile } */ > > /* { dg-options "-O3 -mzarch" } */ > > I think -march=z196 would be needed here as well. Otherwise the 0.0 floating > point constant won't > survive until the peephole pass. We don't accept FP zeroes in reload for machines > earlier than z196. See > legitimate_reload_fp_constant_p. > > It should be ok as is for load-and-test-fp-2.c. There the comparison pattern > supporting an FP zero is > matched right from the start. > > > > > -/* a is used after the comparison. We cannot use load and test here > > - since it would turn SNaNs into QNaNs. */ > > +/* Use load-and-test instructions if compared for (in)equality and if > > variable > > + `a` is dead after the comparison. For all other cases use > > + compare-and-signal instructions. 
*/ > > > > -double gl; > > +#include "load-and-test-fp.h" > > > > -double > > -foo (double dummy, double a) > > -{ > > - if (a == 0.0) > > - gl = 1; > > - return a; > > -} > > - > > -/* { dg-final { scan-assembler {\tcdbr?\t} } } */ > > +/* { dg-final { scan-assembler-times "ltdbr\t" 2 } } */ > > +/* { dg-final { scan-assembler-times "cdbr\t" 2 } } */ > > +/* { dg-final { scan-assembler-times "kdbr\t" 8 } } */ > > diff --git a/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c > > b/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c > > index 7646fdd5def..53dab3c4424 100644 > > --- a/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c > > +++ b/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c > > @@ -1,16 +1,9 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O3" } */ > > +/* { dg-options "-O3 -mzarch -ffast-math" } */ > > > > -/* a is not used after the comparison. So we should use load and test > > - here. */ > > +/* Fast-math implies -fno-trapping-math -fno-signaling-nans which imply > > + that no user visible trap will happen. */ > > > > -double gl; > > +#include "load-and-test-fp.h" > > > > -void > > -bar (double a) > > -{ > > - if (a == 0.0) > > - gl = 1; > > -} > > - > > -/* { dg-final { scan-assembler "ltdbr\t" } } */ > > +/* { dg-final { scan-assembler-times "ltdbr\t" 12 } } */ > > diff --git a/gcc/testsuite/gcc.target/s390/load-and-test-fp.h > > b/gcc/testsuite/gcc.target/s390/load-and-test-fp.h > > new file mode 100644 > > index 00000000000..f153d96698d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/s390/load-and-test-fp.h > > @@ -0,0 +1,12 @@ > > +double gl; > > + > > +#define test(N, CMP) \ > > + void N ## _dead(double a) { if (a CMP 0.0) gl = 1; } \ > > + double N ## _live(double a) { if (a CMP 0.0) gl = 1; return a; } > > + > > +test(eq, ==) > > +test(ne, !=) > > +test(ge, >=) > > +test(gt, >) > > +test(le, <=) > > +test(lt, <) > > >