[PATCHv5][PR 57371] Remove useless floating-point casts in comparisons
Hi all,

This is an updated version of the patch in
https://gcc.gnu.org/ml/gcc-patches/2017-07/msg00924.html .
It prevents the optimization in the presence of sNaNs (and qNaNs when the
comparison operator is > >= < <=) to preserve FP exceptions.

I've fixed some stylistic issues mentioned in the previous review.

Rebased, bootstrapped and regtested on x64. Ok for trunk?

-Y

2017-07-30  Yury Gribov

	PR tree-optimization/57371

gcc/
	* match.pd: New pattern.

gcc/testsuite/
	* c-c++-common/pr57371-1.c: New test.
	* c-c++-common/pr57371-2.c: New test.
	* c-c++-common/pr57371-3.c: New test.
	* c-c++-common/pr57371-4.c: New test.
	* gcc.dg/pr57371-5.c: New test.

diff -rupN gcc/gcc/match.pd gcc-57371/gcc/match.pd
--- gcc/gcc/match.pd	2017-07-18 22:21:16.0 +0200
+++ gcc-57371/gcc/match.pd	2017-07-28 05:00:05.0 +0200
@@ -2805,6 +2805,80 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (if (! HONOR_NANS (@0))
     (cmp @0 @1))))

+/* Optimize various special cases of (FTYPE) N CMP CST.  */
+(for cmp  (lt le eq ne ge gt)
+     icmp (le le eq ne ge ge)
+ (simplify
+  (cmp (float @0) REAL_CST@1)
+  (if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (@1))
+       && ! DECIMAL_FLOAT_TYPE_P (TREE_TYPE (@1)))
+   (with
+    {
+      tree itype = TREE_TYPE (@0);
+      signop isign = TYPE_SIGN (itype);
+      format_helper fmt (REAL_MODE_FORMAT (TYPE_MODE (TREE_TYPE (@1))));
+      const REAL_VALUE_TYPE *cst = TREE_REAL_CST_PTR (@1);
+      /* Be careful to preserve any potential exceptions due to
+	 NaNs.  qNaNs are ok in == or != context.
+	 TODO: relax under -fno-trapping-math or
+	 -fno-signaling-nans.  */
+      bool exception_p
+	= real_isnan (cst) && (cst->signalling
+			       || (cmp != EQ_EXPR && cmp != NE_EXPR));
+      /* INT?_MIN is power-of-two so it takes
+	 only one mantissa bit.  */
+      bool signed_p = isign == SIGNED;
+      bool itype_fits_ftype_p
+	= TYPE_PRECISION (itype) - signed_p <= significand_size (fmt);
+    }
+    /* TODO: allow non-fitting itype and SNaNs when
+       -fno-trapping-math.  */
+    (if (itype_fits_ftype_p && ! exception_p)
+     (with
+      {
+	REAL_VALUE_TYPE imin, imax;
+	real_from_integer (&imin, fmt, wi::min_value (itype), isign);
+	real_from_integer (&imax, fmt, wi::max_value (itype), isign);
+
+	REAL_VALUE_TYPE icst;
+	if (cmp == GT_EXPR || cmp == GE_EXPR)
+	  real_ceil (&icst, fmt, cst);
+	else if (cmp == LT_EXPR || cmp == LE_EXPR)
+	  real_floor (&icst, fmt, cst);
+	else
+	  real_trunc (&icst, fmt, cst);
+
+	bool cst_int_p = real_identical (&icst, cst);
+
+	bool overflow_p = false;
+	wide_int icst_val
+	  = real_to_integer (&icst, &overflow_p, TYPE_PRECISION (itype));
+      }
+      (switch
+       /* Optimize cases when CST is outside of ITYPE's range.  */
+       (if (real_compare (LT_EXPR, cst, &imin))
+	{ constant_boolean_node (cmp == GT_EXPR || cmp == GE_EXPR
+				 || cmp == NE_EXPR, type); })
+       (if (real_compare (GT_EXPR, cst, &imax))
+	{ constant_boolean_node (cmp == LT_EXPR || cmp == LE_EXPR
+				 || cmp == NE_EXPR, type); })
+       /* Remove cast if CST is an integer representable by ITYPE.  */
+       (if (cst_int_p)
+	(cmp @0 { gcc_assert (!overflow_p);
+		  wide_int_to_tree (itype, icst_val); })
+       )
+       /* When CST is fractional, optimize
+	    (FTYPE) N == CST -> 0
+	    (FTYPE) N != CST -> 1.  */
+       (if (cmp == EQ_EXPR || cmp == NE_EXPR)
+	{ constant_boolean_node (cmp == NE_EXPR, type); })
+       /* Otherwise replace with sensible integer constant.  */
+       (with
+	{
+	  gcc_checking_assert (!overflow_p);
+	}
+	(icmp @0 { wide_int_to_tree (itype, icst_val); })))))))))
+
 /* Fold A /[ex] B CMP C to A CMP B * C.  */
 (for cmp (eq ne)
  (simplify
diff -rupN gcc/gcc/testsuite/c-c++-common/pr57371-1.c gcc-57371/gcc/testsuite/c-c++-common/pr57371-1.c
--- gcc/gcc/testsuite/c-c++-common/pr57371-1.c	1970-01-01 01:00:00.0 +0100
+++ gcc-57371/gcc/testsuite/c-c++-common/pr57371-1.c	2017-07-28 05:00:05.0 +0200
@@ -0,0 +1,341 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-original" } */
+
+#include <limits.h>
+
+/* Original testcase from PR.  */
+
+int foo1 (short x) {
+  return (double) x != 0;
+  /* { dg-final { scan-tree-dump "return (<retval> = )?x != 0" "original" } } */
+}
+
+int foo2 (short x) {
+  return (float) x != 0;
+  /* { dg-final { scan-tree-dump "return (<retval> = )?x != 0" "original" } } */
+}
+
+int foo3 (int x) {
+  return (double) x != 0;
+  /* { dg-final { scan-tree-dump "return (<retval> = )?x != 0" "original" } } */
+}
+
+/* Tests when RHS is within range of integer type.  */
+
+void in_range (un
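To make the effect of the pattern concrete, here is a small illustrative
example (mine, not part of the patch's testsuite); the folds noted in the
comments follow from the pattern above, assuming a 16-bit short and IEEE
double:

/* Illustration of the three classes of folds the new pattern performs.  */

int cast_removed (short x)
{
  /* Every short is exactly representable in double, so the cast is
     dropped: folds to x != 0.  */
  return (double) x != 0;
}

int range_folded (short x)
{
  /* 1e10 is above SHRT_MAX, so the comparison is known false:
     folds to 0.  */
  return (double) x == 1.0e10;
}

int fraction_folded (short x)
{
  /* A converted integer can never equal a fractional constant:
     folds to 1.  */
  return (double) x != 0.5;
}

int rounded (short x)
{
  /* For ordered comparisons the fractional bound is rounded and the
     comparison becomes integral: folds to x <= 0.  */
  return (double) x < 0.5;
}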
Re: [PATCH] i386: Rewrite check for AVX512 features
On Sat, Jul 29, 2017 at 3:06 PM, H.J. Lu wrote: > Add a new file, avx512-check.h, to check all AVX512 features. The test > is skipped if any requested AVX512 features are unavailable. > > Tested on Skylake server and Haswell. OK for trunk? No, I'd rather leave it in in the way they are now, so test can include individual checks. Uros. > > H.J. > --- > PR target/81590 > * gcc.target/i386/avx512-check.h: New file. > * gcc.target/i386/avx5124fmaps-check.h: Removed. > * gcc.target/i386/avx5124vnniw-check.h: Likewise. > * gcc.target/i386/avx512cd-check.h: Likewise. > * gcc.target/i386/avx512ifma-check.h: Likewise. > * gcc.target/i386/avx512vbmi-check.h: Likewise. > * gcc.target/i386/avx512vpopcntdq-check.h: Likewise. > * gcc.target/i386/avx512bw-check.h: Rewrite. > * gcc.target/i386/avx512dq-check.h: Likewise. > * gcc.target/i386/avx512er-check.h: Likewise. > * gcc.target/i386/avx512f-check.h: Likewise. > * gcc.target/i386/avx512vl-check.h: Likewise. > * gcc.target/i386/avx512f-helper.h: Include "avx512-check.h" > only. > (test_512): Removed. > (avx512*_test): Likewise. > * gcc.target/i386/avx512f-pr71559.c (TEST): Undef. > --- > gcc/testsuite/gcc.target/i386/avx512-check.h | 113 > + > gcc/testsuite/gcc.target/i386/avx5124fmaps-check.h | 47 - > gcc/testsuite/gcc.target/i386/avx5124vnniw-check.h | 47 - > gcc/testsuite/gcc.target/i386/avx512bw-check.h | 50 + > gcc/testsuite/gcc.target/i386/avx512cd-check.h | 46 - > gcc/testsuite/gcc.target/i386/avx512dq-check.h | 50 + > gcc/testsuite/gcc.target/i386/avx512er-check.h | 49 + > gcc/testsuite/gcc.target/i386/avx512f-check.h | 49 + > gcc/testsuite/gcc.target/i386/avx512f-helper.h | 64 +--- > gcc/testsuite/gcc.target/i386/avx512f-pr71559.c| 1 + > gcc/testsuite/gcc.target/i386/avx512ifma-check.h | 46 - > gcc/testsuite/gcc.target/i386/avx512vbmi-check.h | 46 - > gcc/testsuite/gcc.target/i386/avx512vl-check.h | 51 +- > .../gcc.target/i386/avx512vpopcntdq-check.h| 47 - > 14 files changed, 130 insertions(+), 576 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/avx512-check.h > delete mode 100644 gcc/testsuite/gcc.target/i386/avx5124fmaps-check.h > delete mode 100644 gcc/testsuite/gcc.target/i386/avx5124vnniw-check.h > delete mode 100644 gcc/testsuite/gcc.target/i386/avx512cd-check.h > delete mode 100644 gcc/testsuite/gcc.target/i386/avx512ifma-check.h > delete mode 100644 gcc/testsuite/gcc.target/i386/avx512vbmi-check.h > delete mode 100644 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-check.h > > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h > b/gcc/testsuite/gcc.target/i386/avx512-check.h > new file mode 100644 > index 000..bfe14960100 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h > @@ -0,0 +1,113 @@ > +#include > +#include "cpuid.h" > +#include "m512-check.h" > +#include "avx512f-os-support.h" > + > +#ifndef DO_TEST > +#define DO_TEST do_test > +#ifdef AVX512VL > +static void test_256 (void); > +static void test_128 (void); > +#else > +static void test_512 (void); > +#endif > + > +__attribute__ ((noinline)) > +static void > +do_test (void) > +{ > +#ifdef AVX512VL > + test_256 (); > + test_128 (); > +#else > + test_512 (); > +#endif > +} > +#endif > + > +int > +main () > +{ > + unsigned int eax, ebx, ecx, edx; > + > + if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)) > +goto skipped; > + > + /* Run AVX512F test only if host has AVX512F support. 
*/ > + if (!(ecx & bit_OSXSAVE)) > +goto skipped; > + > + if (__get_cpuid_max (0, NULL) < 7) > +goto skipped; > + > + __cpuid_count (7, 0, eax, ebx, ecx, edx); > + > + if (!(ebx & bit_AVX512F)) > +goto skipped; > + > +#ifdef AVX512VL > + if (!(ebx & bit_AVX512VL)) > +goto skipped; > +#endif > + > +#ifdef AVX512ER > + if (!(ebx & bit_AVX512ER)) > +goto skipped; > +#endif > + > +#ifdef AVX512CD > + if (!(ebx & bit_AVX512CD)) > +goto skipped; > +#endif > + > +#ifdef AVX512DQ > + if (!(ebx & bit_AVX512DQ)) > +goto skipped; > +#endif > + > +#ifdef AVX512BW > + if (!(ebx & bit_AVX512BW)) > +goto skipped; > +#endif > + > +#ifdef AVX512IFMA > + if (!(ebx & bit_AVX512IFMA)) > +goto skipped; > +#endif > + > +#ifdef AVX512VBMI > + if (!(ecx & bit_AVX512VBMI)) > +goto skipped; > +#endif > + > +#ifdef AVX5124FMAPS > + if (!(edx & bit_AVX5124FMAPS)) > +goto skipped; > +#endif > + > +#ifdef AVX5124VNNIW > + if (!(edx & bit_AVX5124VNNIW)) > +goto skipped; > +#endif > + > +#ifdef AVX512VPOPCNTDQ > + if (!(ecx & bit_AVX512VPOPCNTDQ)) > +goto skipped; > +#endif > + > + if (!avx512f_os_support ()) > +goto skipped; > + > + DO_TEST (); > + > +#ifdef DEBUG > + printf ("
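For reference, a minimal sketch of what an individual test built on the
proposed avx512-check.h might look like; this assumes the DO_TEST/test_512
conventions visible in the quoted header, and the test body is only a
placeholder:

/* Hypothetical test using the proposed avx512-check.h.  Defining
   AVX512DQ before the include makes main() also require the AVX512DQ
   CPUID bit before calling DO_TEST, and skip the test otherwise.
   A real test would also pass -mavx512dq via dg-options.  */
#define AVX512DQ
#include "avx512-check.h"

static void
test_512 (void)
{
  /* A real test would exercise 512-bit AVX512DQ intrinsics here.  */
}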
Re: [PATCH] i386: Update INCOMING_FRAME_SP_OFFSET for exception handler
On Sat, Jul 29, 2017 at 4:22 PM, H.J. Lu wrote: > Since there is an extra error code passed to the exception handler, > INCOMING_FRAME_SP_OFFSET is return address plus error code for the > exception handler. This patch updates INCOMING_FRAME_SP_OFFSET to > the correct value for the exception handler. > > This patch exposed a bug in DWARF stack frame CFI generation, which > assumes that INCOMING_FRAME_SP_OFFSET is the same for all functions: > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81570 > > It sets and caches the incoming stack frame offset with the same > INCOMING_FRAME_SP_OFFSET for all functions. When there are both > exception handler and normal function in the same input, the wrong > incoming stack frame offset is used for exception handler or normal > function, which leads to > > FAIL: gcc.dg/guality/pr68037-1.c -O2 -flto -fuse-linker-plugin > -fno-fat-lto-objects line 33 error == 0x12345670 > FAIL: gcc.dg/guality/pr68037-1.c -O2 -flto -fuse-linker-plugin > -fno-fat-lto-objects line 33 frame->ip == 0x12345671 > FAIL: gcc.dg/guality/pr68037-1.c -O2 -flto -fuse-linker-plugin > -fno-fat-lto-objects line 33 frame->cs == 0x12345672 > FAIL: gcc.dg/guality/pr68037-1.c -O2 -flto -fuse-linker-plugin > -fno-fat-lto-objects line 33 frame->flags == 0x12345673 > FAIL: gcc.dg/guality/pr68037-1.c -O2 -flto -fuse-linker-plugin > -fno-fat-lto-objects line 33 frame->sp == 0x12345674 > FAIL: gcc.dg/guality/pr68037-1.c -O2 -flto -fuse-linker-plugin > -fno-fat-lto-objects line 33 frame->ss == 0x12345675 > > With the patch for PR 81570: > > https://gcc.gnu.org/ml/gcc-patches/2017-07/msg01851.html > > applied, there are no regressions on i686 and x86-64. > > OK for trunk? I have two nits below. LGTM - you know this part better than I, so it is your call. Uros. > H.J. > --- > gcc/ > > PR target/79793 > * config/i386/i386.c (ix86_function_arg): Update arguments for > exception handler. > (ix86_compute_frame_layout): Set the initial stack offset to > INCOMING_FRAME_SP_OFFSET. Update red-zone offset with > INCOMING_FRAME_SP_OFFSET. > (ix86_expand_epilogue): Don't pop the 'ERROR_CODE' off the > stack before exception handler returns. > * config/i386/i386.h (INCOMING_FRAME_SP_OFFSET): Add the > the 'ERROR_CODE' for exception handler. > > gcc/testsuite/ > > PR target/79793 > * gcc.dg/guality/pr68037-1.c: Update gdb breakpoints. > * gcc.target/i386/interrupt-5.c (interrupt_frame): New struct. > (foo): Check the builtin return address against the return address > in interrupt frame. > * gcc.target/i386/pr79793-1.c: New test. > * gcc.target/i386/pr79793-2.c: Likewise. > --- > gcc/config/i386/i386.c | 36 > + > gcc/config/i386/i386.h | 6 +++-- > gcc/testsuite/gcc.dg/guality/pr68037-1.c| 12 +- > gcc/testsuite/gcc.target/i386/interrupt-5.c | 13 +-- > gcc/testsuite/gcc.target/i386/pr79793-1.c | 14 +++ > gcc/testsuite/gcc.target/i386/pr79793-2.c | 16 + > 6 files changed, 62 insertions(+), 35 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr79793-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr79793-2.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index f1486ff3750..e1582f8c9a6 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -10422,25 +10422,21 @@ ix86_function_arg (cumulative_args_t cum_v, > machine_mode omode, > { > /* This is the pointer argument. */ > gcc_assert (TYPE_MODE (type) == Pmode); > - if (cfun->machine->func_type == TYPE_INTERRUPT) > - /* -WORD(AP) in the current frame in interrupt handler. 
*/ > - arg = plus_constant (Pmode, arg_pointer_rtx, > --UNITS_PER_WORD); > - else > - /* (AP) in the current frame in exception handler. */ > - arg = arg_pointer_rtx; > + /* It is at -WORD(AP) in the current frame in interrupt and > +exception handlers. */ > + arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD); > } >else > { > gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION > && TREE_CODE (type) == INTEGER_TYPE > && TYPE_MODE (type) == word_mode); > - /* The integer argument is the error code at -WORD(AP) in > + /* The integer argument is the error code at -2 * WORD(AP) in > the current frame in exception handler. */ The comment should probably say: /* The error code is the word-mode integer argument at -2 * WORD(AP) in the current frame of the exception handler. */ > arg = gen_rtx_MEM (word_mode, > plus_constant (Pmode, >
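To connect the offsets back to source level, here is an illustrative pair
of handlers (example mine, not from the patch) using GCC's documented x86
interrupt/exception handler ABI; such code is normally built with
-mgeneral-regs-only:

typedef unsigned long long int uword_t __attribute__ ((mode (__word__)));

struct interrupt_frame;

/* Interrupt: the CPU pushes only the return frame, so the incoming
   offset past the stack pointer is a single word.  */
__attribute__ ((interrupt))
void
int_handler (struct interrupt_frame *frame)
{
  (void) frame;
}

/* Exception with error code: the CPU pushes the return frame and then
   the error code, so INCOMING_FRAME_SP_OFFSET must account for both
   words -- which is what this patch corrects.  */
__attribute__ ((interrupt))
void
exc_handler (struct interrupt_frame *frame, uword_t error_code)
{
  (void) frame;
  (void) error_code;
}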
Re: [PATCH 1/2, c-family]: Add 'z' to asm_fprintf_char_table
On Wed, Jul 26, 2017 at 2:18 PM, Uros Bizjak wrote: > Hello! > > This patch is the prerequisite for my next patch. It enables %z > extension for asm_fprintf that will emit 'q' or 'l' suffixes for > instructions with word-sized operands. > > 2017-07-26 Uros Bizjak > > * c-format.c (asm_fprintf_char_table): Add 'z' to format_chars. > > Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. > > OK for mainline? I went ahead and committed the patch. Uros.
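As a rough illustration of the intent (a hypothetical backend snippet,
since the i386 consumer lands in the follow-up patch; treat the exact
expansion of %z as an assumption):

/* Hypothetical use of the new 'z' format character: the same
   asm_fprintf template would yield "popl" on 32-bit and "popq" on
   64-bit targets, because %z expands to the suffix for word-sized
   operands.  %R is the existing REGISTER_PREFIX escape.  */
static void
emit_pop (unsigned int regno)
{
  asm_fprintf (asm_out_file, "\tpop%z\t%R%s\n", reg_names[regno]);
}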
Re: [PATCH] PR libstdc++/79162 ambiguity in string assignment due to string_view overload (LWG 2946)
2017-07-28 22:40 GMT+02:00 Daniel Krügler : > 2017-07-28 22:29 GMT+02:00 Daniel Krügler : >> 2017-07-28 22:25 GMT+02:00 Tim Song : >>> On Fri, Jul 28, 2017 at 4:10 PM, Daniel Krügler >>> wrote: + // Performs an implicit conversion from _Tp to __sv_type. + template +static __sv_type _S_to_string_view(const _Tp& __svt) +{ + return __svt; +} >>> >>> I might have gone for >>> >>> +static __sv_type _S_to_string_view(__sv_type __svt) noexcept >>> +{ >>> + return __svt; >>> +} >>> >>> With that, we can also use noexcept(_S_to_string_view(__t)) to make up >>> for the absence of is_nothrow_convertible (basically the same thing I >>> did in LWG 2993's PR). >> >> Agreed, that makes very much sense. I will adjust the P/R, but before >> I resubmit I would like to get feedback whether the other two compare >> functions also should become conditionally noexcept. > > Locally I have now performed the sole change of the _S_to_string_view > declaration getting rid of the template, but would also like to gather > feedback from the maintainers whether I should also change the form of > the conditional noexcept to use the expression > > noexcept(_S_to_string_view(__t)) > > instead of the current > > is_same<_Tp, __sv_type>::value > > as suggested by Tim Song. > > I'm asking also, because I have a paper proposing to standardize > is_nothrow_convertible submitted for the upcoming C++ mailing - This > would be one of the first applications in the library ;-) A slightly revised patch update: It replaces the _S_to_string_view template by a simpler _S_to_string_view function as of Tim Song's suggestion, but still uses the simplified noexcept specification deferring it to a future application case for is_nothrow_convertible. Furthermore now all three compare function templates are now (conditionally) noexcept by an (off-list) suggestion from Jonathan Wakely. Thanks, - Daniel ChangeLog_79162.patch Description: Binary data 79162.patch Description: Binary data
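For readers without the attachments, a condensed sketch of the shape of
the final change as described in the thread; the enclosing basic_string
member context and the _If_sv SFINAE helper are assumed from libstdc++,
so this is not the literal patch:

// Non-template helper performing the implicit conversion, plus a
// compare overload whose noexcept (per the thread) uses is_same as a
// stand-in for a future is_nothrow_convertible.

static __sv_type
_S_to_string_view(__sv_type __svt) noexcept
{ return __svt; }

template<typename _Tp>
  _If_sv<_Tp, int>
  compare(const _Tp& __t) const
  noexcept(is_same<_Tp, __sv_type>::value)
  {
    __sv_type __sv = __t;
    return __sv_type(*this).compare(__sv);
  }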
Re: [PATCH], PR target/81593, Optimize PowerPC vector sets coming from a vector extracts
> On Jul 28, 2017, at 4:08 PM, Segher Boessenkool > wrote: > > Hi! > > On Thu, Jul 27, 2017 at 07:21:14PM -0400, Michael Meissner wrote: >> This patches optimizes the PowerPC vector set operation for 64-bit doubles >> and >> longs where the elements in the vector set may have been extracted from >> another >> vector (PR target/81593): > >> * config/rs6000/rs6000.c (rs6000_emit_xxpermdi): New function to >> emit XXPERMDI accessing either double word in either vector >> register inputs. > > "emit" is not a good name for this: that is generally used for something > that does emit_insn, i.e. put an insn in the instruction stream. This > function returns a string a define_insn can return. For the rl* insns > I called the similar functions rs6000_insn_for_*, maybe something like > that is better here? > >> +/* Emit a XXPERMDI instruction that can extract from either double word of >> the >> + two arguments. ELEMENT1 and ELEMENT2 are either NULL or they are 0/1 >> giving >> + which double word to be used for the operand. */ >> + >> +const char * >> +rs6000_emit_xxpermdi (rtx operands[], rtx element1, rtx element2) >> +{ >> + int op1_dword = (!element1) ? 0 : INTVAL (element1); >> + int op2_dword = (!element2) ? 0 : INTVAL (element2); >> + >> + gcc_assert (IN_RANGE (op1_dword | op2_dword, 0, 1)); >> + >> + if (BYTES_BIG_ENDIAN) >> +{ >> + operands[3] = GEN_INT (2*op1_dword + op2_dword); >> + return "xxpermdi %x0,%x1,%x2,%3"; >> +} >> + else >> +{ >> + if (element1) >> +op1_dword = 1 - op1_dword; >> + >> + if (element2) >> +op2_dword = 1 - op2_dword; >> + >> + operands[3] = GEN_INT (op1_dword + 2*op2_dword); >> + return "xxpermdi %x0,%x2,%x1,%3"; >> +} >> +} > > I think calling this with the rtx elementN args makes this only more > complicated (the function comment doesn't say what they are or what > NULL means, btw). > >> (define_insn "vsx_concat_" >> - [(set (match_operand:VSX_D 0 "gpc_reg_operand" "=,we") >> + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we") >> (vec_concat:VSX_D >> - (match_operand: 1 "gpc_reg_operand" ",b") >> - (match_operand: 2 "gpc_reg_operand" ",b")))] >> + (match_operand: 1 "gpc_reg_operand" "wa,b") >> + (match_operand: 2 "gpc_reg_operand" "wa,b")))] >> "VECTOR_MEM_VSX_P (mode)" >> { >> if (which_alternative == 0) >> -return (BYTES_BIG_ENDIAN >> -? "xxpermdi %x0,%x1,%x2,0" >> -: "xxpermdi %x0,%x2,%x1,0"); >> +return rs6000_emit_xxpermdi (operands, NULL_RTX, NULL_RTX); >> >> else if (which_alternative == 1) >> -return (BYTES_BIG_ENDIAN >> +return (VECTOR_ELT_ORDER_BIG >> ? "mtvsrdd %x0,%1,%2" >> : "mtvsrdd %x0,%2,%1"); > > This one could be > > { > if (!BYTES_BIG_ENDIAN) !VECTOR_ELT_ORDER_BIG (to accommodate -maltivec=be). (We have some general bitrot associated with -maltivec=be that needs to be addressed, or we need to give up on it altogether. Still of two minds about this.) Bill >std::swap (operands[1], operands[2]); > > switch (which_alternative) >{ >case 0: > return "xxpermdi %x0,%x1,%x2,0"; >case 1: > return "mtvsrdd %x0,%1,%2"; >default: > gcc_unreachable (); >} > } > > (Could/should we use xxmrghd instead? Do all supported assemblers know > that extended mnemonic, is it actually more readable?) 
> >> --- gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c >> (svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c) >> (revision 0) >> +++ gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c >> (.../gcc/testsuite/gcc.target/powerpc/vsx-extract-7.c) (revision 250640) >> @@ -0,0 +1,15 @@ >> +/* { dg-do compile { target { powerpc*-*-* } } } */ >> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ >> +/* { dg-require-effective-target powerpc_vsx_ok } */ >> +/* { dg-options "-O2 -mvsx" } */ >> + >> +vector double >> +test_vpasted (vector double high, vector double low) >> +{ >> + vector double res; >> + res[1] = high[1]; >> + res[0] = low[0]; >> + return res; >> +} >> + >> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */ > > In this and the other testcase, should you test no other insns at all > are generated? > > > Segher >
[PATCH] Fix PR81354 (rewrite gimple_split_edge)
Hi, PR81354 identifies a latent bug that can happen in SLSR since the conditional candidate support was first added. SLSR relies on the address of a GIMPLE PHI remaining constant during the course of the optimization pass, but it needs to split edges. The use of make_single_succ_edge and reinstall_phi_args in gimple_split_edge causes GIMPLE PHI statements to be temporarily expanded to add a predecessor, and then rebuilt to have the original number of predecessors. The expansion usually, if not always, causes the PHI statement to change address. Thus gimple_split_edge needs to be rewritten to perform in-situ replacement of PHI arguments. The required pieces of make_single_succ_edge have been extracted into two places: make_replacement_pred_edge, and some fixup code at the end of gimple_split_edge. The division is necessary because the destination of the original edge must remember its original predecessors for the switch processing in gimple_redirect_edge_and_branch_1 to work properly. The function gimple_redirect_edge_and_branch was factored into two pieces so that most of it can be used by gimple_split_edge without calling ssa_redirect_edge, which also interferes with PHIs. The useful bits of ssa_redirect_edge are factored out into the next three lines of gimple_split_edge. Similarly, redirect_eh_edge had already been similarly factored into redirect_eh_edge_1 and ssa_redirect_edge. I took advantage of that and exposed redirect_eh_edge_1 for use in gimple_redirect_edge_and_branch_1. I've added the test from PR81354 as a torture test, but as we've seen, small changes elsewhere in the optimizer can easily hide the problem. Bootstrapped and tested on powerpc64le-linux-gnu with no regressions. Is this ok for trunk? Eventually this needs to be backported to GCC 5, 6, and 7 if that's acceptable, since PR81354 was observed on gcc-5-branch. I haven't yet prepared the backports. Thanks, Bill [gcc] 2017-07-30 Bill Schmidt PR tree-optimization/81354 * tree-cfg.c (gimple_redirect_edge_and_branch_1): New decl. (reinstall_phi_args): Delete function. (make_replacement_pred_edge): New function. (gimple_split_edge): Rewrite. (gimple_redirect_edge_and_branch_1): New function, factored from... (gimple_redirect_edge_and_branch): ...here. (split_critical_edges): Don't re-split already split edges. * tree-eh.c (redirect_eh_edge_1): Make visible. * tree-eh.h (redirect_eh_edge_1): Likewise. [gcc/testsuite] 2017-07-30 Bill Schmidt PR tree-optimization/81354 * g++.dg/torture/pr81354.C: New file. Index: gcc/testsuite/g++.dg/torture/pr81354.C === --- gcc/testsuite/g++.dg/torture/pr81354.C (nonexistent) +++ gcc/testsuite/g++.dg/torture/pr81354.C (working copy) @@ -0,0 +1,24 @@ +// PR81354 reported this test as crashing in a limited range of revisions. 
+// { dg-do compile } + +struct T { double a; double b; }; + +void foo(T Ad[], int As[2]) +{ + int j; + int i; + int Bs[2] = {0,0}; + T Bd[16]; + + for (j = 0; j < 4; j++) { +for (i = 0; i + 1 <= j + 1; i++) { + Ad[i + As[0] * j] = Bd[i + Bs[0] * j]; +} + +i = j + 1; // <- comment out this line and it does not crash +for (; i + 1 < 5; i++) { + Ad[i + As[0] * j].a = 0.0; + Ad[i + As[0] * j].b = 0.0; +} + } +} Index: gcc/tree-cfg.c === --- gcc/tree-cfg.c (revision 250721) +++ gcc/tree-cfg.c (working copy) @@ -154,6 +154,7 @@ static void make_gimple_switch_edges (gswitch *, b static bool make_goto_expr_edges (basic_block); static void make_gimple_asm_edges (basic_block); static edge gimple_redirect_edge_and_branch (edge, basic_block); +static edge gimple_redirect_edge_and_branch_1 (edge, basic_block); static edge gimple_try_redirect_by_replacing_jump (edge, basic_block); /* Various helpers. */ @@ -2776,35 +2777,6 @@ last_and_only_stmt (basic_block bb) return NULL; } -/* Reinstall those PHI arguments queued in OLD_EDGE to NEW_EDGE. */ - -static void -reinstall_phi_args (edge new_edge, edge old_edge) -{ - edge_var_map *vm; - int i; - gphi_iterator phis; - - vec *v = redirect_edge_var_map_vector (old_edge); - if (!v) -return; - - for (i = 0, phis = gsi_start_phis (new_edge->dest); - v->iterate (i, &vm) && !gsi_end_p (phis); - i++, gsi_next (&phis)) -{ - gphi *phi = phis.phi (); - tree result = redirect_edge_var_map_result (vm); - tree arg = redirect_edge_var_map_def (vm); - - gcc_assert (result == gimple_phi_result (phi)); - - add_phi_arg (phi, arg, new_edge, redirect_edge_var_map_location (vm)); -} - - redirect_edge_var_map_clear (old_edge); -} - /* Returns the basic block after which the new basic block created by splitting edge EDGE_IN should be placed. Tries to keep the new block near it
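The core idea -- updating PHI arguments in their existing slots rather
than removing and re-adding them, so the gphi node is never reallocated --
can be sketched as follows (hypothetical code, example mine, not taken
from the patch; the helper name is invented):

/* Copy each PHI argument from the slot of the edge being split into
   the slot of its replacement edge, in place, without growing or
   reallocating any PHI node.  */
static void
copy_phi_args_in_place (edge old_edge, edge new_edge)
{
  for (gphi_iterator gsi = gsi_start_phis (new_edge->dest);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree arg = gimple_phi_arg_def (phi, old_edge->dest_idx);
      SET_PHI_ARG_DEF (phi, new_edge->dest_idx, arg);
    }
}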
[PATCH, i386]: Implement attribute ((naked))
Hello! attribute ((naked)) generates function body without function frame, and as shown in PR 25967 [1], users are looking for this feature also for x86 targets. Recently, Daniel introduced a testcase that would benefit from this attribute. Attached patch also adds a trap instruction (ud2) at the end of the naked function, where epilogue would be. If there is a path through the function that would reach epilogue, trap insn triggers, as shown in added naked-3.c test. As documented (many times) in the documentation, the only correct usage is with classic "asm" statement. The documentation is perhaps too pessimistic; with some caution, quite complex c code can be mixed with asm prologue and epilogue. YMMV, the warning is there. 2017-07-30 Uros Bizjak PR target/25967 * config/i386/i386.c (ix86_function_naked): New function. (ix86_can_use_return_insn_p): Return false for naked functions. (ix86_expand_prologue): Skip prologue for naked functions. (ix86_expand_epilogue): Skip epilogue for naked functions and emit trap instruction. (ix86_warn_func_return): New function. (ix86_attribute_table): Add "naked" attribute specification. (TARGET_WARN_FUNC_RETURN): Define. * doc/extend.texi (x86 Function Attributes) : Document it. testsuite/ChangeLog: 2017-07-30 Uros Bizjak PR target/25967 * gcc.target/i386/naked-1.c: New test. * gcc.target/i386/naked-2.c: Ditto. * gcc.target/i386/naked-3.c: Ditto. * gcc.target/x86_64/abi/ms-sysv/ms-sysv.c: Remove do_test_body0 stub function, use attribute "naked" instead. * gcc.dg/pr44290-1.c: Use naked_functions effective target. * gcc.dg/pr44290-2.c: Ditto. Patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25967 Uros. Index: config/i386/i386.c === --- config/i386/i386.c (revision 250721) +++ config/i386/i386.c (working copy) @@ -8748,6 +8748,15 @@ ix86_function_ms_hook_prologue (const_tree fn) return false; } +static bool +ix86_function_naked (const_tree fn) +{ + if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn))) +return true; + + return false; +} + /* Write the extra assembler code needed to declare a function properly. */ void @@ -12249,6 +12258,9 @@ ix86_can_use_return_insn_p (void) { struct ix86_frame frame; + if (ix86_function_naked (current_function_decl)) +return false; + /* Don't use `ret' instruction in interrupt handler. */ if (! reload_completed || frame_pointer_needed @@ -14327,6 +14339,9 @@ ix86_expand_prologue (void) bool sse_registers_saved; rtx static_chain = NULL_RTX; + if (ix86_function_naked (current_function_decl)) +return; + ix86_finalize_stack_realign_flags (); /* DRAP should not coexist with stack_realign_fp */ @@ -15184,6 +15199,13 @@ ix86_expand_epilogue (int style) bool using_drap; bool restore_stub_is_tail = false; + if (ix86_function_naked (current_function_decl)) +{ + /* The program should not reach this point. */ + emit_insn (gen_trap ()); + return; +} + ix86_finalize_stack_realign_flags (); frame = m->frame; @@ -31652,6 +31674,14 @@ ix86_trampoline_init (rtx m_tramp, tree fndecl, rt LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode); #endif } + +static bool +ix86_warn_func_return (tree decl) +{ + /* Naked functions are implemented entirely in assembly, including the + return sequence, so suppress warnings about this. */ + return !ix86_function_naked (decl); +} /* The following file contains several enumerations and data structures built from the definitions in i386-builtin-types.def. 
*/ @@ -46486,6 +46516,8 @@ static const struct attribute_spec ix86_attribute_ ix86_handle_interrupt_attribute, false }, { "no_caller_saved_registers", 0, 0, false, true, true, ix86_handle_no_caller_saved_registers_attribute, false }, + { "naked", 0, 0, true, false, false, +ix86_handle_fndecl_attribute, false }, /* End element. */ { NULL,0, 0, false, false, false, NULL, false } @@ -52722,6 +52754,9 @@ ix86_run_selftests (void) #undef TARGET_RETURN_POPS_ARGS #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args +#undef TARGET_WARN_FUNC_RETURN +#define TARGET_WARN_FUNC_RETURN ix86_warn_func_return + #undef TARGET_LEGITIMATE_COMBINED_INSN #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn Index: doc/extend.texi === --- doc/extend.texi (revision 250721) +++ doc/extend.texi (working copy) @@ -5370,6 +5370,17 @@ this function attribute to make GCC generate the ` prologue used in Win32 API functions in Microsoft Windows XP Service Pack 2 and newer. +@item naked +@cindex @code{naked} function attribute, x86 +This attribute allows the compiler to construct the +requisite functio
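As a usage sketch, mirroring the documentation's restriction to basic asm
(illustrative, not from the patch; real_entry is a hypothetical
assembly-level callee, and the asm is 64-bit only):

/* A naked function: GCC emits no prologue or epilogue, so the body
   must manage the frame and the return itself.  If control ran off
   the end, it would hit the trap (ud2) the patch places where the
   epilogue would have been.  */
extern void real_entry (void);

__attribute__ ((naked))
void
entry_stub (void)
{
  asm ("pushq\t%rbp\n\t"
       "movq\t%rsp, %rbp\n\t"
       "call\treal_entry\n\t"
       "popq\t%rbp\n\t"
       "ret");
}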
Re: [ patch, libfortran] Fix PR 81581
On 07/30/2017 11:55 AM, Thomas Koenig wrote:
> Hello world,
>
> the attached patch performs a runtime check for the dim arguments
> of transformational intrinsic functions.
>
> I debated with myself a bit whether I should make this depend on
> bounds checking, but I finally decided against it. The tests
> performed are extremely cheap, the values are still in registers,
> so any runtime overhead should be small. By contrast, loading
> something from an external structure is probably more expensive
> (but I didn't bother to actually check).
>
> While visiting the code, I also took the opportunity to change
> the rank and dim variables in the 'm' class of functions to index_type,
> to make their use consistent.
>
> Regarding the test case names, I was unable to resist.
>
> Regression-tested. OK for trunk?

Looks OK, thanks,

Jerry
[PATCH][RFA/RFC] Stack clash mitigation patch 00/08 - V3
OK, so about a week later than I wanted. Too many fires, not enough water.

The V3 patch has expanded a bit...

1. For constant-sized dynamic allocations we'll allocate/probe up to 4
   STACK_CLASH_PROTECTION_PROBE_INTERVAL regions inline and unrolled
   (see the sketch after this message).

2. For larger constant-sized dynamic allocations we rotate the loop,
   saving a compare/jump.

3. Blockage insns added to prevent scheduler reordering, particularly in
   the inline/unrolled loop case.

4. Generic code for dynamic allocations handles the case where the
   target makes optimistic assumptions about probing state in its
   prologue (i.e. aarch64).

5. PARAMs to control the assumed size of the guard and the probing
   interval. Both default to 4k. Note that the backends may not support
   all possible values for these PARAMs.

   a. The size of the guard helps determine how big a local static
      frame can be allocated without probing on targets that have an
      implicit probe in the caller.

   b. The interval determines how often we probe once we decide probing
      is required.

   c. Backends can override the default values. aarch64 for example
      overrides the guard size.

6. More aarch64 improvements based on discussions with Wilco, Richard
   and Ramana.

   a. Support for a probing interval > 4k.
   b. Assume guard of 64k, with 1k for outgoing arglist. Thus frames
      less than 63k require no probing.
   c. Fix missed probe for outgoing arguments.
   d. Add missing notes and barriers.
   e. Some aarch64-specific testcases for issues identified by Wilco
      and some of my own.
   f. Some simplifications based on invariants I was previously unaware
      of for aarch64 prologues.

7. Scheduler honors the stack probing notes and avoids breaking memory
   dependencies when it encounters them.

8. PPC port takes advantage of improved generic code for dynamic stack
   allocations.

9. Many s390 improvements from IBM.

10. Additional tests for the unrolled inline dynamic case, the rotated
    loop case, and use of a large guard value to avoid probing (x86 and
    ppc only).

Jeff
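The constant-sized dynamic cases in items 1 and 2 can be seen with a
small example (hypothetical; the option and param names are the ones
introduced in patch 01 of this series):

/* Compile with:
     gcc -O2 -fstack-clash-protection \
         --param stack-clash-protection-probe-interval=4096 -S t.c

   f1's 8kB request fits within 4 probe intervals, so it is allocated
   and probed inline/unrolled (item 1); f2's 64kB request instead uses
   the rotated probe loop (item 2).  */
extern void sink (void *);

void f1 (void) { sink (__builtin_alloca (8192)); }
void f2 (void) { sink (__builtin_alloca (65536)); }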
[PATCH][RFA/RFC] Stack clash mitigation patch 01/08 - V3
This patch introduces the stack clash protection options.

Changes since V2:

Adds two new params. The first controls the size of the guard area.
This controls the threshold for when a function prologue requires
probes. The second controls the probing interval -- ie, once probes
are needed, how often do we emit them.

These are really meant more for developers to experiment with than
users. Regardless, I did go ahead and document them.

It also adds some sanity checking WRT combining stack clash protection
with -fstack-check.

Jeff

	* common.opt (-fstack-clash-protection): New option.
	* flag-types.h (enum stack_check_type): Note difference between
	-fstack-check= and -fstack-clash-protection.
	* params.h (set_param_value): Verify stack-clash-protection
	params are powers of two.
	* params.def (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE): New PARAM.
	(PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL): Likewise.
	* toplev.c (process_options): Issue warnings/errors for cases
	not handled with -fstack-clash-protection.
	* doc/invoke.texi (-fstack-clash-protection): Document new option.
	(-fstack-check): Note additional problem with -fstack-check=generic.
	Note that -fstack-check is primarily for Ada and refer users to
	-fstack-clash-protection for stack-clash-protection.
	Document new params for stack clash protection.
	* toplev.c (process_options): Handle -fstack-clash-protection.

testsuite/

	* gcc.dg/stack-check-2.c: New test.
	* lib/target-supports.exp
	(check_effective_target_supports_stack_clash_protection): New
	function.
	(check_effective_target_frame_pointer_for_non_leaf): Likewise.
	(check_effective_target_caller_implicit_probes): Likewise.

diff --git a/gcc/common.opt b/gcc/common.opt
index e81165c..cfaf2bc 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2306,6 +2306,11 @@ fstack-check
 Common Alias(fstack-check=, specific, no)
 Insert stack checking code into the program.  Same as -fstack-check=specific.

+fstack-clash-protection
+Common Report Var(flag_stack_clash_protection)
+Insert code to probe each page of stack space as it is allocated to protect
+from stack-clash style attacks
+
 fstack-limit
 Common Var(common_deferred_options) Defer

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 3e5cee8..8095dc2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10128,6 +10128,20 @@ compilation without.  The value for compilation
 with profile feedback needs to be more conservative (higher) in order
 to make tracer effective.

+@item stack-clash-protection-guard-size
+The size (in bytes) of the stack guard.  Acceptable values are powers of 2
+between 4096 and 134217728; the default is 4096.  Higher values may reduce the
+number of explicit probes, but a value larger than the operating system
+provided guard will leave code vulnerable to stack clash style attacks.
+
+@item stack-clash-protection-probe-interval
+Stack clash protection involves probing stack space as it is allocated.  This
+param controls the maximum distance (in bytes) between probes into the stack.
+Acceptable values are powers of 2 between 4096 and 65536; the default is
+4096.  Higher values may reduce the number of explicit probes, but a value
+larger than the operating system provided guard will leave code vulnerable to
+stack clash style attacks.
+
 @item max-cse-path-length
 The maximum number of basic blocks on path that CSE considers.

@@ -11333,7 +11347,8 @@ target support in the compiler but comes with the
 following drawbacks:

 @enumerate
 @item
 Modified allocation strategy for large objects: they are always
-allocated dynamically if their size exceeds a fixed threshold.
+allocated dynamically if their size exceeds a fixed threshold.  Note this
+may change the semantics of some code.

 @item
 Fixed limit on the size of the static frame of functions: when it is

@@ -11348,6 +11363,25 @@ generic implementation, code performance is hampered.
 Note that old-style stack checking is also the fallback method for
 @samp{specific} if no target support has been added in the compiler.

+@samp{-fstack-check=} is designed for Ada's needs to detect infinite recursion
+and stack overflows.  @samp{specific} is an excellent choice when compiling
+Ada code.  It is not generally sufficient to protect against stack-clash
+attacks.  To protect against those you want @samp{-fstack-clash-protection}.
+
+@item -fstack-clash-protection
+@opindex fstack-clash-protection
+Generate code to prevent stack clash style attacks.  When this option is
+enabled, the compiler will only allocate one page of stack space at a time
+and each page is accessed immediately after allocation.  Thus, it prevents
+allocations from jumping over any stack guard page provided by the
+operating system.
+
+Most targets do not fully support stack clash protection.  However, on those target
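To make the new testsuite support concrete, here is a hypothetical test
in the style of the new stack-check tests; the effective-target keyword
is the one this patch adds, and the dump name assumes prologue expansion
happens in the pro_and_epilogue RTL pass (the exact scanned string
depends on the target's probing strategy):

/* { dg-do compile } */
/* { dg-options "-O2 -fstack-clash-protection -fdump-rtl-pro_and_epilogue" } */
/* { dg-require-effective-target supports_stack_clash_protection } */

/* A frame well beyond the default 4kB guard must be probed.  */
void
big_frame (void)
{
  volatile char buf[12288];
  buf[0] = 0;
}

/* { dg-final { scan-rtl-dump "Stack clash inline probes" "pro_and_epilogue" } } */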
[PATCH][RFA/RFC] Stack clash mitigation patch 02/08 - V3
This patch introduces generic mechanisms to protect the dynamically allocated stack space against stack-clash attacks. Changes since V2: Dynamic allocations can be emitted as unrolled inlined probes or with a rotated loop. Blockage insns are also properly emitted for the dynamic area probes and the dynamic area probing now supports targets that may make optimistic assumptions in their prologues. Finally it uses the new param to control the probing interval. Tests were updated to explicitly specify the guard and probing interval. New test to check inline/unrolled probes as well as rotated loop. * explow.c: Include "params.h". (anti_adjust_stack_and_probe_stack_clash): New function. (get_stack_check_protect): Likewise. (compute_stack_clash_protection_loop_data): Likewise. (emit_stack_clash_protection_loop_start): Likewise. (emit_stack_clash_protection_loop_end): Likewise. (allocate_dynamic_stack_space): Use get_stack_check_protect. Use anti_adjust_stack_and_probe_stack_clash. * explow.h (compute_stack_clash_protection_loop_data): Prototype. (emit_stack_clash_protection_loop_start): Likewise. (emit_stack_clash_protection_loop_end): Likewise. * rtl.h (get_stack_check_protect): Prototype. * defaults.h (STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE): Define new default. * doc/tm.texi.in (STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE): Define. * doc/tm.texi: Rebuilt. * config/aarch64/aarch64.c (aarch64_expand_prologue): Use get_stack_check_protect. * config/alpha/alpha.c (alpha_expand_prologue): Likewise. * config/arm/arm.c (arm_expand_prologue): Likewise. * config/i386/i386.c (ix86_expand_prologue): Likewise. * config/ia64/ia64.c (ia64_expand_prologue): Likewise. * config/mips/mips.c (mips_expand_prologue): Likewise. * config/powerpcspe/powerpcspe.c (rs6000_emit_prologue): Likewise. * config/rs6000/rs6000.c (rs6000_emit_prologue): Likewise. * config/sparc/sparc.c (sparc_expand_prologue): Likewise. testsuite * gcc.dg/stack-check-3.c: New test. 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index ef1b5a8..0a8b40a 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -3676,12 +3676,14 @@ aarch64_expand_prologue (void) { if (crtl->is_leaf && !cfun->calls_alloca) { - if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT) - aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, - frame_size - STACK_CHECK_PROTECT); + if (frame_size > PROBE_INTERVAL + && frame_size > get_stack_check_protect ()) + aarch64_emit_probe_stack_range (get_stack_check_protect (), + (frame_size +- get_stack_check_protect ())); } else if (frame_size > 0) - aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size); + aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size); } aarch64_sub_sp (IP0_REGNUM, initial_adjust, true); diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index 00a69c1..91f3d7c 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -7741,7 +7741,7 @@ alpha_expand_prologue (void) probed_size = frame_size; if (flag_stack_check) -probed_size += STACK_CHECK_PROTECT; +probed_size += get_stack_check_protect (); if (probed_size <= 32768) { diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index c6101ef..9822ca7 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -21680,13 +21680,13 @@ arm_expand_prologue (void) if (crtl->is_leaf && !cfun->calls_alloca) { - if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT) - arm_emit_probe_stack_range (STACK_CHECK_PROTECT, - size - STACK_CHECK_PROTECT, + if (size > PROBE_INTERVAL && size > get_stack_check_protect ()) + arm_emit_probe_stack_range (get_stack_check_protect (), + size - get_stack_check_protect (), regno, live_regs_mask); } else if (size > 0) - arm_emit_probe_stack_range (STACK_CHECK_PROTECT, size, + arm_emit_probe_stack_range (get_stack_check_protect (), size, regno, live_regs_mask); } @@ -27854,7 +27854,7 @@ arm_frame_pointer_required (void) { /* We don't have the final size of the frame so adjust. */ size += 32 * UNITS_PER_WORD; - if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT) + if (size > PROBE_INTERVAL && size > get_stack_check_protect ()) return true
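As a mental model of what emit_stack_clash_protection_loop_start/_end
wrap around the allocation, the probed dynamic allocation behaves
roughly like the following C model (illustration only; the real code
emits RTL and touches the page directly rather than calling anything):

/* The stack pointer never moves more than one probe interval past the
   last probed address before being probed again.  INTERVAL is a power
   of two.  */
static inline char *
probed_alloc_model (char *sp, unsigned long size, unsigned long interval)
{
  unsigned long rounded = size & -interval;	/* loop-allocated part */
  for (unsigned long done = 0; done < rounded; done += interval)
    {
      sp -= interval;			/* allocate one interval...  */
      *(volatile char *) sp = 0;	/* ...and probe it immediately.  */
    }
  sp -= size - rounded;			/* residual allocation */
  /* Targets whose prologues make optimistic assumptions emit one more
     probe here (STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE).  */
  return sp;
}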
[PATCH][RFA/RFC] Stack clash mitigation patch 02b/08 - V3
This patch adds checking of flag_stack_protection to various backends that will rely on partial protection against stack-clash via -fstack-check. I don't think this patch changed at all relative to V2. Jeff * config/alpha/alpha.c (alpha_expand_prologue): Also check flag_stack_clash_protection. * config/arm/arm.c (arm_compute_static_chain_stack_bytes): Likewise. (arm_expand_prologue, thumb1_expand_prologue): Likewise. (arm_frame_pointer_required): Likewise. * config/ia64/ia64.c (ia64_compute_frame_size): Likewise. (ia64_expand_prologue): Likewise. * config/mips/mips.c (mips_expand_prologue): Likewise. * config/powerpcspe/powerpcspe.c (rs6000_expand_prologue): Likewise. * config/sparc/sparc.c (sparc_expand_prologue): Likewise. (sparc_flat_expand_prologue): Likewise. * config/spu/spu.c (spu_expand_prologue): Likewise. diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index 91f3d7c..36e78a0 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -7740,7 +7740,7 @@ alpha_expand_prologue (void) Note that we are only allowed to adjust sp once in the prologue. */ probed_size = frame_size; - if (flag_stack_check) + if (flag_stack_check || flag_stack_clash_protection) probed_size += get_stack_check_protect (); if (probed_size <= 32768) @@ -7755,7 +7755,7 @@ alpha_expand_prologue (void) /* We only have to do this probe if we aren't saving registers or if we are probing beyond the frame because of -fstack-check. */ if ((sa_size == 0 && probed_size > probed - 4096) - || flag_stack_check) + || flag_stack_check || flag_stack_clash_protection) emit_insn (gen_probe_stack (GEN_INT (-probed_size))); } @@ -7785,7 +7785,8 @@ alpha_expand_prologue (void) late in the compilation, generate the loop as a single insn. */ emit_insn (gen_prologue_stack_probe_loop (count, ptr)); - if ((leftover > 4096 && sa_size == 0) || flag_stack_check) + if ((leftover > 4096 && sa_size == 0) + || flag_stack_check || flag_stack_clash_protection) { rtx last = gen_rtx_MEM (DImode, plus_constant (Pmode, ptr, -leftover)); @@ -7793,7 +7794,7 @@ alpha_expand_prologue (void) emit_move_insn (last, const0_rtx); } - if (flag_stack_check) + if (flag_stack_check || flag_stack_clash_protection) { /* If -fstack-check is specified we have to load the entire constant into a register and subtract from the sp in one go, diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 9822ca7..4a93767 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -19151,7 +19151,8 @@ arm_compute_static_chain_stack_bytes (void) /* See the defining assertion in arm_expand_prologue. */ if (IS_NESTED (arm_current_func_type ()) && ((TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM) - || (flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || ((flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || flag_stack_clash_protection) && !df_regs_ever_live_p (LR_REGNUM))) && arm_r3_live_at_start_p () && crtl->args.pretend_args_size == 0) @@ -21453,7 +21454,8 @@ arm_expand_prologue (void) clobbered when creating the frame, we need to save and restore it. */ clobber_ip = IS_NESTED (func_type) && ((TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM) - || (flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || ((flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || flag_stack_clash_protection) && !df_regs_ever_live_p (LR_REGNUM) && arm_r3_live_at_start_p ())); @@ -21667,7 +21669,8 @@ arm_expand_prologue (void) stack checking. 
We use IP as the first scratch register, except for the non-APCS nested functions if LR or r3 are available (see clobber_ip). */ if (!IS_INTERRUPT (func_type) - && flag_stack_check == STATIC_BUILTIN_STACK_CHECK) + && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || flag_stack_clash_protection)) { unsigned int regno; @@ -24959,7 +24962,9 @@ thumb1_expand_prologue (void) current_function_static_stack_size = size; /* If we have a frame, then do stack checking. FIXME: not implemented. */ - if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK && size) + if ((flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || flag_stack_clash_protection) + && size) sorry ("-fstack-check=specific for Thumb-1"); amount = offsets->outgoing_args - offsets->saved_regs; @@ -27839,7 +27844,8 @@ arm_frame_pointer_required (void) instruction prior to the stack adjustment and this requires a frame pointer if
[PATCH][RFA/RFC] Stack clash mitigation patch 03/08 - V3
This patch introduces some routines for logging of stack clash
protection actions.

I don't think this patch changed at all relative to V2.

Jeff

	* function.c (dump_stack_clash_frame_info): New function.
	* function.h (dump_stack_clash_frame_info): Prototype.
	(enum stack_clash_probes): New enum.

diff --git a/gcc/function.c b/gcc/function.c
index f625489..ca48b3f 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -5695,6 +5695,58 @@ get_arg_pointer_save_area (void)
   return ret;
 }

+
+/* If debugging dumps are requested, dump information about how the
+   target handled -fstack-clash-protection for the prologue.
+
+   PROBES describes what if any probes were emitted.
+
+   RESIDUALS indicates if the prologue had any residual allocation
+   (i.e. total allocation was not a multiple of PROBE_INTERVAL).  */
+
+void
+dump_stack_clash_frame_info (enum stack_clash_probes probes, bool residuals)
+{
+  if (!dump_file)
+    return;
+
+  switch (probes)
+    {
+    case NO_PROBE_NO_FRAME:
+      fprintf (dump_file,
+	       "Stack clash no probe no stack adjustment in prologue.\n");
+      break;
+    case NO_PROBE_SMALL_FRAME:
+      fprintf (dump_file,
+	       "Stack clash no probe small stack adjustment in prologue.\n");
+      break;
+    case PROBE_INLINE:
+      fprintf (dump_file, "Stack clash inline probes in prologue.\n");
+      break;
+    case PROBE_LOOP:
+      fprintf (dump_file, "Stack clash probe loop in prologue.\n");
+      break;
+    }
+
+  if (residuals)
+    fprintf (dump_file, "Stack clash residual allocation in prologue.\n");
+  else
+    fprintf (dump_file, "Stack clash no residual allocation in prologue.\n");
+
+  if (frame_pointer_needed)
+    fprintf (dump_file, "Stack clash frame pointer needed.\n");
+  else
+    fprintf (dump_file, "Stack clash no frame pointer needed.\n");
+
+  if (TREE_THIS_VOLATILE (cfun->decl))
+    fprintf (dump_file,
+	     "Stack clash noreturn prologue, assuming no implicit"
+	     " probes in caller.\n");
+  else
+    fprintf (dump_file,
+	     "Stack clash not noreturn prologue.\n");
+}
+
 /* Add a list of INSNS to the hash HASHP, possibly allocating HASHP
    for the first time.  */

diff --git a/gcc/function.h b/gcc/function.h
index 0f34bcd..87dac80 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -553,6 +553,14 @@ do {						\
   ((TARGET_PTRMEMFUNC_VBIT_LOCATION == ptrmemfunc_vbit_in_pfn)	\
    ? MAX (FUNCTION_BOUNDARY, 2 * BITS_PER_UNIT) : FUNCTION_BOUNDARY)

+enum stack_clash_probes {
+  NO_PROBE_NO_FRAME,
+  NO_PROBE_SMALL_FRAME,
+  PROBE_INLINE,
+  PROBE_LOOP
+};
+
+extern void dump_stack_clash_frame_info (enum stack_clash_probes, bool);
+
 extern void push_function_context (void);
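The strings emitted above are what the testsuite scans for; a
hypothetical scan in a stack-clash test would look like this, assuming
prologue expansion is dumped by the pro_and_epilogue RTL pass:

/* { dg-options "-O2 -fstack-clash-protection -fdump-rtl-pro_and_epilogue" } */
/* { dg-final { scan-rtl-dump "Stack clash inline probes" "pro_and_epilogue" } } */
/* { dg-final { scan-rtl-dump "Stack clash no residual allocation" "pro_and_epilogue" } } */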
[PATCH][RFA/RFC] Stack clash mitigation patch 04/08 - V3
This patch adds stack clash protection to the x86 port and provides the first set of tests that will be used by the other ports in the future. Changes since V2: This uses the new PARAMs to determine if probing is needed and how often to probe once probing is needed. New tests were added to verify behavior of the new flags. Old tests explicitly set the guard size and probing interval Jeff * config/i386/i386.c (ix86_adjust_stack_and_probe_stack_clash): New. (ix86_expand_prologue): Dump stack clash info as needed. Call ix86_adjust_stack_and_probe_stack_clash as needed. testsuite/ * gcc.dg/stack-check-4.c: New test. * gcc.dg/stack-check-5.c: New test. * gcc.dg/stack-check-6.c: New test. * gcc.dg/stack-check-6a.c: New test. * gcc.dg/stack-check-7.c: New test. * gcc.dg/stack-check-8.c: New test. * gcc.dg/stack-check-9.c: New test. * gcc.dg/stack-check-10.c: New test. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 0947b3c..b4ddc83 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -13779,6 +13779,142 @@ release_scratch_register_on_entry (struct scratch_reg *sr) #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP) +/* Emit code to adjust the stack pointer by SIZE bytes while probing it. + + This differs from the next routine in that it tries hard to prevent + attacks that jump the stack guard. Thus it is never allowed to allocate + more than PROBE_INTERVAL bytes of stack space without a suitable + probe. */ + +static void +ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size) +{ + struct machine_function *m = cfun->machine; + + /* If this function does not statically allocate stack space, then + no probes are needed. */ + if (!size) +{ + dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); + return; +} + + /* If we are a noreturn function, then we have to consider the + possibility that we're called via a jump rather than a call. + + Thus we don't have the implicit probe generated by saving the + return address into the stack at the call. Thus, the stack + pointer could be anywhere in the guard page. The safe thing + to do is emit a probe now. + + ?!? This should be revamped to work like aarch64 and s390 where + we track the offset from the most recent probe. Normally that + offset would be zero. For a non-return function we would reset + it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then + we just probe when we cross PROBE_INTERVAL. */ + if (TREE_THIS_VOLATILE (cfun->decl)) +emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, +-GET_MODE_SIZE (word_mode))); + + /* If we allocate less than the size of the guard statically, + then no probing is necessary, but we do need to allocate + the stack. */ + if (size < PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)) +{ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +GEN_INT (-size), -1, +m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); + return; +} + + /* We're allocating a large enough stack frame that we need to + emit probes. Either emit them inline or in a loop depending + on the size. */ + HOST_WIDE_INT probe_interval += PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL); + if (size <= 4 * probe_interval) +{ + HOST_WIDE_INT i; + for (i = probe_interval; i <= size; i += probe_interval) + { + /* Allocate PROBE_INTERVAL bytes. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +GEN_INT (-probe_interval), -1, +m->fs.cfa_reg == stack_pointer_rtx); + + /* And probe at *sp. 
*/ + emit_stack_probe (stack_pointer_rtx); + } + + /* We need to allocate space for the residual, but we do not need +to probe the residual. */ + HOST_WIDE_INT residual = (i - probe_interval - size); + if (residual) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (residual), -1, + m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); +} + else +{ + struct scratch_reg sr; + get_scratch_register_on_entry (&sr); + + /* Step 1: round SIZE down to a multiple of the interval. */ + HOST_WIDE_INT rounded_size = size & -probe_interval; + + /* Step 2: compute final value of the loop counter. Use lea if +possible. */ + rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size); + rtx insn; + if (address_no_seg_operand (addr, Pmode)) +
[PATCH][RFA/RFC] Stack clash mitigation patch 05/08 - V3
This patch catches a couple of places where the generic parts of the
compiler would transform prologue probing sequences in unsafe ways.

Changes since V2:

Testing for aarch64 showed the scheduler would perform transformations
similar to combine-stack-adjustments via its dependency breaking
transformations. So the scheduler was taught to not break dependencies
when the increment insn contains a STACK_CHECK note. A test for this
specific issue shows up in the aarch64 tests in the next patch.

	* combine-stack-adj.c (combine_stack_adjustments_for_block): Do
	nothing for stack adjustments with REG_STACK_CHECK.
	* sched-deps.c (parse_add_or_inc): Reject insns with
	REG_STACK_CHECK from dependency breaking.
	* config/i386/i386.c (pro_epilogue_adjust_stack): Return insn.
	(ix86_adjust_stack_and_probe_stack_clash): Add REG_STACK_CHECK
	notes.
	* reg-notes.def (STACK_CHECK): New note.

testsuite/

	* gcc.target/i386/stack-check-11.c: New test.

diff --git a/gcc/combine-stack-adj.c b/gcc/combine-stack-adj.c
index 9ec14a3..82d6dba 100644
--- a/gcc/combine-stack-adj.c
+++ b/gcc/combine-stack-adj.c
@@ -508,6 +508,8 @@ combine_stack_adjustments_for_block (basic_block bb)
 	continue;

       set = single_set_for_csa (insn);
+      if (set && find_reg_note (insn, REG_STACK_CHECK, NULL_RTX))
+	set = NULL_RTX;
       if (set)
 	{
 	  rtx dest = SET_DEST (set);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b4ddc83..69fecb9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -13405,7 +13405,7 @@ ix86_add_queued_cfa_restore_notes (rtx insn)
    zero if %r11 register is live and cannot be freely used and positive
    otherwise.  */

-static void
+static rtx
 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
			   int style, bool set_cfa)
 {
@@ -13496,6 +13496,7 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
       m->fs.sp_valid = valid;
       m->fs.sp_realigned = realigned;
     }
+  return insn;
 }

 /* Find an available register to be used as dynamic realign argument
@@ -13839,9 +13840,11 @@ ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
       for (i = probe_interval; i <= size; i += probe_interval)
 	{
 	  /* Allocate PROBE_INTERVAL bytes.  */
-	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
-				     GEN_INT (-probe_interval), -1,
-				     m->fs.cfa_reg == stack_pointer_rtx);
+	  rtx insn
+	    = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+					 GEN_INT (-probe_interval), -1,
+					 m->fs.cfa_reg == stack_pointer_rtx);
+	  add_reg_note (insn, REG_STACK_CHECK, const0_rtx);

 	  /* And probe at *sp.  */
 	  emit_stack_probe (stack_pointer_rtx);
diff --git a/gcc/reg-notes.def b/gcc/reg-notes.def
index 8734d26..18cf7e3 100644
--- a/gcc/reg-notes.def
+++ b/gcc/reg-notes.def
@@ -223,6 +223,10 @@ REG_NOTE (ARGS_SIZE)
    pseudo reg.  */
 REG_NOTE (RETURNED)

+/* Indicates the instruction is a stack check probe that should not
+   be combined with other stack adjustments.  */
+REG_NOTE (STACK_CHECK)
+
 /* Used to mark a call with the function decl called by the call.
    The decl might not be available in the call due to splitting of the call
    insn.  This note is a SYMBOL_REF.  */
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index 4c459e6..a661426 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -4713,6 +4713,11 @@ parse_add_or_inc (struct mem_inc_info *mii, rtx_insn *insn, bool before_mem)
   if (RTX_FRAME_RELATED_P (insn) || !pat)
     return false;

+  /* Do not allow breaking data dependencies for insns that are marked
+     with REG_STACK_CHECK.  */
+  if (find_reg_note (insn, REG_STACK_CHECK, NULL))
+    return false;
+
   /* Result must be single reg.  */
   if (!REG_P (SET_DEST (pat)))
     return false;

diff --git a/gcc/testsuite/gcc.target/i386/stack-check-11.c b/gcc/testsuite/gcc.target/i386/stack-check-11.c
new file mode 100644
index 000..183103f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/stack-check-11.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+extern void arf (unsigned long int *, unsigned long int *);
+void
+frob ()
+{
+  unsigned long int num[859];
+  unsigned long int den[859];
+  arf (den, num);
+}
+
+/* { dg-final { scan-assembler-times "subq" 4 } } */
+/* { dg-final { scan-assembler-times "orq" 3 } } */
[PATCH][RFA/RFC] Stack clash mitigation patch 08/08 - V3
These are the s390 patches for stack clash mitigation. Changes since V2: Incorporate changes from IBM to address missing functionality. Hopefully no longer regresses Ada as we no longer define STACK_CHECK_STATIC_BUILTIN. Jeff * config/s390/s390.c (MIN_UNROLL_PROBES): Define. (allocate_stack_space): New function, partially extracted from s390_emit_prologue. (s390_emit_prologue): Track offset to most recent stack probe. Code to allocate space moved into allocate_stack_space. Dump actions when no stack is allocated. (s390_prologue_plus_offset): New function. (s390_emit_stack_probe): Likewise. testsuite/ * gcc.dg/stack-check-5.c: Add argument for s390. diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 958ee3b..7d2481e 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -10999,6 +10999,178 @@ pass_s390_early_mach::execute (function *fun) } // anon namespace +/* Calculate TARGET = REG + OFFSET as s390_emit_prologue would do it. + - push too big immediates to the literal pool and annotate the refs + - emit frame related notes for stack pointer changes. */ + +static rtx +s390_prologue_plus_offset (rtx target, rtx reg, rtx offset, bool frame_related_p) +{ + rtx insn; + rtx orig_offset = offset; + + gcc_assert (REG_P (target)); + gcc_assert (REG_P (reg)); + gcc_assert (CONST_INT_P (offset)); + + if (offset == const0_rtx) /* lr/lgr */ +{ + insn = emit_move_insn (target, reg); +} + else if (DISP_IN_RANGE (INTVAL (offset))) /* la */ +{ + insn = emit_move_insn (target, gen_rtx_PLUS (Pmode, reg, + offset)); +} + else +{ + if (!satisfies_constraint_K (offset) /* ahi/aghi */ + && (!TARGET_EXTIMM + || (!satisfies_constraint_Op (offset) /* alfi/algfi */ + && !satisfies_constraint_On (offset)))) /* slfi/slgfi */ + offset = force_const_mem (Pmode, offset); + + if (target != reg) + { + insn = emit_move_insn (target, reg); + RTX_FRAME_RELATED_P (insn) = frame_related_p ? 1 : 0; + } + + insn = emit_insn (gen_add2_insn (target, offset)); + + if (!CONST_INT_P (offset)) + { + annotate_constant_pool_refs (&PATTERN (insn)); + + if (frame_related_p) + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (target, + gen_rtx_PLUS (Pmode, target, +orig_offset))); + } +} + + RTX_FRAME_RELATED_P (insn) = frame_related_p ? 1 : 0; + + return insn; +} + +/* Emit a compare instruction with a volatile memory access as stack + probe. It does not waste store tags and does not clobber any + registers apart from the condition code. */ +static void +s390_emit_stack_probe (rtx addr) +{ + rtx tmp = gen_rtx_MEM (Pmode, addr); + MEM_VOLATILE_P (tmp) = 1; + s390_emit_compare (EQ, gen_rtx_REG (Pmode, 0), tmp); +} + +/* Use a runtime loop if we have to emit more probes than this. */ +#define MIN_UNROLL_PROBES 3 + +/* Allocate SIZE bytes of stack space, using TEMP_REG as a temporary + if necessary. LAST_PROBE_OFFSET contains the offset of the closest + probe relative to the stack pointer. + + Note that SIZE is negative. + + The return value is true if TEMP_REG has been clobbered.
*/ +static bool +allocate_stack_space (rtx size, HOST_WIDE_INT last_probe_offset, + rtx temp_reg) +{ + bool temp_reg_clobbered_p = false; + HOST_WIDE_INT probe_interval += PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL); + HOST_WIDE_INT guard_size += PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); + + if (flag_stack_clash_protection) +{ + if (last_probe_offset + -INTVAL (size) < guard_size) + dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); + else + { + rtx offset = GEN_INT (probe_interval - UNITS_PER_LONG); + HOST_WIDE_INT rounded_size = -INTVAL (size) & -probe_interval; + HOST_WIDE_INT num_probes = rounded_size / probe_interval; + HOST_WIDE_INT residual = -INTVAL (size) - rounded_size; + + if (num_probes < MIN_UNROLL_PROBES) + { + /* Emit unrolled probe statements. */ + + for (unsigned int i = 0; i < num_probes; i++) + { + s390_prologue_plus_offset (stack_pointer_rtx, +stack_pointer_rtx, +GEN_INT (-probe_interval), true); + s390_emit_stack_probe (gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + offset)); + } + dump_stack_
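The unroll-versus-loop decision above is easy to sanity-check by running the size arithmetic by hand. Below is a standalone sketch of the same rounding, assuming the 4kB probe interval default and the MIN_UNROLL_PROBES value from the patch; the real code also compares the frame against the guard size first and works with a negated SIZE, both simplified away here, and the frame sizes are invented:

#include <stdio.h>

#define PROBE_INTERVAL 4096     /* assumed param default */
#define MIN_UNROLL_PROBES 3     /* from the patch */

int
main (void)
{
  long sizes[] = { 2000, 9000, 40000 };   /* made-up frame sizes */

  for (int i = 0; i < 3; i++)
    {
      long size = sizes[i];
      /* Same rounding as allocate_stack_space.  */
      long rounded_size = size & -PROBE_INTERVAL;
      long num_probes = rounded_size / PROBE_INTERVAL;
      long residual = size - rounded_size;

      printf ("size=%ld rounded=%ld probes=%ld residual=%ld -> %s\n",
              size, rounded_size, num_probes, residual,
              num_probes < MIN_UNROLL_PROBES
              ? "unrolled probes" : "runtime loop");
    }
  return 0;
}

For 40000 bytes this yields nine full probe intervals plus a 3136-byte residual, so the code drops into the runtime loop rather than emitting nine inline probes.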
[RFA/RFC] Stack clash mitigation patch 07/08 - V3
This patch adds aarch64 support for stack clash mitigation. Changes since V2: Uses new PARAMS to control the guard size and probing interval. Uses a 64k guard by default and a 1k outgoing args size. Simplified code slightly as a result of "can't happen" cases. Probe the last allocated word in the outgoing argument space if >1kb of outgoing argument space was allocated. Defines STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE to force generic handling of dynamic area to probe its last word when necessary. Epilogue doesn't assume IP0/IP1 contain usable values for -fstack-clash-protection. Adds some aarch64 specific tests for problems found by Wilco and myself. Jeff * config/aarch64/aarch64.c (aarch64_output_probe_stack_range): Handle -fstack-clash-protection probing too. (aarch64_allocate_and_probe_stack_space): New function. (aarch64_expand_prologue): Assert we never have both an initial adjustment and callee save adjustment. Dump as needed when -fstack-clash-protection is enabled. For an initial or final adjustment that is large enough, call aarch64_allocate_and_probe_stack_space. For final adjustment that is large enough, also emit a trailing probe. (aarch64_expand_epilogue): Do not assume the prologue has set IP1_REGNUM or IP0_REGNUM when -fstack-clash-protection is active. * config/aarch64/aarch64.h (STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE): Define. * testsuite/gcc.target/aarch64/stack-check-12.c: New test. * testsuite/gcc.target/aarch64/stack-check-13.c: New test. * testsuite/gcc.target/aarch64/stack-check-14.c: New test. * testsuite/gcc.target/aarch64/stack-check-15.c: New test. diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0a8b40a..7d57299 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -2830,6 +2830,9 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2) char loop_lab[32]; rtx xops[2]; + if (flag_stack_clash_protection) +reg1 = stack_pointer_rtx; + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); /* Loop. */ @@ -2841,7 +2844,14 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2) output_asm_insn ("sub\t%0, %0, %1", xops); /* Probe at TEST_ADDR. */ - output_asm_insn ("str\txzr, [%0]", xops); + if (flag_stack_clash_protection) +{ + gcc_assert (xops[0] == stack_pointer_rtx); + xops[1] = GEN_INT (PROBE_INTERVAL - 8); + output_asm_insn ("str\txzr, [%0, %1]", xops); +} + else +output_asm_insn ("str\txzr, [%0]", xops); /* Test if TEST_ADDR == LAST_ADDR. */ xops[1] = reg2; @@ -3605,6 +3615,96 @@ aarch64_set_handled_components (sbitmap components) cfun->machine->reg_is_wrapped_separately[regno] = true; } +/* Allocate SIZE bytes of stack space using SCRATCH_REG as a scratch + register. */ + +static void +aarch64_allocate_and_probe_stack_space (int scratchreg, HOST_WIDE_INT size) +{ + HOST_WIDE_INT probe_interval += PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL); + HOST_WIDE_INT guard_size += PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); + HOST_WIDE_INT guard_used_by_caller = 1024; + + /* SIZE should be large enough to require probing here. ie, it + must be larger than GUARD_SIZE - GUARD_USED_BY_CALLER. + + We can allocate GUARD_SIZE - GUARD_USED_BY_CALLER as a single chunk + without any probing.
*/ + gcc_assert (size >= guard_size - guard_used_by_caller); + aarch64_sub_sp (scratchreg, guard_size - guard_used_by_caller, true); + size -= (guard_size - guard_used_by_caller); + + HOST_WIDE_INT rounded_size = size & -probe_interval; + HOST_WIDE_INT residual = size - rounded_size; + + /* We can handle a small number of allocations/probes inline. Otherwise + punt to a loop. */ + if (rounded_size && rounded_size <= 4 * probe_interval) +{ + /* We don't use aarch64_sub_sp here because we don't want to +repeatedly load SCRATCHREG. */ + rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg); + if (probe_interval > ARITH_FACTOR) + emit_move_insn (scratch_rtx, GEN_INT (-probe_interval)); + else + scratch_rtx = GEN_INT (-probe_interval); + + for (HOST_WIDE_INT i = 0; i < rounded_size; i += probe_interval) + { + rtx_insn *insn = emit_insn (gen_add2_insn (stack_pointer_rtx, +scratch_rtx)); + add_reg_note (insn, REG_STACK_CHECK, const0_rtx); + + if (probe_interval > ARITH_FACTOR) + { + RTX_FRAME_RELATED_P (insn) = 1; + rtx adj = plus_constant (Pmode, stack_pointer_rtx, rounded_size); + add_reg_note (insn, REG_CFA_ADJUST_CFA, + gen_rtx_SET (stack_pointer_rtx, adj)); + } + + emit_stack_probe (plus_constant (Pmode, stac
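To make the bookkeeping concrete: the first GUARD_SIZE - GUARD_USED_BY_CALLER bytes can always be allocated unprobed (any touch of that region still lands in the guard), and only the remainder is rounded and probed. Here is a standalone walk-through of the same computation, using the 64k guard and 1k caller-reserved region from the cover text; the 4kB probe interval and the 80kB frame are assumptions made up for the example:

#include <stdio.h>

int
main (void)
{
  long guard_size = 64 * 1024;          /* default per cover text */
  long guard_used_by_caller = 1024;     /* default per cover text */
  long probe_interval = 4096;           /* assumed param default */
  long size = 80 * 1024;                /* hypothetical frame */

  /* First chunk needs no probe: the guard pages catch any touch.  */
  long unprobed = guard_size - guard_used_by_caller;
  size -= unprobed;

  /* Remainder is rounded to the probe interval, as in
     aarch64_allocate_and_probe_stack_space.  */
  long rounded_size = size & -probe_interval;
  long residual = size - rounded_size;

  printf ("unprobed=%ld rounded=%ld residual=%ld -> %s\n",
          unprobed, rounded_size, residual,
          (rounded_size && rounded_size <= 4 * probe_interval)
          ? "inline probes" : "probe loop");
  return 0;
}

With these numbers the 80kB frame leaves 16kB to probe after the unprobed chunk, exactly the 4 * probe_interval inline threshold, so the probes are emitted inline rather than as a loop.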
[PATCH][RFA/RFC] Stack clash mitigation patch 06/08 - V3
This contains the PPC bits for stack clash protection. Changes since V2: Exploits inlined/unrolled probes and rotated loops for the dynamic area. Some trivial simplifications. It also uses the new params to control if probes are needed and how often to probe. Jeff * config/rs6000/rs6000-protos.h (output_probe_stack_range): Update prototype for new argument. * config/rs6000/rs6000.c (wrap_frame_mem): New function extracted from rs6000_emit_allocate_stack. (handle_residual): New function. (rs6000_emit_probe_stack_range_stack_clash): New function. (rs6000_emit_allocate_stack): Use wrap_frame_mem. Call rs6000_emit_probe_stack_range_stack_clash as needed. (rs6000_emit_probe_stack_range): Add additional argument to call to gen_probe_stack_range{si,di}. (output_probe_stack_range): New. (output_probe_stack_range_1): Renamed from output_probe_stack_range. (output_probe_stack_range_stack_clash): New. (rs6000_emit_prologue): Emit notes into dump file as requested. * rs6000.md (allocate_stack): Handle -fstack-clash-protection. (probe_stack_range): Operand 0 is now early-clobbered. Add additional operand and pass it to output_probe_stack_range. diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index aeec9b2..451c442 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -134,7 +134,7 @@ extern void rs6000_emit_sISEL (machine_mode, rtx[]); extern void rs6000_emit_sCOND (machine_mode, rtx[]); extern void rs6000_emit_cbranch (machine_mode, rtx[]); extern char * output_cbranch (rtx, const char *, int, rtx_insn *); -extern const char * output_probe_stack_range (rtx, rtx); +extern const char * output_probe_stack_range (rtx, rtx, rtx); extern void rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg); extern bool rs6000_emit_set_const (rtx, rtx); extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index aa70e30..7936451 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -25618,6 +25618,211 @@ rs6000_emit_stack_tie (rtx fp, bool hard_frame_needed) emit_insn (gen_stack_tie (gen_rtx_PARALLEL (VOIDmode, p))); } +/* INSN allocates SIZE bytes on the stack (STACK_REG) using a store + with update style insn. + + Set INSN's alias set/attributes and add suitable flags and notes + for the dwarf CFI machinery. */ +static void +wrap_frame_mem (rtx insn, rtx stack_reg, HOST_WIDE_INT size) +{ + rtx par = PATTERN (insn); + gcc_assert (GET_CODE (par) == PARALLEL); + rtx set = XVECEXP (par, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx mem = SET_DEST (set); + gcc_assert (MEM_P (mem)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, get_frame_alias_set ()); + + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (stack_reg, gen_rtx_PLUS (Pmode, stack_reg, + GEN_INT (-size)))); +} + +/* Allocate ORIG_SIZE - ROUNDED_SIZE bytes on the stack, storing + ORIG_SP into *sp after the allocation. + + ROUNDED_SIZE will be a multiple of + STACK_CLASH_PROTECTION_PROBE_INTERVAL and ORIG_SIZE - ROUNDED_SIZE + will be less than STACK_CLASH_PROTECTION_PROBE_INTERVAL. + + Return the insn that allocates the residual space. */ +static rtx_insn * +handle_residual (HOST_WIDE_INT orig_size, +HOST_WIDE_INT rounded_size, +rtx orig_sp) +{ + /* Allocate (and implicitly probe) any residual space.
*/ + HOST_WIDE_INT residual = orig_size - rounded_size; + rtx_insn *insn; + + if (Pmode == SImode) +insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (-residual), + orig_sp)); + else +insn = emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx, +stack_pointer_rtx, +GEN_INT (-residual), +orig_sp)); + wrap_frame_mem (insn, stack_pointer_rtx, residual); + return insn; +} + +/* Allocate ORIG_SIZE bytes on the stack and probe the newly + allocated space every STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes. + + COPY_REG, if non-null, should contain a copy of the original + stack pointer modified by COPY_OFF at exit from this function. + + This is subtly different than the Ada probing in that it tries hard to + prevent attacks that jump the stack guard. Thus it is never allowed to + allocate more than STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes of stack + space without a suitable probe. */ +static rtx_insn * +rs6000_emit_probe_stack_range_stack_clash
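Why a store-with-update allocation needs no separate probe is worth spelling out: a stdu-style insn writes a word at the *new* stack pointer as part of moving it, so every allocation step is itself a probe, and the code above only has to keep each step at or under the probe interval. A standalone model of that invariant follows; the sizes are invented, and the real sequence is of course emitted as RTL, not C:

#include <assert.h>
#include <stdio.h>

#define PROBE_INTERVAL 4096   /* assumed param default */

int
main (void)
{
  long orig_size = 10000;                           /* made-up frame */
  long rounded_size = orig_size & -PROBE_INTERVAL;  /* 8192 */
  long residual = orig_size - rounded_size;         /* 1808 */
  long sp = 0;

  /* Each stdu-style step both moves SP and stores at the new SP,
     so the step size doubles as the probe distance.  */
  for (long alloc = 0; alloc < rounded_size; alloc += PROBE_INTERVAL)
    {
      sp -= PROBE_INTERVAL;
      printf ("allocate+probe at sp=%ld\n", sp);
    }

  /* handle_residual: the final, smaller step.  */
  assert (residual < PROBE_INTERVAL);
  sp -= residual;
  printf ("allocate+probe at sp=%ld\n", sp);
  return 0;
}

This is the sense in which the cover text says the PPC scheme "tries hard to prevent attacks that jump the stack guard": no single step ever exceeds the probe interval, so there is no unprobed window larger than one guard page.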
[PATCH v2] Dump BB number when dumping a BB with label.
On 07/28/2017 01:21 PM, Richard Biener wrote: > On Fri, Jul 28, 2017 at 12:52 PM, Martin Liška wrote: >> On 07/28/2017 09:58 AM, Richard Biener wrote: >>> On Fri, Jul 28, 2017 at 9:50 AM, Martin Liška wrote: On 07/28/2017 09:21 AM, Richard Biener wrote: > On Thu, Jul 27, 2017 at 4:24 PM, Martin Liška wrote: >> Hi. >> >> Following simple patch adds support for dumping of BBs when it's a BB >> that contains a label. That makes it easier for debugging as one can >> find destination for an edge in dump file. >> >> Sample, before: >> >> foo (int a) >> { >> int D.1821; >> int _1; >> int _4; >> int _5; >> >>[0.00%] [count: INV]: >> switch (a_2(D)) [INV] [count: INV], case 0: [INV] >> [count: INV], case 1: [INV] [count: INV]> >> >> [0.00%] [count: INV]: >> a_3 = a_2(D) + 2; >> >> [0.00%] [count: INV]: >> _4 = 2; >> goto (); [INV] [count: INV] >> >> [0.00%] [count: INV]: >> _5 = 123; >> >> # _1 = PHI <_4(4), _5(5)> >> [0.00%] [count: INV]: >> return _1; >> >> } >> >> After: >> >> foo (int a) >> { >> int D.1821; >> int _1; >> int _4; >> int _5; >> >>[0.00%] [count: INV]: >> switch (a_2(D)) [INV] [count: INV], case 0: [INV] >> [count: INV], case 1: [INV] [count: INV]> >> >> () [0.00%] [count: INV]: >> a_3 = a_2(D) + 2; >> >> () [0.00%] [count: INV]: >> _4 = 2; >> goto (); [INV] [count: INV] >> >> () [0.00%] [count: INV]: >> _5 = 123; >> >> # _1 = PHI <_4(4), _5(5)> >> () [0.00%] [count: INV]: >> return _1; >> >> } >> >> Patch can bootstrap on ppc64le-redhat-linux and survives regression >> tests. >> >> Thoughts? > > I think I prefer to always see > >: > > and if there's a label just dump that as well, thus > >: > L0: > > I think that's how we dump the case with multiple labels. And always use > the > implicit bb N when dumping destinations (in gotos, switches, etc). > > That is, what we have now is IMHO premature prettifying losing BB > indices in the dumps > unnecessarily. > > Richard. Hi. I like your ideas, there's difference in between 7.1 and modified trunk: foo (int a) { int D.1824; int _1; int _4; int _6; [0.00%] [count: INV]: switch (a_2(D)) [INV] [count: INV], case 0: [INV] [count: INV], case 1: [INV] [count: INV]> [0.00%] [count: INV]: a_3 = a_2(D) + 2; [0.00%] [count: INV]: _4 = 2; goto (); [INV] [count: INV] [0.00%] [count: INV]: [0.00%] [count: INV]: a_5 = a_2(D) + 2; label_XXX [0.00%] [count: INV]: label_YYY [0.00%] [count: INV]: _6 = 101; # _1 = PHI <_4(4), _6(7)> [0.00%] [count: INV]: return _1; } after: foo (int a) { int D.1824; int _1; int _4; int _6; [0.00%] [count: INV]: switch (a_2(D)) [INV] [count: INV], case 0: [INV] [count: INV], case 1: [INV] [count: INV]> [0.00%] [count: INV]: : a_3 = a_2(D) + 2; [0.00%] [count: INV]: : _4 = 2; goto ; [INV] [count: INV] [0.00%] [count: INV]: : [0.00%] [count: INV]: a_5 = a_2(D) + 2; [0.00%] [count: INV]: label_XXX: label_YYY: _6 = 101; [0.00%] [count: INV]: # _1 = PHI <_4(4), _6(7)> : return _1; } Do you like it? What about indentation of labels, should I increase it or leave it? >>> >>> Leave it. >>> I guess there will be some tests that will need to be adjusted. >>> >>> I guess so. >>> >>> I think : and friends are all DECL_ARTIFICIAL -- maybe we can avoid >>> dumping >>> them? Hmm, I guess doing it like above, while it preserves BB numbering, >>> does >>> reflect the actual IL a bit less so I guess I'd leave the s in >>> switches (those >>> have labels) and gotos if there's still the label args (not in case of >>> we are just >>> dumping CFG edges). 
>> >> Good, thus said there's how it will look like: >> >> $ cat /tmp/switch.c >> int c; >> >> int foo(int a) >> { >> switch (a) >> { >> case 0: >> a += 2; >> case 1: >> if (c) >> goto label_XXX; >> return 2; >> default: >> break; >> } >> >> a += 2; >> >> label_XXX: >> label_YYY: >> return 99 + 2; >> } >> >> $ ./xgcc -B. /tmp/switch.c -fdump-tree-optimized=/dev/stdout >> >> ;; Function foo (foo, funcdef_no=0, decl_uid=1816, cgraph_uid=0, >> symbol_order=1) >> >> foo (int a) >> { >> int D.1827; >> int c.0_1; >> int _2; >> int _6; >> i