I would suggest breaking this patch into two parts: RISC-V part and the rest part (shrink-wrap.h / shrink-wrap.cc).
On Wed, Jun 7, 2023 at 1:55 PM Fei Gao <gao...@eswincomputing.com> wrote: > > Disable zcmp multi push/pop if shrink-wrap-separate is active. > > So in -Os that prefers smaller code size, by default shrink-wrap-separate > is disabled while zcmp multi push/pop is enabled. > > And in -O2 and others that prefers speed, by default shrink-wrap-separate > is enabled while zcmp multi push/pop is disabled. To force enabling zcmp multi > push/pop in this case, -fno-shrink-wrap-separate has to be explictly given. > > The following TC shows the issues in -O2 before this patch with both > shrink-wrap-separate and zcmp multi push/pop active. > 1. duplicated store of s regs. > 2. cm.push pushes ra, s0-s11 in reverse order than what normal > prologue does, causing stack corruption and failure to resotre s regs. > > TC: zcmp_shrink_wrap_separate.c included in this patch. > > output asm before this patch: > calc_func: > cm.push {ra, s0-s3}, -32 > ... > beq a5,zero,.L2 > ... > .L2: > ... > sw s1,20(sp) //issue here > sw s3,12(sp) //issue here > ... > sw s2,16(sp) //issue here > > output asm after this patch: > calc_func: > addi sp,sp,-32 > sw s0,24(sp) > ... > beq a5,zero,.L2 > ... > .L2: > ... > sw s1,20(sp) > sw s3,12(sp) > ... > sw s2,16(sp) > gcc/ChangeLog: > > * config/riscv/riscv.cc > (riscv_avoid_shrink_wrapping_separate): wrap the condition check in > riscv_avoid_shrink_wrapping_separate. > (riscv_avoid_multi_push): avoid multi push if shrink_wrapping_separate > is active. > (riscv_get_separate_components): call > riscv_avoid_shrink_wrapping_separate > * shrink-wrap.cc (try_shrink_wrapping_separate): call > use_shrink_wrapping_separate. > (use_shrink_wrapping_separate):wrap the condition > check in use_shrink_wrapping_separate > * shrink-wrap.h (use_shrink_wrapping_separate): add to extern > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/zcmp_shrink_wrap_separate.c: New test. > * gcc.target/riscv/zcmp_shrink_wrap_separate2.c: New test. > > Signed-off-by: Fei Gao <gao...@eswincomputing.com> > Co-Authored-By: Zhangjin Liao <liaozhang...@eswincomputing.com> > --- > gcc/config/riscv/riscv.cc | 19 +++- > gcc/shrink-wrap.cc | 25 +++-- > gcc/shrink-wrap.h | 1 + > .../riscv/zcmp_shrink_wrap_separate.c | 97 +++++++++++++++++++ > .../riscv/zcmp_shrink_wrap_separate2.c | 97 +++++++++++++++++++ > 5 files changed, 228 insertions(+), 11 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate.c > create mode 100644 > gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate2.c > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc > index f60c241a526..b505cdeca34 100644 > --- a/gcc/config/riscv/riscv.cc > +++ b/gcc/config/riscv/riscv.cc > @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3. If not see > #include "cfghooks.h" > #include "cfgloop.h" > #include "cfgrtl.h" > +#include "shrink-wrap.h" > #include "sel-sched.h" > #include "fold-const.h" > #include "gimple-iterator.h" > @@ -389,6 +390,7 @@ static const struct riscv_tune_param > optimize_size_tune_info = { > false, /* use_divmod_expansion */ > }; > > +static bool riscv_avoid_shrink_wrapping_separate (); > static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, bool *); > static tree riscv_handle_type_attribute (tree *, tree, tree, int, bool *); > > @@ -4910,6 +4912,8 @@ riscv_avoid_multi_push(const struct riscv_frame_info > *frame) > || cfun->machine->interrupt_handler_p > || cfun->machine->varargs_size != 0 > || crtl->args.pretend_args_size != 0 > + || (use_shrink_wrapping_separate () > + && !riscv_avoid_shrink_wrapping_separate ()) > || (frame->mask & ~ MULTI_PUSH_GPR_MASK)) > return true; > > @@ -6077,6 +6081,17 @@ riscv_epilogue_uses (unsigned int regno) > return false; > } > > +static bool > +riscv_avoid_shrink_wrapping_separate () > +{ > + if (riscv_use_save_libcall (&cfun->machine->frame) > + || cfun->machine->interrupt_handler_p > + || !cfun->machine->frame.gp_sp_offset.is_constant ()) > + return true; > + > + return false; > +} > + > /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */ > > static sbitmap > @@ -6086,9 +6101,7 @@ riscv_get_separate_components (void) > sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); > bitmap_clear (components); > > - if (riscv_use_save_libcall (&cfun->machine->frame) > - || cfun->machine->interrupt_handler_p > - || !cfun->machine->frame.gp_sp_offset.is_constant ()) > + if (riscv_avoid_shrink_wrapping_separate ()) > return components; > > offset = cfun->machine->frame.gp_sp_offset.to_constant (); > diff --git a/gcc/shrink-wrap.cc b/gcc/shrink-wrap.cc > index b8d7b557130..d534964321a 100644 > --- a/gcc/shrink-wrap.cc > +++ b/gcc/shrink-wrap.cc > @@ -1776,16 +1776,14 @@ insert_prologue_epilogue_for_components (sbitmap > components) > commit_edge_insertions (); > } > > -/* The main entry point to this subpass. FIRST_BB is where the prologue > - would be normally put. */ > -void > -try_shrink_wrapping_separate (basic_block first_bb) > +bool > +use_shrink_wrapping_separate (void) > { > if (!(SHRINK_WRAPPING_ENABLED > - && flag_shrink_wrap_separate > - && optimize_function_for_speed_p (cfun) > - && targetm.shrink_wrap.get_separate_components)) > - return; > + && flag_shrink_wrap_separate > + && optimize_function_for_speed_p (cfun) > + && targetm.shrink_wrap.get_separate_components)) > + return false; > > /* We don't handle "strange" functions. */ > if (cfun->calls_alloca > @@ -1794,6 +1792,17 @@ try_shrink_wrapping_separate (basic_block first_bb) > || crtl->calls_eh_return > || crtl->has_nonlocal_goto > || crtl->saves_all_registers) > + return false; > + > + return true; > +} > + > +/* The main entry point to this subpass. FIRST_BB is where the prologue > + would be normally put. */ > +void > +try_shrink_wrapping_separate (basic_block first_bb) > +{ > + if (!use_shrink_wrapping_separate ()) > return; > > /* Ask the target what components there are. If it returns NULL, don't > diff --git a/gcc/shrink-wrap.h b/gcc/shrink-wrap.h > index 161647711a3..82386c2b712 100644 > --- a/gcc/shrink-wrap.h > +++ b/gcc/shrink-wrap.h > @@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see > extern bool requires_stack_frame_p (rtx_insn *, HARD_REG_SET, HARD_REG_SET); > extern void try_shrink_wrapping (edge *entry_edge, rtx_insn *prologue_seq); > extern void try_shrink_wrapping_separate (basic_block first_bb); > +extern bool use_shrink_wrapping_separate (void); > #define SHRINK_WRAPPING_ENABLED \ > (flag_shrink_wrap && targetm.have_simple_return ()) > > diff --git a/gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate.c > b/gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate.c > new file mode 100644 > index 00000000000..11f87aee607 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate.c > @@ -0,0 +1,97 @@ > +/* { dg-do compile } */ > +/* { dg-options " -O2 -march=rv32imaf_zca_zcmp -mabi=ilp32f" } */ > +/* { dg-skip-if "" { *-*-* } {"-O0" "-O1" "-Os" "-Og" "-O3" "-Oz" "-flto"} } > */ > + > +typedef struct MAT_PARAMS_S > +{ > + int N; > + signed short *A; > + signed short *B; > + signed int *C; > +} mat_params; > + > +typedef struct CORE_PORTABLE_S > +{ > + unsigned char portable_id; > +} core_portable; > + > +typedef struct RESULTS_S > +{ > + /* inputs */ > + signed short seed1; /* Initializing seed */ > + signed short seed2; /* Initializing seed */ > + signed short seed3; /* Initializing seed */ > + void * memblock[4]; /* Pointer to safe memory location */ > + unsigned int size; /* Size of the data */ > + unsigned int iterations; /* Number of iterations to > execute */ > + unsigned int execs; /* Bitmask of operations to > execute */ > + struct list_head_s *list; > + mat_params mat; > + /* outputs */ > + unsigned short crc; > + unsigned short crclist; > + unsigned short crcmatrix; > + unsigned short crcstate; > + signed short err; > + /* ultithread specific */ > + core_portable port; > +} core_results; > + > +extern signed short > +core_bench_state(unsigned int, void *, signed short, signed short, signed > short, unsigned short); > + > +extern signed short > +core_bench_matrix(mat_params *, signed short, unsigned short); > + > +extern unsigned short > +crcu16(signed short, unsigned short); > + > +signed short > +calc_func(signed short *pdata, core_results *res) > +{ > + signed short data = *pdata; > + signed short retval; > + unsigned char optype > + = (data >> 7) > + & 1; /* bit 7 indicates if the function result has been cached */ > + if (optype) /* if cached, use cache */ > + return (data & 0x007f); > + else > + { /* otherwise calculate and cache the > result */ > + signed short flag = data & 0x7; /* bits 0-2 is type of function to > perform */ > + signed short dtype > + = ((data >> 3) > + & 0xf); /* bits 3-6 is specific data for the operation > */ > + dtype |= dtype << 4; /* replicate the lower 4 bits to get an 8b > value */ > + switch (flag) > + { > + case 0: > + if (dtype < 0x22) /* set min period for bit corruption */ > + dtype = 0x22; > + retval = core_bench_state(res->size, > + res->memblock[3], > + res->seed1, > + res->seed2, > + dtype, > + res->crc); > + if (res->crcstate == 0) > + res->crcstate = retval; > + break; > + case 1: > + retval = core_bench_matrix(&(res->mat), dtype, res->crc); > + if (res->crcmatrix == 0) > + res->crcmatrix = retval; > + break; > + default: > + retval = data; > + break; > + } > + res->crc = crcu16(retval, res->crc); > + retval &= 0x007f; > + *pdata = (data & 0xff00) | 0x0080 | retval; /* cache the result */ > + return retval; > + } > +} > + > +/* { dg-final { scan-assembler-not "cm\.push" } } */ > + > diff --git a/gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate2.c > b/gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate2.c > new file mode 100644 > index 00000000000..ec7e9c39b5d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/zcmp_shrink_wrap_separate2.c > @@ -0,0 +1,97 @@ > +/* { dg-do compile } */ > +/* { dg-options " -O2 -fno-shrink-wrap-separate -march=rv32imaf_zca_zcmp > -mabi=ilp32f" } */ > +/* { dg-skip-if "" { *-*-* } {"-O0" "-O1" "-Os" "-Og" "-O3" "-Oz" "-flto"} } > */ > + > +typedef struct MAT_PARAMS_S > +{ > + int N; > + signed short *A; > + signed short *B; > + signed int *C; > +} mat_params; > + > +typedef struct CORE_PORTABLE_S > +{ > + unsigned char portable_id; > +} core_portable; > + > +typedef struct RESULTS_S > +{ > + /* inputs */ > + signed short seed1; /* Initializing seed */ > + signed short seed2; /* Initializing seed */ > + signed short seed3; /* Initializing seed */ > + void * memblock[4]; /* Pointer to safe memory location */ > + unsigned int size; /* Size of the data */ > + unsigned int iterations; /* Number of iterations to > execute */ > + unsigned int execs; /* Bitmask of operations to > execute */ > + struct list_head_s *list; > + mat_params mat; > + /* outputs */ > + unsigned short crc; > + unsigned short crclist; > + unsigned short crcmatrix; > + unsigned short crcstate; > + signed short err; > + /* ultithread specific */ > + core_portable port; > +} core_results; > + > +extern signed short > +core_bench_state(unsigned int, void *, signed short, signed short, signed > short, unsigned short); > + > +extern signed short > +core_bench_matrix(mat_params *, signed short, unsigned short); > + > +extern unsigned short > +crcu16(signed short, unsigned short); > + > +signed short > +calc_func(signed short *pdata, core_results *res) > +{ > + signed short data = *pdata; > + signed short retval; > + unsigned char optype > + = (data >> 7) > + & 1; /* bit 7 indicates if the function result has been cached */ > + if (optype) /* if cached, use cache */ > + return (data & 0x007f); > + else > + { /* otherwise calculate and cache the > result */ > + signed short flag = data & 0x7; /* bits 0-2 is type of function to > perform */ > + signed short dtype > + = ((data >> 3) > + & 0xf); /* bits 3-6 is specific data for the operation > */ > + dtype |= dtype << 4; /* replicate the lower 4 bits to get an 8b > value */ > + switch (flag) > + { > + case 0: > + if (dtype < 0x22) /* set min period for bit corruption */ > + dtype = 0x22; > + retval = core_bench_state(res->size, > + res->memblock[3], > + res->seed1, > + res->seed2, > + dtype, > + res->crc); > + if (res->crcstate == 0) > + res->crcstate = retval; > + break; > + case 1: > + retval = core_bench_matrix(&(res->mat), dtype, res->crc); > + if (res->crcmatrix == 0) > + res->crcmatrix = retval; > + break; > + default: > + retval = data; > + break; > + } > + res->crc = crcu16(retval, res->crc); > + retval &= 0x007f; > + *pdata = (data & 0xff00) | 0x0080 | retval; /* cache the result */ > + return retval; > + } > +} > + > +/* { dg-final { scan-assembler "cm\.push" } } */ > + > -- > 2.17.1 >