* Claudiu Zissulescu <claudiu.zissule...@synopsys.com> [2017-11-27 12:09:59 +0100]:

> From: Claudiu Zissulescu <claz...@gmail.com>
> 
> The new implementation attempts to clean up the existing trampoline
> implementation for ARC, making it work on Linux-type systems.
> 
> gcc/
> 2017-11-10  Claudiu Zissulescu  <claz...@synopsys.com>
> 
>       * config/arc/arc.c (TARGET_TRAMPOLINE_ADJUST_ADDRESS): Delete.
>       (emit_store_direct): Likewise.
>       (arc_trampoline_adjust_address): Likewise.
>       (arc_asm_trampoline_template): New function.
>       (arc_initialize_trampoline): Use asm_trampoline_template.
>       (TARGET_ASM_TRAMPOLINE_TEMPLATE): Define.
>       * config/arc/arc.h (TRAMPOLINE_SIZE): Adjust to 16.
>       *config/arc/arc.md (flush_icache): Delete pattern.

         ^-- Missing space here.

Otherwise, looks fine.

Thanks,
Andrew



> ---
>  gcc/config/arc/arc.c  | 89 +++++++++++++++++++++++++--------------------------
>  gcc/config/arc/arc.h  |  2 +-
>  gcc/config/arc/arc.md |  9 ------
>  3 files changed, 44 insertions(+), 56 deletions(-)
> 
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index 0eeeb42..053f3c2 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -588,8 +588,6 @@ static void arc_finalize_pic (void);
>  
>  #define TARGET_TRAMPOLINE_INIT arc_initialize_trampoline
>  
> -#define TARGET_TRAMPOLINE_ADJUST_ADDRESS arc_trampoline_adjust_address
> -
>  #define TARGET_CAN_ELIMINATE arc_can_eliminate
>  
>  #define TARGET_FRAME_POINTER_REQUIRED arc_frame_pointer_required
> @@ -3727,69 +3725,65 @@ output_shift (rtx *operands)
>  
>  /* Nested function support.  */
>  
> -/* Directly store VALUE into memory object BLOCK at OFFSET.  */
> -
> -static void
> -emit_store_direct (rtx block, int offset, int value)
> -{
> -  emit_insn (gen_store_direct (adjust_address (block, SImode, offset),
> -                            force_reg (SImode,
> -                                       gen_int_mode (value, SImode))));
> -}
> +/* Output assembler code for a block containing the constant parts of
> +   a trampoline, leaving space for variable parts.
>  
> -/* Emit RTL insns to initialize the variable parts of a trampoline.
> -   FNADDR is an RTX for the address of the function's pure code.
> -   CXT is an RTX for the static chain value for the function.  */
> -/* With potentially multiple shared objects loaded, and multiple stacks
> -   present for multiple thereds where trampolines might reside, a simple
> -   range check will likely not suffice for the profiler to tell if a callee
> -   is a trampoline.  We a speedier check by making the trampoline start at
> -   an address that is not 4-byte aligned.
>     A trampoline looks like this:
>  
> -   nop_s          0x78e0
> -entry:
>     ld_s r12,[pcl,12] 0xd403
>     ld   r11,[pcl,12] 0x170c 700b
>     j_s [r12]         0x7c00
> -   nop_s          0x78e0
> +   .word function's address
> +   .word static chain value
> +
> +*/
> +
> +static void
> +arc_asm_trampoline_template (FILE *f)
> +{
> +  asm_fprintf (f, "\tld_s\t%s,[pcl,8]\n", ARC_TEMP_SCRATCH_REG);
> +  asm_fprintf (f, "\tld\t%s,[pcl,12]\n", reg_names[STATIC_CHAIN_REGNUM]);
> +  asm_fprintf (f, "\tj_s\t[%s]\n", ARC_TEMP_SCRATCH_REG);
> +  assemble_aligned_integer (UNITS_PER_WORD, const0_rtx);
> +  assemble_aligned_integer (UNITS_PER_WORD, const0_rtx);
> +}
> +
> +/* Emit RTL insns to initialize the variable parts of a trampoline.
> +   FNADDR is an RTX for the address of the function's pure code.  CXT
> +   is an RTX for the static chain value for the function.
>  
>     The fastest trampoline to execute for trampolines within +-8KB of CTX
>     would be:
> +
>     add2 r11,pcl,s12
>     j [limm]           0x20200f80 limm
> -   and that would also be faster to write to the stack by computing the offset
> -   from CTX to TRAMP at compile time.  However, it would really be better to
> -   get rid of the high cost of cache invalidation when generating trampolines,
> -   which requires that the code part of trampolines stays constant, and
> -   additionally either
> -   - making sure that no executable code but trampolines is on the stack,
> -     no icache entries linger for the area of the stack from when before the
> -     stack was allocated, and allocating trampolines in trampoline-only
> -     cache lines
> -  or
> -   - allocate trampolines fram a special pool of pre-allocated trampolines.  */
> +
> +   and that would also be faster to write to the stack by computing
> +   the offset from CTX to TRAMP at compile time.  However, it would
> +   really be better to get rid of the high cost of cache invalidation
> +   when generating trampolines, which requires that the code part of
> +   trampolines stays constant, and additionally either making sure
> +   that no executable code but trampolines is on the stack, no icache
> +   entries linger for the area of the stack from when before the stack
> +   was allocated, and allocating trampolines in trampoline-only cache
> +   lines or allocate trampolines from a special pool of pre-allocated
> +   trampolines.  */
>  
>  static void
>  arc_initialize_trampoline (rtx tramp, tree fndecl, rtx cxt)
>  {
>    rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
>  
> -  emit_store_direct (tramp, 0, TARGET_BIG_ENDIAN ? 0x78e0d403 : 0xd40378e0);
> -  emit_store_direct (tramp, 4, TARGET_BIG_ENDIAN ? 0x170c700b : 0x700b170c);
> -  emit_store_direct (tramp, 8, TARGET_BIG_ENDIAN ? 0x7c0078e0 : 0x78e07c00);
> -  emit_move_insn (adjust_address (tramp, SImode, 12), fnaddr);
> -  emit_move_insn (adjust_address (tramp, SImode, 16), cxt);
> -  emit_insn (gen_flush_icache (adjust_address (tramp, SImode, 0)));
> -}
> +  emit_block_move (tramp, assemble_trampoline_template (),
> +                GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
>  
> -/* Allow the profiler to easily distinguish trampolines from normal
> -  functions.  */
> +  emit_move_insn (adjust_address (tramp, SImode, 8), fnaddr);
> +  emit_move_insn (adjust_address (tramp, SImode, 12), cxt);
>  
> -static rtx
> -arc_trampoline_adjust_address (rtx addr)
> -{
> -  return plus_constant (Pmode, addr, 2);
> +  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
> +                  LCT_NORMAL, VOIDmode, 2, XEXP (tramp, 0), Pmode,
> +                  plus_constant (Pmode, XEXP (tramp, 0), TRAMPOLINE_SIZE),
> +                  Pmode);
>  }
>  
>  /* Add the given function declaration to emit code in JLI section.  */
> @@ -11412,6 +11406,9 @@ arc_cannot_substitute_mem_equiv_p (rtx)
>  #undef TARGET_CANNOT_SUBSTITUTE_MEM_EQUIV_P
>  #define TARGET_CANNOT_SUBSTITUTE_MEM_EQUIV_P arc_cannot_substitute_mem_equiv_p
>  
> +#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
> +#define TARGET_ASM_TRAMPOLINE_TEMPLATE arc_asm_trampoline_template
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>  
>  #include "gt-arc.h"
> diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
> index 8c31fb2..317a653 100644
> --- a/gcc/config/arc/arc.h
> +++ b/gcc/config/arc/arc.h
> @@ -829,7 +829,7 @@ extern int arc_initial_elimination_offset(int from, int to);
>  /* Trampolines.  */
>  
>  /* Length in units of the trampoline for entering a nested function.  */
> -#define TRAMPOLINE_SIZE 20
> +#define TRAMPOLINE_SIZE 16
>  
>  /* Alignment required for a trampoline in bits .  */
>  /* For actual data alignment we just need 32, no more than the stack;
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index 155ee6c..e1418a9 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -4345,15 +4345,6 @@ archs4xd, archs4xd_slow, core_3"
>     (set_attr "iscompact" "true")
>     (set_attr "length" "2")])
>  
> -;; Special pattern to flush the icache.
> -;; ??? Not sure what to do here.  Some ARC's are known to support this.
> -
> -(define_insn "flush_icache"
> -  [(unspec_volatile [(match_operand:SI 0 "memory_operand" "m")] 0)]
> -  ""
> -  "* return \"\";"
> -  [(set_attr "type" "misc")])
> -
>  ;; Split up troublesome insns for better scheduling.
>  
>  ;; Peepholes go at the end.
> -- 
> 1.9.1
> 

Reply via email to