On Mon, Jul 21, 2025 at 8:09 PM Hongtao Liu <crazy...@gmail.com> wrote: > > On Tue, Jul 22, 2025 at 4:47 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > For TLS calls: > > > > 1. UNSPEC_TLS_GD: > > > > (parallel [ > > (set (reg:DI 0 ax) > > (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) > > (const_int 0 [0]))) > > (unspec:DI [(symbol_ref:DI ("e") [flags 0x50]) > > (reg/f:DI 7 sp)] UNSPEC_TLS_GD) > > (clobber (reg:DI 5 di))]) > > > > 2. UNSPEC_TLS_LD_BASE: > > > > (parallel [ > > (set (reg:DI 0 ax) > > (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) > > (const_int 0 [0]))) > > (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)]) > > > > 3. UNSPEC_TLSDESC: > > > > (parallel [ > > (set (reg/f:DI 104) > > (plus:DI (unspec:DI [ > > (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) > > (reg:DI 114) > > (reg/f:DI 7 sp)] UNSPEC_TLSDESC) > > (const:DI (unspec:DI [ > > (symbol_ref:DI ("e") [flags 0x1a]) > > ] UNSPEC_DTPOFF)))) > > (clobber (reg:CC 17 flags))]) > > > > (parallel [ > > (set (reg:DI 101) > > (unspec:DI [(symbol_ref:DI ("e") [flags 0x50]) > > (reg:DI 112) > > (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) > > (clobber (reg:CC 17 flags))]) > > > > they return the same value for the same input value. 
But multiple calls > > with the same input value may be generated for simple programs like: > > > > void a(long *); > > int b(void); > > void c(void); > > static __thread long e; > > long > > d(void) > > { > > a(&e); > > if (b()) > > c(); > > return e; > > } > > > > When compiled with -O2 -fPIC -mtls-dialect=gnu2, the following codes are > > generated: > > > > .type d, @function > > d: > > .LFB0: > > .cfi_startproc > > pushq %rbx > > .cfi_def_cfa_offset 16 > > .cfi_offset 3, -16 > > leaq e@TLSDESC(%rip), %rbx > > movq %rbx, %rax > > call *e@TLSCALL(%rax) > > addq %fs:0, %rax > > movq %rax, %rdi > > call a@PLT > > call b@PLT > > testl %eax, %eax > > jne .L8 > > movq %rbx, %rax > > call *e@TLSCALL(%rax) > > popq %rbx > > .cfi_remember_state > > .cfi_def_cfa_offset 8 > > movq %fs:(%rax), %rax > > ret > > .p2align 4,,10 > > .p2align 3 > > .L8: > > .cfi_restore_state > > call c@PLT > > movq %rbx, %rax > > call *e@TLSCALL(%rax) > > popq %rbx > > .cfi_def_cfa_offset 8 > > movq %fs:(%rax), %rax > > ret > > .cfi_endproc > > > > There are 3 "call *e@TLSCALL(%rax)". They all return the same value. > > Rename the remove_redundant_vector pass to the x86_cse pass, for 64bit, > > extend it to also remove redundant TLS calls to generate: > > > > d: > > .LFB0: > > .cfi_startproc > > pushq %rbx > > .cfi_def_cfa_offset 16 > > .cfi_offset 3, -16 > > leaq e@TLSDESC(%rip), %rax > > movq %fs:0, %rdi > > call *e@TLSCALL(%rax) > > addq %rax, %rdi > > movq %rax, %rbx > > call a@PLT > > call b@PLT > > testl %eax, %eax > > jne .L8 > > movq %fs:(%rbx), %rax > > popq %rbx > > .cfi_remember_state > > .cfi_def_cfa_offset 8 > > ret > > .p2align 4,,10 > > .p2align 3 > > .L8: > > .cfi_restore_state > > call c@PLT > > movq %fs:(%rbx), %rax > > popq %rbx > > .cfi_def_cfa_offset 8 > > ret > > .cfi_endproc > > > > with only one "call *e@TLSCALL(%rax)". 
This reduces the number of > > __tls_get_addr calls in libgcc.a by 72%: > > > > __tls_get_addr calls before after > > libgcc.a 868 243 > > > > gcc/ > > > > PR target/81501 > > * config/i386/i386-features.cc (x86_cse_kind): Add X86_CSE_TLS_GD, > > X86_CSE_TLS_LD_BASE and X86_CSE_TLSDESC. > > (redundant_load): Renamed to ... > > (redundant_pattern): This. > > (replace_tls_call): New. > > (ix86_place_single_tls_call): Likewise. > > (pass_remove_redundant_vector_load): Renamed to ... > > (pass_x86_cse): This. Add val, def_insn, mode, scalar_mode, > > kind, candidate_kind, x86_cse, candidate_gnu_tls_p, > > candidate_gnu2_tls_p and candidate_vector_p. > > (pass_x86_cse::candidate_gnu_tls_p): New. > > (pass_x86_cse::candidate_gnu2_tls_p): Likewise. > > (pass_x86_cse::candidate_vector_p): Likewise. > > (remove_redundant_vector_load): Renamed to ... > > (pass_x86_cse::x86_cse): This. Extend to remove redundant TLS > > calls. > > (make_pass_remove_redundant_vector_load): Renamed to ... > > (make_pass_x86_cse): This. > > * config/i386/i386-passes.def: Replace > > pass_remove_redundant_vector_load with pass_x86_cse. > > * config/i386/i386-protos.h (ix86_tls_get_addr): New. > > (make_pass_remove_redundant_vector_load): Renamed to ... > > (make_pass_x86_cse): This. > > * config/i386/i386.cc (ix86_tls_get_addr): Remove static. > > * config/i386/i386.h (machine_function): Add > > tls_descriptor_call_multiple_p. > > * config/i386/i386.md (tls64): New attribute. > > (@tls_global_dynamic_64_<mode>): Set tls_descriptor_call_multiple_p. > > (@tls_local_dynamic_base_64_<mode>): Likewise. > > (@tls_dynamic_gnu2_64_<mode>): Likewise. > > (*tls_global_dynamic_64_<mode>): Set tls64 attribute to gd. > > (*tls_local_dynamic_base_64_<mode>): Set tls64 attribute to ld_base. > > (*tls_dynamic_gnu2_lea_64_<mode>): Set tls64 attribute to lea. > > (*tls_dynamic_gnu2_call_64_<mode>): Set tls64 attribute to call. > > (*tls_dynamic_gnu2_combine_64_<mode>): Set tls64 attribute to > > combine.
> > > > gcc/testsuite/ > > > > PR target/81501 > > * g++.target/i386/pr81501-1.C: New test. > > * gcc.target/i386/pr81501-1a.c: Likewise. > > * gcc.target/i386/pr81501-1b.c: Likewise. > > * gcc.target/i386/pr81501-2a.c: Likewise. > > * gcc.target/i386/pr81501-2b.c: Likewise. > > * gcc.target/i386/pr81501-3.c: Likewise. > > * gcc.target/i386/pr81501-4a.c: Likewise. > > * gcc.target/i386/pr81501-4b.c: Likewise. > > * gcc.target/i386/pr81501-5.c: Likewise. > > * gcc.target/i386/pr81501-6a.c: Likewise. > > * gcc.target/i386/pr81501-6b.c: Likewise. > > * gcc.target/i386/pr81501-7.c: Likewise. > > * gcc.target/i386/pr81501-8a.c: Likewise. > > * gcc.target/i386/pr81501-8b.c: Likewise. > > * gcc.target/i386/pr81501-9a.c: Likewise. > > * gcc.target/i386/pr81501-9b.c: Likewise. > > > > Signed-off-by: H.J. Lu <hjl.to...@gmail.com> > > --- > > gcc/config/i386/i386-features.cc | 766 ++++++++++++++++++--- > > gcc/config/i386/i386-passes.def | 2 +- > > gcc/config/i386/i386-protos.h | 4 +- > > gcc/config/i386/i386.cc | 2 +- > > gcc/config/i386/i386.h | 3 + > > gcc/config/i386/i386.md | 25 +- > > gcc/testsuite/g++.target/i386/pr81501-1.C | 16 + > > gcc/testsuite/gcc.target/i386/pr81501-1a.c | 17 + > > gcc/testsuite/gcc.target/i386/pr81501-1b.c | 6 + > > gcc/testsuite/gcc.target/i386/pr81501-2a.c | 17 + > > gcc/testsuite/gcc.target/i386/pr81501-2b.c | 6 + > > gcc/testsuite/gcc.target/i386/pr81501-3.c | 9 + > > gcc/testsuite/gcc.target/i386/pr81501-4a.c | 51 ++ > > gcc/testsuite/gcc.target/i386/pr81501-4b.c | 6 + > > gcc/testsuite/gcc.target/i386/pr81501-5.c | 13 + > > gcc/testsuite/gcc.target/i386/pr81501-6a.c | 67 ++ > > gcc/testsuite/gcc.target/i386/pr81501-6b.c | 28 + > > gcc/testsuite/gcc.target/i386/pr81501-7.c | 20 + > > gcc/testsuite/gcc.target/i386/pr81501-8a.c | 82 +++ > > gcc/testsuite/gcc.target/i386/pr81501-8b.c | 31 + > > gcc/testsuite/gcc.target/i386/pr81501-9a.c | 39 ++ > > gcc/testsuite/gcc.target/i386/pr81501-9b.c | 22 + > > 22 files changed, 1119 insertions(+), 113 
deletions(-) > > create mode 100644 gcc/testsuite/g++.target/i386/pr81501-1.C > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-1a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-1b.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-2a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-2b.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-4a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-4b.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-5.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-6a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-6b.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-7.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-8a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-8b.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-9a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-9b.c > > > > diff --git a/gcc/config/i386/i386-features.cc > > b/gcc/config/i386/i386-features.cc > > index c131577805f..d38b297a89a 100644 > > --- a/gcc/config/i386/i386-features.cc > > +++ b/gcc/config/i386/i386-features.cc > > @@ -3493,10 +3493,13 @@ enum x86_cse_kind > > { > > X86_CSE_CONST0_VECTOR, > > X86_CSE_CONSTM1_VECTOR, > > - X86_CSE_VEC_DUP > > + X86_CSE_VEC_DUP, > > + X86_CSE_TLS_GD, > > + X86_CSE_TLS_LD_BASE, > > + X86_CSE_TLSDESC > > }; > > > > -struct redundant_load > > +struct redundant_pattern > > { > > /* Bitmap of basic blocks with broadcast instructions. 
*/ > > auto_bitmap bbs; > > @@ -3669,22 +3672,570 @@ ix86_broadcast_inner (rtx op, machine_mode mode, > > return op; > > } > > > > -/* At entry of the nearest common dominator for basic blocks with vector > > - CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest > > - vector set instruction for all CONST0_RTX and integer CONSTM1_RTX > > - uses. > > +/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC. */ > > > > - NB: We want to generate only a single widest vector set to cover the > > - whole function. The LCM algorithm isn't appropriate here since it > > - may place a vector set inside the loop. */ > > +static void > > +replace_tls_call (rtx src, auto_bitmap &tls_call_insns) > > +{ > > + bitmap_iterator bi; > > + unsigned int id; > > > > -static unsigned int > > -remove_redundant_vector_load (void) > > + EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi) > > + { > > + rtx_insn *insn = DF_INSN_UID_GET (id)->insn; > > + > > + /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are > > + allowed. 
*/ > > + if (!CALL_P (insn)) > > + { > > + attr_tls64 tls64 = get_attr_tls64 (insn); > > + if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE) > > + gcc_unreachable (); > > + } > > + > > + rtx pat = PATTERN (insn); > > + if (GET_CODE (pat) != PARALLEL) > > + gcc_unreachable (); > > + > > + int j; > > + rtx op, dest = nullptr; > > + for (j = XVECLEN (pat, 0) - 1; j >= 0; j--) > > + { > > + op = XVECEXP (pat, 0, j); > > + if (GET_CODE (op) == SET) > > + { > > + dest = SET_DEST (op); > > + break; > > + } > > + } > > + > > + rtx set = gen_rtx_SET (dest, src); > > + rtx_insn *set_insn = emit_insn_after (set, insn); > > + if (recog_memoized (set_insn) < 0) > > + gcc_unreachable (); > > + > > + if (dump_file) > > + { > > + fprintf (dump_file, "\nReplace:\n\n"); > > + print_rtl_single (dump_file, insn); > > + fprintf (dump_file, "\nwith:\n\n"); > > + print_rtl_single (dump_file, set_insn); > > + fprintf (dump_file, "\n"); > > + } > > + > > + /* Delete the CALL insn. */ > > + delete_insn (insn); > > + > > + df_insn_rescan (set_insn); > > + } > > +} > > + > > +/* Generate a TLS call of KIND with VAL and copy the call result to DEST, > > + at entry of the nearest dominator for basic block map BBS, which is in > > + the fake loop that contains the whole function, so that there is only > > + a single TLS CALL of KIND with VAL in the whole function. If > > + TLSDESC_SET isn't nullptr, insert it before the TLS call. 
*/ > > + > > +static void > > +ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind, > > + bitmap bbs, rtx tlsdesc_set = nullptr) > > +{ > > + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); > > + while (bb->loop_father->latch > > + != EXIT_BLOCK_PTR_FOR_FN (cfun)) > > + bb = get_immediate_dominator (CDI_DOMINATORS, > > + bb->loop_father->header); > > + > > + rtx_insn *insn = BB_HEAD (bb); > > + while (insn && !NONDEBUG_INSN_P (insn)) > > + { > > + if (insn == BB_END (bb)) > > + { > > + insn = NULL; > > + break; > > + } > > + insn = NEXT_INSN (insn); > > + } > > + > > + rtx rax = nullptr, rdi; > > + rtx eqv = nullptr; > > + rtx caddr; > > + rtx set; > > + rtx clob; > > + rtx symbol; > > + rtx tls; > > + rtx_insn *tls_insn; > > + > > + switch (kind) > > + { > > + case X86_CSE_TLS_GD: > > + rax = gen_rtx_REG (Pmode, AX_REG); > > + rdi = gen_rtx_REG (Pmode, DI_REG); > > + caddr = ix86_tls_get_addr (); > > + > > + symbol = XVECEXP (val, 0, 0); > > + tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi); > > + > > + if (GET_MODE (symbol) != Pmode) > > + symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol); > > + eqv = symbol; > > + break; > > + > > + case X86_CSE_TLS_LD_BASE: > > + rax = gen_rtx_REG (Pmode, AX_REG); > > + rdi = gen_rtx_REG (Pmode, DI_REG); > > + caddr = ix86_tls_get_addr (); > > + > > + tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi); > > + > > + /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers > > + to share the LD_BASE result with other LD model accesses. 
*/ > > + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), > > + UNSPEC_TLS_LD_BASE); > > + > > + break; > > + > > + case X86_CSE_TLSDESC: > > + set = gen_rtx_SET (dest, val); > > + clob = gen_rtx_CLOBBER (VOIDmode, > > + gen_rtx_REG (CCmode, FLAGS_REG)); > > + tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob)); > > + break; > > + > > + default: > > + gcc_unreachable (); > > + } > > + > > + rtx_insn *before = nullptr; > > + rtx_insn *after = nullptr; > > + if (insn == BB_HEAD (bb)) > > + before = insn; > > + else > > + after = insn ? PREV_INSN (insn) : BB_END (bb); > > + > > + /* TLS_GD and TLS_LD_BASE instructions are normal functions which > > + clobber caller-saved registers. TLSDESC instructions are special > > + functions which only clobber RAX. If any registers clobbered by > > + the TLS instruction are live in this basic block, we must insert > > + the TLS instruction after all live registers clobbered by the TLS > > + instruction are dead. */ > > + > > + auto_bitmap live_caller_saved_regs; > > + bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb); > > + > > + bool flags_live_p = bitmap_bit_p (in, FLAGS_REG); > > + > > + unsigned int i; > > + > > + /* Get all live caller-saved registers. */ > > + if (kind == X86_CSE_TLSDESC) > > + { > > + if (bitmap_bit_p (in, AX_REG)) > > + bitmap_set_bit (live_caller_saved_regs, AX_REG); > > + } > > + else > > + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) > > + if (call_used_regs[i] > > + && !fixed_regs[i] > > + && bitmap_bit_p (in, i)) > > + bitmap_set_bit (live_caller_saved_regs, i); > > + > > + if (!bitmap_empty_p (live_caller_saved_regs)) > > + { > > + /* Search for REG_DEAD notes in this basic block. */ > > + FOR_BB_INSNS (bb, insn) > > + { > > + if (!NONDEBUG_INSN_P (insn)) > > + continue; > > + > > + /* Check if FLAGS register is live. 
*/ > > + set = single_set (insn); > > + if (set) > > + { > > + rtx dest = SET_DEST (set); > > + if (REG_P (dest) && REGNO (dest) == FLAGS_REG) > > + flags_live_p = true; > > + } > > + > > + rtx link; > > + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) > > + if (REG_NOTE_KIND (link) == REG_DEAD > > + && REG_P (XEXP (link, 0))) > > + { > > + /* Mark the live caller-saved register as dead. */ > > + for (i = REGNO (XEXP (link, 0)); > > + i < END_REGNO (XEXP (link, 0)); > > + i++) > > + bitmap_clear_bit (live_caller_saved_regs, i); > > + > > + /* Check if FLAGS register is dead. */ > > + if (REGNO (XEXP (link, 0)) == FLAGS_REG) > > + flags_live_p = false; > > + > > + if (bitmap_empty_p (live_caller_saved_regs)) > > + { > > + /* All live caller-saved registers are dead after > > + this instruction. Since TLS instructions > > + clobber FLAGS register, it must be dead where > > + the TLS will be inserted after. */ > > + if (flags_live_p) > > + gcc_unreachable (); > > + after = insn; > > + goto insert_after; > > + } > > + } > > + } > > + > > + /* All live caller-saved registers should be dead at the end > > + of this basic block. */ > > + gcc_unreachable (); > > + } > > + > > + /* Emit the TLS CALL insn. */ > > + if (after) > > + { > > +insert_after: > > + tls_insn = emit_insn_after (tls, after); > > + } > > + else > > + tls_insn = emit_insn_before (tls, before); > > + > > + rtx_insn *tlsdesc_insn = nullptr; > > + if (tlsdesc_set) > > + { > > + rtx dest = copy_rtx (SET_DEST (tlsdesc_set)); > > + rtx src = copy_rtx (SET_SRC (tlsdesc_set)); > > + tlsdesc_set = gen_rtx_SET (dest, src); > > + tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn); > > + } > > + > > + if (kind != X86_CSE_TLSDESC) > > + { > > + RTL_CONST_CALL_P (tls_insn) = 1; > > + > > + /* Indicate that this function can't jump to non-local gotos. 
*/ > > + make_reg_eh_region_note_nothrow_nononlocal (tls_insn); > > + } > > + > > + if (recog_memoized (tls_insn) < 0) > > + gcc_unreachable (); > > + > > + if (dump_file) > > + { > > + if (after) > > + { > > + fprintf (dump_file, "\nPlace:\n\n"); > > + if (tlsdesc_insn) > > + print_rtl_single (dump_file, tlsdesc_insn); > > + print_rtl_single (dump_file, tls_insn); > > + fprintf (dump_file, "\nafter:\n\n"); > > + print_rtl_single (dump_file, after); > > + fprintf (dump_file, "\n"); > > + } > > + else > > + { > > + fprintf (dump_file, "\nPlace:\n\n"); > > + if (tlsdesc_insn) > > + print_rtl_single (dump_file, tlsdesc_insn); > > + print_rtl_single (dump_file, tls_insn); > > + fprintf (dump_file, "\nbefore:\n\n"); > > + print_rtl_single (dump_file, insn); > > + fprintf (dump_file, "\n"); > > + } > > + } > > + > > + if (kind != X86_CSE_TLSDESC) > > + { > > + /* Copy RAX to DEST. */ > > + set = gen_rtx_SET (dest, rax); > > + rtx_insn *set_insn = emit_insn_after (set, tls_insn); > > + set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest); > > + if (dump_file) > > + { > > + fprintf (dump_file, "\nPlace:\n\n"); > > + print_rtl_single (dump_file, set_insn); > > + fprintf (dump_file, "\nafter:\n\n"); > > + print_rtl_single (dump_file, tls_insn); > > + fprintf (dump_file, "\n"); > > + } > > + } > > +} > > + > > +namespace { > > + > > +const pass_data pass_data_x86_cse = > > +{ > > + RTL_PASS, /* type */ > > + "x86_cse", /* name */ > > + OPTGROUP_NONE, /* optinfo_flags */ > > + TV_MACH_DEP, /* tv_id */ > > + 0, /* properties_required */ > > + 0, /* properties_provided */ > > + 0, /* properties_destroyed */ > > + 0, /* todo_flags_start */ > > + 0, /* todo_flags_finish */ > > +}; > > + > > +class pass_x86_cse : public rtl_opt_pass > > +{ > > +public: > > + pass_x86_cse (gcc::context *ctxt) > > + : rtl_opt_pass (pass_data_x86_cse, ctxt) > > + {} > > + > > + /* opt_pass methods: */ > > + bool gate (function *fun) final override > > + { > > + return (TARGET_SSE2 > > + && 
optimize > > + && optimize_function_for_speed_p (fun)); > > + } > > + > > + unsigned int execute (function *) final override > > + { > > + return x86_cse (); > > + } > > + > > +private: > > + /* The redundant source value. */ > > + rtx val; > > + /* The instruction which defines the redundant value. */ > > + rtx_insn *def_insn; > > + /* Mode of the destination of the candidate redundant instruction. */ > > + machine_mode mode; > > + /* Mode of the source of the candidate redundant instruction. */ > > + machine_mode scalar_mode; > > + /* The classification of the candidate redundant instruction. */ > > + x86_cse_kind kind; > > + > > + enum candidate_kind > > + { > > + candidate_no, /* Instruction isn't a candidate. */ > > + candidate_ignore, /* Instruction should be ignored. */ > > + candidate_yes /* Instruction is a candidate. */ > > + }; > > + > > + unsigned int x86_cse (void); > > + candidate_kind candidate_gnu_tls_p (rtx_insn *); > > + candidate_kind candidate_gnu2_tls_p (rtx_insn *, rtx); > > + bool candidate_vector_p (rtx, rtx); > > +}; // class pass_x86_cse > > + > > +/* Return true and output def_insn, val, mode, scalar_mode and kind if > > + INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. 
*/ > > + > > +pass_x86_cse::candidate_kind > > +pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn) > > +{ > > + if (!TARGET_64BIT > > + || !cfun->machine->tls_descriptor_call_multiple_p > > + || !CALL_P (insn)) > > + return candidate_no; > > + > > + /* Record the redundant TLS CALLs for 64-bit: > > + > > + (parallel [ > > + (set (reg:DI 0 ax) > > + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) > > + (const_int 0 [0]))) > > + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) > > + (reg/f:DI 7 sp)] UNSPEC_TLS_GD) > > + (clobber (reg:DI 5 di))]) > > + > > + > > + and > > + > > + (parallel [ > > + (set (reg:DI 0 ax) > > + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) > > + (const_int 0 [0]))) > > + (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)]) > > + > > + */ > > + > > + rtx pat, set, dest; > > + attr_tls64 tls64 = get_attr_tls64 (insn); > > + switch (tls64) > > + { > > + default: > > + return candidate_ignore; > > + > > + case TLS64_GD: > > + case TLS64_LD_BASE: > > + pat = PATTERN (insn); > > + set = XVECEXP (pat, 0, 0); > > + gcc_assert (GET_CODE (set) == SET); > > + dest = SET_DEST (set); > > + scalar_mode = mode = GET_MODE (dest); > > + val = XVECEXP (pat, 0, 1); > > + gcc_assert (GET_CODE (val) == UNSPEC); > > + break; > > + } > > + > > + if (tls64 == TLS64_GD) > > + kind = X86_CSE_TLS_GD; > > + else > > + kind = X86_CSE_TLS_LD_BASE; > > + > > + def_insn = nullptr; > > + return candidate_yes; > > +} > > + > > +/* Return true and output def_insn, val, mode, scalar_mode and kind if > > + INSN is UNSPEC_TLSDESC. 
*/ > > + > > +pass_x86_cse::candidate_kind > > +pass_x86_cse::candidate_gnu2_tls_p (rtx_insn *insn, rtx src) > > +{ > > + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) > > + return candidate_no; > > + > > + /* Record GNU2 TLS CALLs for 64-bit: > > + > > + (parallel [ > > + (set (reg/f:DI 104) > > + (plus:DI (unspec:DI [ > > + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) > > + (reg:DI 114) > > + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) > > + (const:DI (unspec:DI [ > > + (symbol_ref:DI ("e") [flags 0x1a]) > > + ] UNSPEC_DTPOFF)))) > > + (clobber (reg:CC 17 flags))]) > > + > > + and > > + > > + (parallel [ > > + (set (reg:DI 101) > > + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) > > + (reg:DI 112) > > + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) > > + (clobber (reg:CC 17 flags))]) > > + > > + */ > > + > > + attr_tls64 tls64 = get_attr_tls64 (insn); > > + if (tls64 == TLS64_CALL) > > + val = src; > > + else if (tls64 == TLS64_COMBINE) > > + { > > + val = src; > > + src = XEXP (src, 0); > > + } > > + else > > + return candidate_no; > > + > > + kind = X86_CSE_TLSDESC; > > + gcc_assert (GET_CODE (src) == UNSPEC); > > + src = XVECEXP (src, 0, 1); > > + scalar_mode = mode = GET_MODE (src); > > + if (REG_P (src)) > > + { > > + /* All definitions of reg:DI 129 in > > + > > + (set (reg:DI 110) > > + (unspec:DI [(symbol_ref:DI ("foo")) > > + (reg:DI 129) > > + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) > > + > > + should have the same source as in > > + > > + (set (reg:DI 129) > > + (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC)) > > + > > + */ > > + > > + df_ref ref; > > + rtx_insn *set_insn = nullptr; > > + rtx tls_src = nullptr; > > + for (ref = DF_REG_DEF_CHAIN (REGNO (src)); > > + ref; > > + ref = DF_REF_NEXT_REG (ref)) > > + { > > + if (DF_REF_IS_ARTIFICIAL (ref)) > > + break; > > + > > + set_insn = DF_REF_INSN (ref); > > + tls64 = get_attr_tls64 (set_insn); > > + if (tls64 != TLS64_LEA) > > + { > > + set_insn = nullptr; > > + break; > > + } > > + > > + rtx 
tls_set = PATTERN (set_insn); > > + if (!tls_src) > > + tls_src = SET_SRC (tls_set); > > + else if (!rtx_equal_p (tls_src, SET_SRC (tls_set))) > > + { > > + set_insn = nullptr; > > + break; > > + } > > + } > > + > > + if (!set_insn) > > + return candidate_ignore; > > + > > + rtx set = single_set (insn); > > + if (!set) > > + return candidate_ignore; > > + > > + def_insn = set_insn; > > + } > > + else if (GET_CODE (src) == UNSPEC > > + && XINT (src, 1) == UNSPEC_TLSDESC > > + && SYMBOL_REF_P (XVECEXP (src, 0, 0))) > > + def_insn = nullptr; > > + else > > + gcc_unreachable (); > > + > > + return candidate_yes; > > +} > > + > > +/* Return true and output def_insn, val, mode, scalar_mode and kind if > > + INSN is a vector broadcast instruction. */ > > + > > +bool > > +pass_x86_cse::candidate_vector_p (rtx set, rtx src) > > +{ > > + rtx dest = SET_DEST (set); > > + mode = GET_MODE (dest); > > + /* Skip non-vector instruction. */ > > + if (!VECTOR_MODE_P (mode)) > > + return false; > > + > > + /* Skip non-vector load instruction. */ > > + if (!REG_P (dest) && !SUBREG_P (dest)) > > + return false; > > + > > + val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind, > > + &def_insn); > > + return val ? true : false; > > +} > > + > > +/* At entry of the nearest common dominator for basic blocks with > > + > > + 1. Vector CONST0_RTX patterns. > > + 2. Vector CONSTM1_RTX patterns. > > + 3. Vector broadcast patterns. > > + 4. UNSPEC_TLS_GD patterns. > > + 5. UNSPEC_TLS_LD_BASE patterns. > > + 6. UNSPEC_TLSDESC patterns. > > + > > + generate a single pattern whose destination is used to replace the > > + source in all identical patterns. > > + > > + NB: We want to generate a pattern, which is executed only once, to > > + cover the whole function. The LCM algorithm isn't appropriate here > > + since it may place a pattern inside the loop. 
*/ > > + > > +unsigned int > > +pass_x86_cse::x86_cse (void) > > { > > timevar_push (TV_MACH_DEP); > > > > - auto_vec<redundant_load *> loads; > > - redundant_load *load; > > + auto_vec<redundant_pattern *> loads; > > + redundant_pattern *load; > > basic_block bb; > > rtx_insn *insn; > > unsigned int i; > > @@ -3700,61 +4251,74 @@ remove_redundant_vector_load (void) > > if (!NONDEBUG_INSN_P (insn)) > > continue; > > > > - rtx set = single_set (insn); > > - if (!set) > > - continue; > > + bool matched = false; > > + rtx set, src; > > + /* Remove redundant patterns if there are more than 2 of > > + them. */ > > + unsigned int threshold = 2; > > > > - /* Record single set vector instruction with CONST0_RTX and > > - CONSTM1_RTX source. Record basic blocks with CONST0_RTX and > > - CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the > > - maximum size of CONST0_RTX and CONSTM1_RTX. */ > > + /* First check UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */ > > + switch (candidate_gnu_tls_p (insn)) > > Can we just > switch (get_attr_tls64 (insn)) > { > case TLS64_GD: > case TLS64_LD_BASE: > if (!candidate_gnu_tls_p (insn)) ----> return true if it's a > candidate, otherwise return false. > continue; > break; > case TLS64_CALL: > case TLS64_COMBINE: > if (!candidate_gnu2_tls_p (insn)) > continue; > break; > case none: > if (!candidate_vector_p (insn)) > continue; > break; > > default: > continue; > }
Fixed in the v3 patch. > > + { > > + case candidate_no: > > + /* This isn't UNSPEC_TLS_GD nor UNSPEC_TLS_LD_BASE. */ > > + set = single_set (insn); > > + if (!set) > > + continue; > > > > - rtx dest = SET_DEST (set); > > - machine_mode mode = GET_MODE (dest); > > - /* Skip non-vector instruction. */ > > - if (!VECTOR_MODE_P (mode)) > > - continue; > > + src = SET_SRC (set); > > > > - rtx src = SET_SRC (set); > > - /* Skip non-vector load instruction. */ > > - if (!REG_P (dest) && !SUBREG_P (dest)) > > - continue; > > + /* Check UNSPEC_TLSDESC. */ > > + switch (candidate_gnu2_tls_p (insn, src)) > > + { > > + case candidate_no: > > + /* Check vector instruction. */ > > + if (candidate_vector_p (set, src)) > > + break; > > + continue; > > + case candidate_ignore: > > + /* Not a candidate. Skip. */ > > + continue; > > + case candidate_yes: > > + break; > > + } > > + break; > > > > - rtx_insn *def_insn; > > - machine_mode scalar_mode; > > - x86_cse_kind kind; > > - rtx val = ix86_broadcast_inner (src, mode, &scalar_mode, > > - &kind, &def_insn); > > - if (!val) > > - continue; > > + case candidate_ignore: > > + /* Not a candidate. Skip. */ > > + continue; > > > > - /* Remove redundant register loads if there are more than 2 > > - loads will be used. */ > > - unsigned int threshold = 2; > > + case candidate_yes: > > + /* This is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */ > > + break; > > + } > > > > - /* Check if there is a matching redundant vector load. */ > > - bool matched = false; > > + /* Check if there is a matching redundant load. */ > > FOR_EACH_VEC_ELT (loads, i, load) > > if (load->val > > && load->kind == kind > > && load->mode == scalar_mode > > && (load->bb == bb > > - || kind < X86_CSE_VEC_DUP > > + || kind != X86_CSE_VEC_DUP > > /* Non all 0s/1s vector load must be in the same > > basic block if it is in a recursive call. */ > > || !recursive_call_p) > > && rtx_equal_p (load->val, val)) > > { > > - /* Record vector instruction. 
*/ > > + /* Record instruction. */ > > bitmap_set_bit (load->insns, INSN_UID (insn)); > > > > /* Record the maximum vector size. */ > > - if (load->size < GET_MODE_SIZE (mode)) > > + if (kind <= X86_CSE_VEC_DUP > > + && load->size < GET_MODE_SIZE (mode)) > > load->size = GET_MODE_SIZE (mode); > > > > /* Record the basic block. */ > > bitmap_set_bit (load->bbs, bb->index); > > + > > + /* Increment the count. */ > > load->count++; > > + > > matched = true; > > break; > > } > > @@ -3762,8 +4326,11 @@ remove_redundant_vector_load (void) > > if (matched) > > continue; > > > > - /* We see this vector broadcast the first time. */ > > - load = new redundant_load; > > + /* We see this instruction the first time. Record the > > + redundant source value, its mode, the destination size, > > + instruction which defines the redundant source value, > > + instruction basic block and the instruction kind. */ > > + load = new redundant_pattern; > > > > load->val = copy_rtx (val); > > load->mode = scalar_mode; > > @@ -3786,6 +4353,15 @@ remove_redundant_vector_load (void) > > FOR_EACH_VEC_ELT (loads, i, load) > > if (load->count >= load->threshold) > > { > And we can also have > switch (load->kind) > { > case X86_CSE_TLS_GD: > case X86_CSE_TLS_LD_BASE: > case X86_CSE_TLSDESC: > ix86_cse_replace_tls_call (....); > break; > > case X86_CSE_CONST0_VECTOR: > case X86_CSE_CONSTM1_VECTOR: > ix86_cse_replace_const0_m1 (...); > break; > > case X86_CSE_VEC_DUP: > ix86_cse_replace_vec_dup (...); > break; > > default: > gcc_unreachable(); > } > > I think that would be more readable and easier to maintain. Fixed in the v3 patch.
> > > + if (load->kind > X86_CSE_VEC_DUP) > > + { > > + broadcast_reg = gen_reg_rtx (load->mode); > > + replace_tls_call (broadcast_reg, load->insns); > > + load->broadcast_reg = broadcast_reg; > > + replaced = true; > > + continue; > > + } > > + > > machine_mode mode = ix86_get_vector_cse_mode (load->size, > > load->mode); > > broadcast_reg = gen_reg_rtx (mode); > > @@ -3841,34 +4417,48 @@ remove_redundant_vector_load (void) > > { > > if (load->def_insn) > > { > > - /* Insert a broadcast after the original scalar > > - definition. */ > > - rtx set = gen_rtx_SET (load->broadcast_reg, > > - load->broadcast_source); > > - insn = emit_insn_after (set, load->def_insn); > > - > > - if (cfun->can_throw_non_call_exceptions) > > + if (load->kind == X86_CSE_TLSDESC) > > + ix86_place_single_tls_call (load->broadcast_reg, > > + load->val, > > + load->kind, > > + load->bbs, > > + PATTERN (load->def_insn)); > > + else > > { > > - /* Handle REG_EH_REGION note in DEF_INSN. */ > > - rtx note = find_reg_note (load->def_insn, > > - REG_EH_REGION, nullptr); > > - if (note) > > + /* Insert a broadcast after the original scalar > > + definition. */ > > + rtx set = gen_rtx_SET (load->broadcast_reg, > > + load->broadcast_source); > > + insn = emit_insn_after (set, load->def_insn); > > + > > + if (cfun->can_throw_non_call_exceptions) > > { > > - control_flow_insns.safe_push (load->def_insn); > > - add_reg_note (insn, REG_EH_REGION, > > - XEXP (note, 0)); > > + /* Handle REG_EH_REGION note in DEF_INSN. 
*/ > > + rtx note = find_reg_note (load->def_insn, > > + REG_EH_REGION, nullptr); > > + if (note) > > + { > > + control_flow_insns.safe_push (load->def_insn); > > + add_reg_note (insn, REG_EH_REGION, > > + XEXP (note, 0)); > > + } > > } > > - } > > > > - if (dump_file) > > - { > > - fprintf (dump_file, "\nAdd:\n\n"); > > - print_rtl_single (dump_file, insn); > > - fprintf (dump_file, "\nafter:\n\n"); > > - print_rtl_single (dump_file, load->def_insn); > > - fprintf (dump_file, "\n"); > > + if (dump_file) > > + { > > + fprintf (dump_file, "\nAdd:\n\n"); > > + print_rtl_single (dump_file, insn); > > + fprintf (dump_file, "\nafter:\n\n"); > > + print_rtl_single (dump_file, load->def_insn); > > + fprintf (dump_file, "\n"); > > + } > > } > > } > > + else if (load->kind > X86_CSE_VEC_DUP) > > + ix86_place_single_tls_call (load->broadcast_reg, > > + load->val, > > + load->kind, > > + load->bbs); > > else > > ix86_place_single_vector_set (load->broadcast_reg, > > load->broadcast_source, > > @@ -3905,48 +4495,12 @@ remove_redundant_vector_load (void) > > return 0; > > } > > > > -namespace { > > - > > -const pass_data pass_data_remove_redundant_vector_load = > > -{ > > - RTL_PASS, /* type */ > > - "rrvl", /* name */ > > - OPTGROUP_NONE, /* optinfo_flags */ > > - TV_MACH_DEP, /* tv_id */ > > - 0, /* properties_required */ > > - 0, /* properties_provided */ > > - 0, /* properties_destroyed */ > > - 0, /* todo_flags_start */ > > - 0, /* todo_flags_finish */ > > -}; > > - > > -class pass_remove_redundant_vector_load : public rtl_opt_pass > > -{ > > -public: > > - pass_remove_redundant_vector_load (gcc::context *ctxt) > > - : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt) > > - {} > > - > > - /* opt_pass methods: */ > > - bool gate (function *fun) final override > > - { > > - return (TARGET_SSE2 > > - && optimize > > - && optimize_function_for_speed_p (fun)); > > - } > > - > > - unsigned int execute (function *) final override > > - { > > - return 
remove_redundant_vector_load (); > > - } > > -}; // class pass_remove_redundant_vector_load > > - > > } // anon namespace > > > > rtl_opt_pass * > > -make_pass_remove_redundant_vector_load (gcc::context *ctxt) > > +make_pass_x86_cse (gcc::context *ctxt) > > { > > - return new pass_remove_redundant_vector_load (ctxt); > > + return new pass_x86_cse (ctxt); > > } > > > > /* Convert legacy instructions that clobbers EFLAGS to APX_NF > > diff --git a/gcc/config/i386/i386-passes.def > > b/gcc/config/i386/i386-passes.def > > index 06f0288b067..553b46d1fdc 100644 > > --- a/gcc/config/i386/i386-passes.def > > +++ b/gcc/config/i386/i386-passes.def > > @@ -35,6 +35,6 @@ along with GCC; see the file COPYING3. If not see > > PR116174. */ > > INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); > > > > - INSERT_PASS_AFTER (pass_late_combine, 1, > > pass_remove_redundant_vector_load); > > + INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse); > > INSERT_PASS_AFTER (pass_late_combine, 1, > > pass_remove_partial_avx_dependency); > > INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > > index 69bc0ee570d..ee6b78b2c77 100644 > > --- a/gcc/config/i386/i386-protos.h > > +++ b/gcc/config/i386/i386-protos.h > > @@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void); > > extern bool ix86_gpr_tls_address_pattern_p (rtx); > > extern bool ix86_tls_address_pattern_p (rtx); > > extern rtx ix86_rewrite_tls_address (rtx); > > +extern rtx ix86_tls_get_addr (void); > > > > extern void ix86_expand_vector_init (bool, rtx, rtx); > > extern void ix86_expand_vector_set (bool, rtx, rtx, int); > > @@ -430,8 +431,7 @@ extern rtl_opt_pass > > *make_pass_insert_endbr_and_patchable_area > > (gcc::context *); > > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > > (gcc::context *); > > -extern rtl_opt_pass *make_pass_remove_redundant_vector_load > > - (gcc::context *); > > +extern 
rtl_opt_pass *make_pass_x86_cse (gcc::context *); > > extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); > > extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); > > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > > index 4682db85ce4..8e66362862a 100644 > > --- a/gcc/config/i386/i386.cc > > +++ b/gcc/config/i386/i386.cc > > @@ -12439,7 +12439,7 @@ ix86_tls_index (void) > > > > static GTY(()) rtx ix86_tls_symbol; > > > > -static rtx > > +rtx > > ix86_tls_get_addr (void) > > { > > if (!ix86_tls_symbol) > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > index 791f3b9e133..912b942aa1e 100644 > > --- a/gcc/config/i386/i386.h > > +++ b/gcc/config/i386/i386.h > > @@ -2865,6 +2865,9 @@ struct GTY(()) machine_function { > > approximation. */ > > BOOL_BITFIELD tls_descriptor_call_expanded_p : 1; > > > > + /* True if TLS descriptor is called more than once. */ > > + BOOL_BITFIELD tls_descriptor_call_multiple_p : 1; > > + > > /* If true, the current function has a STATIC_CHAIN is placed on the > > stack below the return address. */ > > BOOL_BITFIELD static_chain_on_stack : 1; > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > > index eb526997584..6f15d850c82 100644 > > --- a/gcc/config/i386/i386.md > > +++ b/gcc/config/i386/i386.md > > @@ -901,6 +901,10 @@ (define_attr "i387_cw" > > "roundeven,floor,ceil,trunc,uninitialized,any" > > (define_attr "avx_partial_xmm_update" "false,true" > > (const_string "false")) > > > > +;; Define attribute to indicate 64-bit TLS insns. 
> > +(define_attr "tls64" "gd,ld_base,call,combine,lea,none" > > + (const_string "none")) > > + > > ;; Define attribute to classify add/sub insns that consumes carry flag (CF) > > (define_attr "use_carry" "0,1" (const_string "0")) > > > > @@ -23243,6 +23247,7 @@ (define_insn "*tls_global_dynamic_64_<mode>" > > return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}"; > > } > > [(set_attr "type" "multi") > > + (set_attr "tls64" "gd") > > (set (attr "length") > > (symbol_ref "TARGET_X32 ? 15 : 16"))]) > > > > @@ -23281,7 +23286,11 @@ (define_expand "@tls_global_dynamic_64_<mode>" > > UNSPEC_TLS_GD) > > (clobber (match_operand:P 3 "register_operand"))])] > > "TARGET_64BIT" > > - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") > > +{ > > + if (ix86_tls_descriptor_calls_expanded_in_cfun) > > + cfun->machine->tls_descriptor_call_multiple_p = true; > > + ix86_tls_descriptor_calls_expanded_in_cfun = true; > > +}) > > > > (define_insn "*tls_local_dynamic_base_32_gnu" > > [(set (match_operand:SI 0 "register_operand" "=a") > > @@ -23343,6 +23352,7 @@ (define_insn "*tls_local_dynamic_base_64_<mode>" > > return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}"; > > } > > [(set_attr "type" "multi") > > + (set_attr "tls64" "ld_base") > > (set_attr "length" "12")]) > > > > (define_insn "*tls_local_dynamic_base_64_largepic" > > @@ -23376,7 +23386,11 @@ (define_expand "@tls_local_dynamic_base_64_<mode>" > > (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE) > > (clobber (match_operand:P 2 "register_operand"))])] > > "TARGET_64BIT" > > - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") > > +{ > > + if (ix86_tls_descriptor_calls_expanded_in_cfun) > > + cfun->machine->tls_descriptor_call_multiple_p = true; > > + ix86_tls_descriptor_calls_expanded_in_cfun = true; > > +}) > > > > ;; Local dynamic of a single variable is a lose. Show combine how > > ;; to convert that back to global dynamic. 
> > @@ -23570,6 +23584,8 @@ (define_expand "@tls_dynamic_gnu2_64_<mode>" > > "TARGET_64BIT && TARGET_GNU2_TLS" > > { > > operands[2] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : > > operands[0]; > > + if (ix86_tls_descriptor_calls_expanded_in_cfun) > > + cfun->machine->tls_descriptor_call_multiple_p = true; > > ix86_tls_descriptor_calls_expanded_in_cfun = true; > > }) > > > > @@ -23581,6 +23597,7 @@ (define_insn "*tls_dynamic_gnu2_lea_64_<mode>" > > "lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}" > > [(set_attr "type" "lea") > > (set_attr "mode" "<MODE>") > > + (set_attr "tls64" "lea") > > (set_attr "length" "7") > > (set_attr "length_address" "4")]) > > > > @@ -23594,6 +23611,7 @@ (define_insn "*tls_dynamic_gnu2_call_64_<mode>" > > "TARGET_64BIT && TARGET_GNU2_TLS" > > "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}" > > [(set_attr "type" "call") > > + (set_attr "tls64" "call") > > (set_attr "length" "2") > > (set_attr "length_address" "0")]) > > > > @@ -23615,7 +23633,8 @@ (define_insn_and_split > > "*tls_dynamic_gnu2_combine_64_<mode>" > > { > > operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : > > operands[0]; > > emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1])); > > -}) > > +} > > + [(set_attr "tls64" "combine")]) > > > > (define_split > > [(match_operand 0 "tls_address_pattern")] > > diff --git a/gcc/testsuite/g++.target/i386/pr81501-1.C > > b/gcc/testsuite/g++.target/i386/pr81501-1.C > > new file mode 100644 > > index 00000000000..b2e89f4a5f0 > > --- /dev/null > > +++ b/gcc/testsuite/g++.target/i386/pr81501-1.C > > @@ -0,0 +1,16 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-std=c++14 -mtls-dialect=gnu -O2 -fpic -fplt" } */ > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { > > target { ! 
ia32 } } } } */ > > + > > +struct foo > > +{ > > + foo(); > > + ~foo(); > > +}; > > + > > +foo * > > +test () > > +{ > > + static thread_local foo foo_tls; > > + return &foo_tls; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-1a.c > > b/gcc/testsuite/gcc.target/i386/pr81501-1a.c > > new file mode 100644 > > index 00000000000..30b4642a9ee > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-1a.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */ > > + > > +void a(long *); > > +int b(void); > > +void c(void); > > +static __thread long e; > > +long > > +d(void) > > +{ > > + a(&e); > > + if (b()) > > + c(); > > + return e; > > +} > > + > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { > > target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-1b.c > > b/gcc/testsuite/gcc.target/i386/pr81501-1b.c > > new file mode 100644 > > index 00000000000..de25f226990 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-1b.c > > @@ -0,0 +1,6 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */ > > + > > +#include "pr81501-1a.c" > > + > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*e@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! 
ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-2a.c > > b/gcc/testsuite/gcc.target/i386/pr81501-2a.c > > new file mode 100644 > > index 00000000000..a06302a468f > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-2a.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */ > > + > > +void a(long *); > > +int b(void); > > +void c(void); > > +extern __thread long e; > > +long > > +d(void) > > +{ > > + a(&e); > > + if (b()) > > + c(); > > + return e; > > +} > > + > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { > > target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-2b.c > > b/gcc/testsuite/gcc.target/i386/pr81501-2b.c > > new file mode 100644 > > index 00000000000..4afb7426c81 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-2b.c > > @@ -0,0 +1,6 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */ > > + > > +#include "pr81501-2a.c" > > + > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*e@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! 
ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-3.c > > b/gcc/testsuite/gcc.target/i386/pr81501-3.c > > new file mode 100644 > > index 00000000000..d4220630900 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-3.c > > @@ -0,0 +1,9 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */ > > + > > +static __thread int local1; > > +int * > > +get_local1 (void) > > +{ > > + return &local1; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-4a.c > > b/gcc/testsuite/gcc.target/i386/pr81501-4a.c > > new file mode 100644 > > index 00000000000..0c655e259ff > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-4a.c > > @@ -0,0 +1,51 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu" } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } > > {^\t?\.} } } */ > > + > > +/* > > +**in_dso: > > +**.LFB[0-9]+: > > +**... > > +** movl %edi, %.* > > +**... > > +** mov(l|q) %(e|r)si, %.* > > +**... > > +** call __tls_get_addr@PLT > > +**... > > +*/ > > + > > +__thread int foo; > > + > > +extern void bar1 (int *, int *); > > +extern void bar2 (int); > > +extern void bar3 (const char *); > > + > > +int > > +in_dso (int n, int *caller_foop) > > +{ > > + int *foop; > > + int result = 0; > > + > > + bar3 ("foo"); /* Make sure PLT is used before > > macros. */ > > + asm ("" ::: "memory"); > > + > > + foop = &foo; > > + > > + if (caller_foop != (void *) 0 && foop != caller_foop) > > + { > > + bar1 (caller_foop, foop); > > + result = 1; > > + } > > + else if (*foop != n) > > + { > > + bar2 (n); > > + result = 1; > > + } > > + > > + *foop = 16; > > + > > + return result; > > +} > > + > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { > > target { ! 
ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-4b.c > > b/gcc/testsuite/gcc.target/i386/pr81501-4b.c > > new file mode 100644 > > index 00000000000..5d35712b70d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-4b.c > > @@ -0,0 +1,6 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */ > > + > > +#include "pr81501-4a.c" > > + > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*\*foo@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-5.c > > b/gcc/testsuite/gcc.target/i386/pr81501-5.c > > new file mode 100644 > > index 00000000000..7f666e1c006 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-5.c > > @@ -0,0 +1,13 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */ > > + > > +extern __thread int __bid_IDEC_glbflags; > > +extern long __bid64qq_div_bid_y_0_1; > > +extern void get_BID64(int *); > > +void > > +__bid64qq_div(void) > > +{ > > + if (__bid64qq_div_bid_y_0_1) > > + __bid_IDEC_glbflags |= 1; > > + get_BID64(&__bid_IDEC_glbflags); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-6a.c > > b/gcc/testsuite/gcc.target/i386/pr81501-6a.c > > new file mode 100644 > > index 00000000000..db8acf82883 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-6a.c > > @@ -0,0 +1,67 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu" } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } > > {^\t?\.} } } */ > > + > > +/* > > +**in_dso: > > +**.LFB[0-9]+: > > +**... > > +** mov(l|q) %(e|r)dx, %.* > > +**... > > +** movl %edi, %.* > > +**... > > +** mov(l|q) %(e|r)si, %.* > > +**... > > +** call __tls_get_addr@PLT > > +**... 
> > +*/ > > + > > +__thread int foo; > > +__thread int bar; > > + > > +extern void fun1 (int *, int *); > > +extern void fun2 (int); > > +extern void fun3 (const char *); > > + > > +int > > +in_dso (int n, int *caller_foop, int *caller_barp) > > +{ > > + int *foop; > > + int *barp; > > + int result = 0; > > + > > + fun3 ("foo"); /* Make sure PLT is used before > > macros. */ > > + asm ("" ::: "memory"); > > + > > + foop = &foo; > > + barp = &bar; > > + > > + if (caller_foop != (void *) 0 && foop != caller_foop) > > + { > > + fun1 (caller_foop, foop); > > + result = 1; > > + if (caller_barp != (void *) 0 && barp != caller_barp) > > + { > > + fun1 (caller_barp, barp); > > + result = 2; > > + } > > + else if (*barp != n) > > + { > > + fun2 (n); > > + result = 3; > > + } > > + } > > + else if (*foop != n) > > + { > > + fun2 (n); > > + result = 4; > > + } > > + > > + *barp = 16; > > + *foop = 16; > > + > > + return result; > > +} > > + > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 2 { > > target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-6b.c > > b/gcc/testsuite/gcc.target/i386/pr81501-6b.c > > new file mode 100644 > > index 00000000000..0b71f0a9039 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-6b.c > > @@ -0,0 +1,28 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu2" } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } > > {^\t?\.} } } */ > > + > > +/* > > +**in_dso: > > +**.LFB[0-9]+: > > +**... > > +** lea(l|q) bar@TLSDESC\(%rip\), %(e|r)ax > > +** mov(l|q) %(e|r)si, %.* > > +**... > > +** mov(l|q) %(e|r)dx, %.* > > +**... > > +** movl %edi, %.* > > +**... > > +** call \*bar@TLSCALL\(%(e|r)ax\) > > +**... > > +** lea(l|q) foo@TLSDESC\(%rip\), %(e|r)ax > > +**... > > +** call \*foo@TLSCALL\(%(e|r)ax\) > > +**... 
> > +*/ > > + > > +#include "pr81501-6a.c" > > + > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*foo@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */ > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*bar@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-7.c > > b/gcc/testsuite/gcc.target/i386/pr81501-7.c > > new file mode 100644 > > index 00000000000..b2fe5d5eb85 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-7.c > > @@ -0,0 +1,20 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */ > > + > > +extern int __bid_IDEC_glbround, __bid64qqq_fma_save_fpsf; > > +extern __thread int __bid_IDEC_glbflags; > > +typedef struct { > > + long w[2]; > > +} UINT128; > > +extern long __bid64qqq_fma_res_0_1; > > +extern void bid128_ext_fma(UINT128, UINT128); > > +void > > +__bid64qqq_fma(UINT128 y, UINT128 z) > > +{ > > + __bid_IDEC_glbflags = 0; > > + bid128_ext_fma(y, z); > > + if (__bid_IDEC_glbround || __bid64qqq_fma_res_0_1) > > + __bid_IDEC_glbflags |= __bid64qqq_fma_save_fpsf; > > +} > > + > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { > > target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-8a.c > > b/gcc/testsuite/gcc.target/i386/pr81501-8a.c > > new file mode 100644 > > index 00000000000..7e14ef5cd4f > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-8a.c > > @@ -0,0 +1,82 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu" } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } > > {^\t?\.} } } */ > > + > > +/* > > +**in_dso: > > +**.LFB[0-9]+: > > +**... > > +** mov(l|q) %(e|r)dx, %.* > > +**... > > +** movl %edi, %.* > > +**... 
> > +** mov(l|q) %(e|r)si, %.* > > +**... > > +** testb %al, %al > > +**... > > +** call __tls_get_addr@PLT > > +**... > > +*/ > > + > > +#include <stdarg.h> > > + > > +__thread int foo; > > +__thread int bar; > > + > > +extern void fun1 (int *, int *); > > +extern void fun2 (int); > > +extern void fun3 (const char *); > > + > > +int > > +in_dso (int n, int *caller_foop, int *caller_barp, ...) > > +{ > > + int *foop; > > + int *barp; > > + int result; > > + va_list ap; > > + double d; > > + > > + va_start (ap, caller_barp); > > + > > + result = 0; > > + > > + fun3 ("foo"); /* Make sure PLT is used before > > macros. */ > > + asm ("" ::: "memory"); > > + > > + foop = &foo; > > + barp = &bar; > > + > > + if (caller_foop != (void *) 0 && foop != caller_foop) > > + { > > + fun1 (caller_foop, foop); > > + result = 1; > > + if (caller_barp != (void *) 0 && barp != caller_barp) > > + { > > + fun1 (caller_barp, barp); > > + result = 2; > > + } > > + else if (*barp != n) > > + { > > + fun2 (n); > > + result = 3; > > + } > > + } > > + else if (*foop != n) > > + { > > + fun2 (n); > > + result = 4; > > + } > > + > > + *barp = 16; > > + *foop = 16; > > + > > + d = va_arg (ap, double); > > + if (d != 1234.0) > > + result = 10; > > + va_end (ap); > > + > > + return result; > > +} > > + > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 2 { > > target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-8b.c > > b/gcc/testsuite/gcc.target/i386/pr81501-8b.c > > new file mode 100644 > > index 00000000000..778b2fb3507 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-8b.c > > @@ -0,0 +1,31 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu2" } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { ! 
ia32 } } > > {^\t?\.} } } */ > > + > > +/* > > +**in_dso: > > +**.LFB[0-9]+: > > +**... > > +** mov(l|q) %(e|r)si, %.* > > +**... > > +** mov(l|q) %(e|r)dx, %.* > > +**... > > +** movl %edi, %.* > > +**... > > +** testb %al, %al > > +**... > > +** lea(l|q) bar@TLSDESC\(%rip\), %(e|r)ax > > +**... > > +** call \*bar@TLSCALL\(%(e|r)ax\) > > +**... > > +** lea(l|q) foo@TLSDESC\(%rip\), %(e|r)ax > > +**... > > +** call \*foo@TLSCALL\(%(e|r)ax\) > > +**... > > +*/ > > + > > +#include "pr81501-8a.c" > > + > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*foo@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */ > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*bar@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-9a.c > > b/gcc/testsuite/gcc.target/i386/pr81501-9a.c > > new file mode 100644 > > index 00000000000..c5de37009c1 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-9a.c > > @@ -0,0 +1,39 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=x86-64-v4 -fpic -fplt -mtls-dialect=gnu" } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } > > {^\t?\.} } } */ > > + > > +/* > > +**foo: > > +**.LFB[0-9]+: > > +**... > > +** vpbroadcastb %edi, %zmm0 > > +**... > > +** call __tls_get_addr@PLT > > +**... 
> > +*/ > > + > > +#include <immintrin.h> > > + > > +extern __m512i sinkz; > > +extern __m256i sinky; > > +extern __m128i sinkx; > > +extern void func1 (long *); > > +extern int func2 (void); > > +extern void func3 (void); > > +static __thread long var; > > + > > +long > > +foo (char c) > > +{ > > + func1 (&var); > > + if (func2 ()) > > + func3 (); > > + sinkx = _mm_set1_epi8 (c); > > + sinkz = _mm512_set1_epi8 (c); > > + sinky = _mm256_set1_epi8 (c); > > + return var; > > +} > > + > > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */ > > +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { > > target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr81501-9b.c > > b/gcc/testsuite/gcc.target/i386/pr81501-9b.c > > new file mode 100644 > > index 00000000000..711b177bc1e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr81501-9b.c > > @@ -0,0 +1,22 @@ > > +/* { dg-do compile { target *-*-linux* } } */ > > +/* { dg-options "-O2 -march=x86-64-v4 -fpic -fplt -mtls-dialect=gnu2" } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } > > {^\t?\.} } } */ > > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > > + > > +/* > > +**foo: > > +**.LFB[0-9]+: > > +**... > > +** vpbroadcastb %edi, %zmm0 > > +**... > > +** lea(l|q) var@TLSDESC\(%rip\), %(e|r)ax > > +**... > > +** call \*var@TLSCALL\(%(e|r)ax\) > > +**... > > +*/ > > + > > +#include "pr81501-9a.c" > > + > > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */ > > +/* { dg-final { scan-assembler-times "call\[ > > \t\]\\*var@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */ > > -- > > 2.50.1 > > > > > -- > BR, > Hongtao -- H.J.