Hi! We right now ICE with -mcmodel=large -fpic on x86_64 on TLS GD and LD sequences, because obviously we can't call __tls_get_addr@plt there from code potentially more than 2GB away from the PLT slot.
The attached patches add support for that in gcc and also teaches linker about those, because otherwise the linker will fail if you try to link such -mcmodel=large -fpic code into binaries or PIEs. To make transitions possible, we emit always leaq foo@tlsgd(%rip), %rdi movabsq $__tls_get_addr@pltoff, %rax addq $rbx, %rax call *%rax resp. leaq foo@tlsld(%rip), %rdi movabsq $__tls_get_addr@pltoff, %rax addq $rbx, %rax call *%rax sequences (22 bytes, 6 bytes longer than what we do for TLSGD for normal libraries). Bootstrapped/regtested on x86_64-linux and i686-linux, attached is also the sources I've used to test all the 3 different transitions. Ok for trunk and 4.8 branch (and binutils trunk)? Jakub
2013-08-13 Jakub Jelinek <ja...@redhat.com> PR target/58067 * config/i386/i386.md (*tls_global_dynamic_64_largepic): New insn. (*tls_local_dynamic_base_64_largepic): Likewise. (tls_global_dynamic_64_<mode>, tls_local_dynamic_base_64_<mode>): Remove predicate from call operand. * config/i386/i386.c (ix86_tls_get_addr): For -mcmodel=large -fpic return sum of pic_offset_table_rtx and UNSPEC_PLTOFF of the symbol. --- gcc/config/i386/i386.md.jj 2013-08-13 12:20:20.000000000 +0200 +++ gcc/config/i386/i386.md 2013-08-13 15:03:55.632194607 +0200 @@ -12303,11 +12303,33 @@ (define_insn "*tls_global_dynamic_64_<mo (set (attr "length") (symbol_ref "TARGET_X32 ? 15 : 16"))]) +(define_insn "*tls_global_dynamic_64_largepic" + [(set (match_operand:DI 0 "register_operand" "=a") + (call:DI + (mem:QI (plus:DI (match_operand:DI 2 "register_operand" "b") + (match_operand:DI 3 "immediate_operand" "i"))) + (match_operand 4))) + (unspec:DI [(match_operand 1 "tls_symbolic_operand")] + UNSPEC_TLS_GD)] + "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF + && GET_CODE (operands[3]) == CONST + && GET_CODE (XEXP (operands[3], 0)) == UNSPEC + && XINT (XEXP (operands[3], 0), 1) == UNSPEC_PLTOFF" +{ + output_asm_insn + ("lea{q}\t{%E1@tlsgd(%%rip), %%rdi|rdi, %E1@tlsgd[rip]}", operands); + output_asm_insn ("movabs{q}\t{%3, %%rax|rax, %3}", operands); + output_asm_insn ("add{q}\t{%2, %%rax|rax, %2}", operands); + return "call\t{*%%rax|rax}"; +} + [(set_attr "type" "multi") + (set_attr "length" "22")]) + (define_expand "tls_global_dynamic_64_<mode>" [(parallel [(set (match_operand:P 0 "register_operand") (call:P - (mem:QI (match_operand 2 "constant_call_address_operand")) + (mem:QI (match_operand 2)) (const_int 0))) (unspec:P [(match_operand 1 "tls_symbolic_operand")] UNSPEC_TLS_GD)])] @@ -12365,11 +12387,32 @@ (define_insn "*tls_local_dynamic_base_64 [(set_attr "type" "multi") (set_attr "length" "12")]) +(define_insn "*tls_local_dynamic_base_64_largepic" + [(set (match_operand:DI 0 "register_operand" "=a") + (call:DI + (mem:QI (plus:DI (match_operand:DI 1 "register_operand" "b") + (match_operand:DI 2 "immediate_operand" "i"))) + (match_operand 3))) + (unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)] + "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF + && GET_CODE (operands[2]) == CONST + && GET_CODE (XEXP (operands[2], 0)) == UNSPEC + && XINT (XEXP (operands[2], 0), 1) == UNSPEC_PLTOFF" +{ + output_asm_insn + ("lea{q}\t{%&@tlsld(%%rip), %%rdi|rdi, %&@tlsld[rip]}", operands); + output_asm_insn ("movabs{q}\t{%2, %%rax|rax, %2}", operands); + output_asm_insn ("add{q}\t{%1, %%rax|rax, %1}", operands); + return "call\t{*%%rax|rax}"; +} + [(set_attr "type" "multi") + (set_attr "length" "22")]) + (define_expand "tls_local_dynamic_base_64_<mode>" [(parallel [(set (match_operand:P 0 "register_operand") (call:P - (mem:QI (match_operand 1 "constant_call_address_operand")) + (mem:QI (match_operand 1)) (const_int 0))) (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])] "TARGET_64BIT") --- gcc/config/i386/i386.c.jj 2013-08-13 12:20:20.000000000 +0200 +++ gcc/config/i386/i386.c 2013-08-13 14:42:32.449334139 +0200 @@ -13220,6 +13220,14 @@ ix86_tls_get_addr (void) ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym); } + if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF) + { + rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol), + UNSPEC_PLTOFF); + return gen_rtx_PLUS (Pmode, pic_offset_table_rtx, + gen_rtx_CONST (Pmode, unspec)); + } + return ix86_tls_symbol; }
2013-08-13 Jakub Jelinek <ja...@redhat.com> * elf64-x86-64.c (elf_x86_64_check_tls_transition): Allow 64-bit -mcmodel=large -fpic TLS GD and LD sequences. (elf_x86_64_relocate_section): Handle -mcmodel=large -fpic TLS GD and LD sequences in GD->LE, GD->IE and LD->LE transitions. --- bfd/elf64-x86-64.c.jj 2013-08-13 13:41:41.000000000 +0200 +++ bfd/elf64-x86-64.c 2013-08-13 16:11:28.902439602 +0200 @@ -1087,6 +1087,7 @@ elf_x86_64_check_tls_transition (bfd *ab { unsigned int val; unsigned long r_symndx; + bfd_boolean largepic = FALSE; struct elf_link_hash_entry *h; bfd_vma offset; struct elf_x86_64_link_hash_table *htab; @@ -1124,16 +1125,32 @@ elf_x86_64_check_tls_transition (bfd *ab can transit to different access model. For 32bit, only leaq foo@tlsgd(%rip), %rdi .word 0x6666; rex64; call __tls_get_addr - can transit to different access model. */ + can transit to different access model. For largepic + we also support: + leaq foo@tlsgd(%rip), %rdi + movabsq $__tls_get_addr@pltoff, %rax + addq $rbx, %rax + call *%rax. */ static const unsigned char call[] = { 0x66, 0x66, 0x48, 0xe8 }; static const unsigned char leaq[] = { 0x66, 0x48, 0x8d, 0x3d }; - if ((offset + 12) > sec->size - || memcmp (contents + offset + 4, call, 4) != 0) + if ((offset + 12) > sec->size) return FALSE; - if (ABI_64_P (abfd)) + if (memcmp (contents + offset + 4, call, 4) != 0) + { + if (!ABI_64_P (abfd) + || (offset + 19) > sec->size + || offset < 3 + || memcmp (contents + offset - 3, leaq + 1, 3) != 0 + || memcmp (contents + offset + 4, "\x48\xb8", 2) != 0 + || memcmp (contents + offset + 14, "\x48\x01\xd8\xff\xd0", 5) + != 0) + return FALSE; + largepic = TRUE; + } + else if (ABI_64_P (abfd)) { if (offset < 4 || memcmp (contents + offset - 4, leaq, 4) != 0) @@ -1151,16 +1168,31 @@ elf_x86_64_check_tls_transition (bfd *ab /* Check transition from LD access model. Only leaq foo@tlsld(%rip), %rdi; call __tls_get_addr - can transit to different access model. */ + can transit to different access model. For largepic + we also support: + leaq foo@tlsld(%rip), %rdi + movabsq $__tls_get_addr@pltoff, %rax + addq $rbx, %rax + call *%rax. */ static const unsigned char lea[] = { 0x48, 0x8d, 0x3d }; if (offset < 3 || (offset + 9) > sec->size) return FALSE; - if (memcmp (contents + offset - 3, lea, 3) != 0 - || 0xe8 != *(contents + offset + 4)) + if (memcmp (contents + offset - 3, lea, 3) != 0) return FALSE; + + if (0xe8 != *(contents + offset + 4)) + { + if (!ABI_64_P (abfd) + || (offset + 19) > sec->size + || memcmp (contents + offset + 4, "\x48\xb8", 2) != 0 + || memcmp (contents + offset + 14, "\x48\x01\xd8\xff\xd0", 5) + != 0) + return FALSE; + largepic = TRUE; + } } r_symndx = htab->r_sym (rel[1].r_info); @@ -1172,8 +1204,10 @@ elf_x86_64_check_tls_transition (bfd *ab may be versioned. */ return (h != NULL && h->root.root.string != NULL - && (ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PC32 - || ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PLT32) + && (largepic + ? ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PLTOFF64 + : (ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PC32 + || ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PLT32)) && (strncmp (h->root.root.string, "__tls_get_addr", 14) == 0)); @@ -3947,8 +3981,26 @@ direct: .word 0x6666; rex64; call __tls_get_addr into: movl %fs:0, %eax - leaq foo@tpoff(%rax), %rax */ - if (ABI_64_P (output_bfd)) + leaq foo@tpoff(%rax), %rax + For largepic, change: + leaq foo@tlsgd(%rip), %rdi + movabsq $__tls_get_addr@pltoff, %rax + addq %rbx, %rax + call *%rax + into: + movq %fs:0, %rax + leaq foo@tpoff(%rax), %rax + nopw 0x0(%rax,%rax,1) */ + int largepic = 0; + if (ABI_64_P (output_bfd) + && contents[roff + 5] == (bfd_byte) '\xb8') + { + memcpy (contents + roff - 3, + "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x8d\x80" + "\0\0\0\0\x66\x0f\x1f\x44\0", 22); + largepic = 1; + } + else if (ABI_64_P (output_bfd)) memcpy (contents + roff - 4, "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x8d\x80\0\0\0", 16); @@ -3958,8 +4010,8 @@ direct: 15); bfd_put_32 (output_bfd, elf_x86_64_tpoff (info, relocation), - contents + roff + 8); - /* Skip R_X86_64_PC32/R_X86_64_PLT32. */ + contents + roff + 8 + largepic); + /* Skip R_X86_64_PC32/R_X86_64_PLT32/R_X86_64_PLTOFF64. */ rel++; continue; } @@ -4194,8 +4246,26 @@ direct: .word 0x6666; rex64; call __tls_get_addr@plt into: movl %fs:0, %eax - addq foo@gottpoff(%rip), %rax */ - if (ABI_64_P (output_bfd)) + addq foo@gottpoff(%rip), %rax + For largepic, change: + leaq foo@tlsgd(%rip), %rdi + movabsq $__tls_get_addr@pltoff, %rax + addq %rbx, %rax + call *%rax + into: + movq %fs:0, %rax + addq foo@gottpoff(%rax), %rax + nopw 0x0(%rax,%rax,1) */ + int largepic = 0; + if (ABI_64_P (output_bfd) + && contents[roff + 5] == (bfd_byte) '\xb8') + { + memcpy (contents + roff - 3, + "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x03\x05" + "\0\0\0\0\x66\x0f\x1f\x44\0", 22); + largepic = 1; + } + else if (ABI_64_P (output_bfd)) memcpy (contents + roff - 4, "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x03\x05\0\0\0", 16); @@ -4207,12 +4277,13 @@ direct: relocation = (htab->elf.sgot->output_section->vma + htab->elf.sgot->output_offset + off - roff + - largepic - input_section->output_section->vma - input_section->output_offset - 12); bfd_put_32 (output_bfd, relocation, - contents + roff + 8); - /* Skip R_X86_64_PLT32. */ + contents + roff + 8 + largepic); + /* Skip R_X86_64_PLT32/R_X86_64_PLTOFF64. */ rel++; continue; } @@ -4274,16 +4345,29 @@ direct: For 64bit, we change it into: .word 0x6666; .byte 0x66; movq %fs:0, %rax. For 32bit, we change it into: - nopl 0x0(%rax); movl %fs:0, %eax. */ + nopl 0x0(%rax); movl %fs:0, %eax. + For largepic, change: + leaq foo@tlsgd(%rip), %rdi + movabsq $__tls_get_addr@pltoff, %rax + addq %rbx, %rax + call *%rax + into: + data32 data32 data32 nopw %cs:0x0(%rax,%rax,1) + movq %fs:0, %eax */ BFD_ASSERT (r_type == R_X86_64_TPOFF32); - if (ABI_64_P (output_bfd)) + if (ABI_64_P (output_bfd) + && contents[rel->r_offset + 5] == (bfd_byte) '\xb8') + memcpy (contents + rel->r_offset - 3, + "\x66\x66\x66\x66\x2e\x0f\x1f\x84\0\0\0\0\0" + "\x64\x48\x8b\x04\x25\0\0\0", 22); + else if (ABI_64_P (output_bfd)) memcpy (contents + rel->r_offset - 3, "\x66\x66\x66\x64\x48\x8b\x04\x25\0\0\0", 12); else memcpy (contents + rel->r_offset - 3, "\x0f\x1f\x40\x00\x64\x8b\x04\x25\0\0\0", 12); - /* Skip R_X86_64_PC32/R_X86_64_PLT32. */ + /* Skip R_X86_64_PC32/R_X86_64_PLT32/R_X86_64_PLTOFF64. */ rel++; continue; }
__thread int a; static __thread int b; extern __thread int c; int foo () { return a++ + b++ + c++; } int main () { a = 4; b = 5; c = 6; return foo () + foo () - 33; }
__thread int c;