This is the updated patch of pr58066-3.patch. The calls added in the templates of tls_local_dynamic_base_32 and tls_global_dynamic_32 in pr58066-3.patch are used to prevent sched2 from moving sp setting across implicit tls calls, but those calls make the combine of UNSPEC_TLS_LD_BASE and UNSPEC_DTPOFF difficult, so that the optimization in tls_local_dynamic_32_once to convert local_dynamic to global_dynamic mode for single tls reference cannot take effect. In the updated patch, I remove those calls from insn templates and add "reg:SI SP_REG" explicitly in the templates of UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. It solves the sched2 and combine problems above, and now the optimization in tls_local_dynamic_32_once works.
bootstrapped ok on x86_64-linux-gnu. regression is going on. Is it OK if regression passes? Thanks. Wei. ChangeLog: gcc/ 2014-05-07 Wei Mi <w...@google.com> * config/i386/i386.c (ix86_compute_frame_layout): preferred_stack_boundary updated for tls expanded call. * config/i386/i386.md: Set ix86_tls_descriptor_calls_expanded_in_cfun. gcc/testsuite/ 2014-05-07 Wei Mi <w...@google.com> * gcc.target/i386/pr58066.c: New test. Index: testsuite/gcc.target/i386/pr58066.c =================================================================== --- testsuite/gcc.target/i386/pr58066.c (revision 0) +++ testsuite/gcc.target/i386/pr58066.c (revision 0) @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-fPIC -O2" } */ + +/* Check whether the stack frame starting addresses of tls expanded calls + in foo and goo are 16bytes aligned. */ +static __thread char ccc1; +void* foo() +{ + return &ccc1; +} + +__thread char ccc2; +void* goo() +{ + return &ccc2; +} + +/* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 16" 2 } } */ Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 209979) +++ config/i386/i386.c (working copy) @@ -9485,20 +9485,30 @@ ix86_compute_frame_layout (struct ix86_f frame->nregs = ix86_nsaved_regs (); frame->nsseregs = ix86_nsaved_sseregs (); - stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; - preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; - /* 64-bit MS ABI seem to require stack alignment to be always 16 except for function prologues and leaf. */ - if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16) + if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128) && (!crtl->is_leaf || cfun->calls_alloca != 0 || ix86_current_function_calls_tls_descriptor)) { - preferred_alignment = 16; - stack_alignment_needed = 16; crtl->preferred_stack_boundary = 128; crtl->stack_alignment_needed = 128; } + /* preferred_stack_boundary is never updated for call + expanded from tls descriptor. Update it here. We don't update it in + expand stage because according to the comments before + ix86_current_function_calls_tls_descriptor, tls calls may be optimized + away. */ + else if (ix86_current_function_calls_tls_descriptor + && crtl->preferred_stack_boundary < PREFERRED_STACK_BOUNDARY) + { + crtl->preferred_stack_boundary = PREFERRED_STACK_BOUNDARY; + if (crtl->stack_alignment_needed < PREFERRED_STACK_BOUNDARY) + crtl->stack_alignment_needed = PREFERRED_STACK_BOUNDARY; + } + + stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; + preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; gcc_assert (!size || stack_alignment_needed); gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 209979) +++ config/i386/i386.md (working copy) @@ -12530,7 +12530,8 @@ (unspec:SI [(match_operand:SI 1 "register_operand" "b") (match_operand 2 "tls_symbolic_operand") - (match_operand 3 "constant_call_address_operand" "z")] + (match_operand 3 "constant_call_address_operand" "z") + (reg:SI SP_REG)] UNSPEC_TLS_GD)) (clobber (match_scratch:SI 4 "=d")) (clobber (match_scratch:SI 5 "=c")) @@ -12555,11 +12556,14 @@ [(set (match_operand:SI 0 "register_operand") (unspec:SI [(match_operand:SI 2 "register_operand") (match_operand 1 "tls_symbolic_operand") - (match_operand 3 "constant_call_address_operand")] + (match_operand 3 "constant_call_address_operand") + (reg:SI SP_REG)] UNSPEC_TLS_GD)) (clobber (match_scratch:SI 4)) (clobber (match_scratch:SI 5)) - (clobber (reg:CC FLAGS_REG))])]) + (clobber (reg:CC FLAGS_REG))])] + "" + "ix86_tls_descriptor_calls_expanded_in_cfun = true;") (define_insn "*tls_global_dynamic_64_<mode>" [(set (match_operand:P 0 "register_operand" "=a") @@ -12614,13 +12618,15 @@ (const_int 0))) (unspec:P [(match_operand 1 "tls_symbolic_operand")] UNSPEC_TLS_GD)])] - "TARGET_64BIT") + "TARGET_64BIT" + "ix86_tls_descriptor_calls_expanded_in_cfun = true;") (define_insn "*tls_local_dynamic_base_32_gnu" [(set (match_operand:SI 0 "register_operand" "=a") (unspec:SI [(match_operand:SI 1 "register_operand" "b") - (match_operand 2 "constant_call_address_operand" "z")] + (match_operand 2 "constant_call_address_operand" "z") + (reg:SI SP_REG)] UNSPEC_TLS_LD_BASE)) (clobber (match_scratch:SI 3 "=d")) (clobber (match_scratch:SI 4 "=c")) @@ -12646,11 +12652,14 @@ [(set (match_operand:SI 0 "register_operand") (unspec:SI [(match_operand:SI 1 "register_operand") - (match_operand 2 "constant_call_address_operand")] + (match_operand 2 "constant_call_address_operand") + (reg:SI SP_REG)] UNSPEC_TLS_LD_BASE)) (clobber (match_scratch:SI 3)) (clobber (match_scratch:SI 4)) - (clobber (reg:CC FLAGS_REG))])]) + (clobber (reg:CC FLAGS_REG))])] + "" + "ix86_tls_descriptor_calls_expanded_in_cfun = true;") (define_insn "*tls_local_dynamic_base_64_<mode>" [(set (match_operand:P 0 "register_operand" "=a") @@ -12697,7 +12706,8 @@ (mem:QI (match_operand 1)) (const_int 0))) (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])] - "TARGET_64BIT") + "TARGET_64BIT" + "ix86_tls_descriptor_calls_expanded_in_cfun = true;") ;; Local dynamic of a single variable is a lose. Show combine how ;; to convert that back to global dynamic. @@ -12706,7 +12716,8 @@ [(set (match_operand:SI 0 "register_operand" "=a") (plus:SI (unspec:SI [(match_operand:SI 1 "register_operand" "b") - (match_operand 2 "constant_call_address_operand" "z")] + (match_operand 2 "constant_call_address_operand" "z") + (reg:SI SP_REG)] UNSPEC_TLS_LD_BASE) (const:SI (unspec:SI [(match_operand 3 "tls_symbolic_operand")] @@ -12719,7 +12730,8 @@ "" [(parallel [(set (match_dup 0) - (unspec:SI [(match_dup 1) (match_dup 3) (match_dup 2)] + (unspec:SI [(match_dup 1) (match_dup 3) (match_dup 2) + (reg:SI SP_REG)] UNSPEC_TLS_GD)) (clobber (match_dup 4)) (clobber (match_dup 5))