https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120653
--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> --- Seems _dl_start_final is in this configuration inlined into _dl_start and the important difference is (-fstrict-aliasing to -fno-strict-aliasing): @@ -1206,11 +1207,8 @@ _dl_start: pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 - leaq __ehdr_start(%rip), %rsi leaq _end(%rip), %rax - movq %rsi, %xmm2 movq %rax, %xmm3 - punpcklqdq %xmm3, %xmm2 movq %rsp, %rbp .cfi_def_cfa_register 6 pushq %r15 @@ -1225,214 +1223,176 @@ _dl_start: .cfi_offset 12, -48 .cfi_offset 3, -56 movq %rdi, -136(%rbp) + movq .LC31(%rip), %xmm2 + punpcklqdq %xmm3, %xmm2 movaps %xmm2, -128(%rbp) rdtsc + leaq __ehdr_start(%rip), %rdi andb $-33, 854+_dl_rtld_map(%rip) - leaq 64+_dl_rtld_map(%rip), %rcx - movl $1879048191, %r8d - movl $1879048233, %r9d + movq %rdi, _dl_rtld_map(%rip) salq $32, %rdx - movq %rsi, _dl_rtld_map(%rip) orq %rdx, %rax leaq _DYNAMIC(%rip), %rdx movq %rax, start_time(%rip) movq _DYNAMIC(%rip), %rax ... @@ -6289,9 +6296,13 @@ _rtld_global_ro: .globl _rtld_local_ro .hidden _rtld_local_ro .set _rtld_local_ro,_rtld_global_ro + .section .data.rel.ro.local + .align 8 +.LC31: + .quad __ehdr_start .section .rodata.cst16,"aM",@progbits,16 .align 16 -.LC74: +.LC75: .quad -1 .quad 0 .hidden __rtld_libc_freeres The compiler has decided to vectorize the __ehdr_start and _end stores in both cases, optimized dump has similar code like: _359 = (long unsigned int) &_end; _358 = (long unsigned int) &__ehdr_start; _357 = {_358, _359}; ... 
MEM <vector(2) long unsigned int> [(long unsigned int *)&_dl_rtld_map + 912B] = _357; Before RA we have in both cases something like (insn 15 3 1129 2 (set (reg/f:DI 253) (symbol_ref:DI ("__ehdr_start") [flags 0x42] <var_decl 0x7fb465c3e1b0 __ehdr_start>)) 84 {*movdi_internal} (expr_list:REG_EQUIV (symbol_ref:DI ("__ehdr_start") [flags 0x42] <var_decl 0x7fb465c3e1b0 __ehdr_start>) (nil))) (insn 1129 15 16 2 (set (reg/f:DI 252) (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7fb465c89000 _end>)) 84 {*movdi_internal} (expr_list:REG_EQUIV (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7fb465c89000 _end>) (nil))) (insn 16 1129 18 2 (set (reg:V2DI 235 [ _357 ]) (vec_concat:V2DI (reg/f:DI 253) (reg/f:DI 252))) 7525 {vec_concatv2di} (expr_list:REG_DEAD (reg/f:DI 252) (expr_list:REG_EQUIV (vec_concat:V2DI (symbol_ref:DI ("__ehdr_start") [flags 0x42] <var_decl 0x7fb465c3e1b0 __ehdr_start>) (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7fb465c89000 _end>)) (nil)))) i.e. set one pseudo to __ehdr_start, another to _end and do vec_concat on that. 
But in the -fno-strict-aliasing case RA decides to spill __ehdr_start into memory and load from memory: (insn 1102 13 1107 2 (set (reg/f:DI 0 ax [258]) (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7f759e489000 _end>)) 84 {*movdi_internal} (expr_list:REG_EQUIV (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7f759e489000 _end>) (nil))) (insn 1107 1102 1109 2 (set (reg:DI 22 xmm2 [orig:243 _359 ] [243]) (mem/u/c:DI (symbol_ref/u:DI ("*.LC31") [flags 0x2]) [0 S8 A64])) 84 {*movdi_internal} (nil)) (insn 1109 1107 14 2 (set (reg/f:DI 23 xmm3 [258]) (reg/f:DI 0 ax [258])) 84 {*movdi_internal} (nil)) (insn 14 1109 1108 2 (set (reg:V2DI 22 xmm2 [orig:243 _359 ] [243]) (vec_concat:V2DI (reg:DI 22 xmm2 [orig:243 _359 ] [243]) (reg/f:DI 23 xmm3 [258]))) 7525 {vec_concatv2di} (expr_list:REG_EQUIV (vec_concat:V2DI (symbol_ref:DI ("__ehdr_start") [flags 0x42] <var_decl 0x7f759e43e1b0 __ehdr_start>) (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7f759e489000 _end>)) (nil))) while in the -fstrict-aliasing case it doesn't: (insn 15 3 1129 2 (set (reg/f:DI 4 si [253]) (symbol_ref:DI ("__ehdr_start") [flags 0x42] <var_decl 0x7fb465c3e1b0 __ehdr_start>)) 84 {*movdi_internal} (expr_list:REG_EQUIV (symbol_ref:DI ("__ehdr_start") [flags 0x42] <var_decl 0x7fb465c3e1b0 __ehdr_start>) (nil))) (insn 1129 15 1134 2 (set (reg/f:DI 0 ax [252]) (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7fb465c89000 _end>)) 84 {*movdi_internal} (expr_list:REG_EQUIV (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7fb465c89000 _end>) (nil))) (insn 1134 1129 1136 2 (set (reg:DI 22 xmm2 [orig:235 _357 ] [235]) (reg/f:DI 4 si [253])) 84 {*movdi_internal} (nil)) (insn 1136 1134 16 2 (set (reg/f:DI 23 xmm3 [252]) (reg/f:DI 0 ax [252])) 84 {*movdi_internal} (nil)) (insn 16 1136 1135 2 (set (reg:V2DI 22 xmm2 [orig:235 _357 ] [235]) (vec_concat:V2DI (reg:DI 22 xmm2 [orig:235 _357 ] [235]) (reg/f:DI 23 xmm3 [252]))) 7525 {vec_concatv2di} (expr_list:REG_EQUIV (vec_concat:V2DI (symbol_ref:DI ("__ehdr_start") 
[flags 0x42] <var_decl 0x7fb465c3e1b0 __ehdr_start>) (symbol_ref:DI ("_end") [flags 0x42] <var_decl 0x7fb465c89000 _end>)) (nil))) I don't really see anything wrong here on the GCC side. Perhaps rtld.c should be compiled with -fno-tree-vectorize -fno-slp-vectorize if the compiler supports those switches, or _dl_start and _dl_start_final should use the optimize attribute to achieve the same effect, again if the compiler supports that.