https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120653

--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
It seems _dl_start_final is inlined into _dl_start in this configuration, and
the important difference (going from -fstrict-aliasing to -fno-strict-aliasing) is:
@@ -1206,11 +1207,8 @@ _dl_start:
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
-       leaq    __ehdr_start(%rip), %rsi
        leaq    _end(%rip), %rax
-       movq    %rsi, %xmm2
        movq    %rax, %xmm3
-       punpcklqdq      %xmm3, %xmm2
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq   %r15
@@ -1225,214 +1223,176 @@ _dl_start:
        .cfi_offset 12, -48
        .cfi_offset 3, -56
        movq    %rdi, -136(%rbp)
+       movq    .LC31(%rip), %xmm2
+       punpcklqdq      %xmm3, %xmm2
        movaps  %xmm2, -128(%rbp)
        rdtsc
+       leaq    __ehdr_start(%rip), %rdi
        andb    $-33, 854+_dl_rtld_map(%rip)
-       leaq    64+_dl_rtld_map(%rip), %rcx
-       movl    $1879048191, %r8d
-       movl    $1879048233, %r9d
+       movq    %rdi, _dl_rtld_map(%rip)
        salq    $32, %rdx
-       movq    %rsi, _dl_rtld_map(%rip)
        orq     %rdx, %rax
        leaq    _DYNAMIC(%rip), %rdx
        movq    %rax, start_time(%rip)
        movq    _DYNAMIC(%rip), %rax
...
@@ -6289,9 +6296,13 @@ _rtld_global_ro:
        .globl  _rtld_local_ro
        .hidden _rtld_local_ro
        .set    _rtld_local_ro,_rtld_global_ro
+       .section        .data.rel.ro.local
+       .align 8
+.LC31:
+       .quad   __ehdr_start
        .section        .rodata.cst16,"aM",@progbits,16
        .align 16
-.LC74:
+.LC75:
        .quad   -1
        .quad   0
        .hidden __rtld_libc_freeres
The compiler has decided to vectorize the __ehdr_start and _end stores in both
cases; the optimized dump contains similar code in each, like:
  _359 = (long unsigned int) &_end;
  _358 = (long unsigned int) &__ehdr_start;
  _357 = {_358, _359};
...
  MEM <vector(2) long unsigned int> [(long unsigned int *)&_dl_rtld_map + 912B]
= _357;
Before RA we have in both cases something like
(insn 15 3 1129 2 (set (reg/f:DI 253)
        (symbol_ref:DI ("__ehdr_start") [flags 0x42]  <var_decl 0x7fb465c3e1b0
__ehdr_start>)) 84 {*movdi_internal}
     (expr_list:REG_EQUIV (symbol_ref:DI ("__ehdr_start") [flags 0x42] 
<var_decl 0x7fb465c3e1b0 __ehdr_start>)
        (nil)))
(insn 1129 15 16 2 (set (reg/f:DI 252)
        (symbol_ref:DI ("_end") [flags 0x42]  <var_decl 0x7fb465c89000 _end>))
84 {*movdi_internal}
     (expr_list:REG_EQUIV (symbol_ref:DI ("_end") [flags 0x42]  <var_decl
0x7fb465c89000 _end>)
        (nil)))
(insn 16 1129 18 2 (set (reg:V2DI 235 [ _357 ])
        (vec_concat:V2DI (reg/f:DI 253)
            (reg/f:DI 252))) 7525 {vec_concatv2di}
     (expr_list:REG_DEAD (reg/f:DI 252)
        (expr_list:REG_EQUIV (vec_concat:V2DI (symbol_ref:DI ("__ehdr_start")
[flags 0x42]  <var_decl 0x7fb465c3e1b0 __ehdr_start>)
                (symbol_ref:DI ("_end") [flags 0x42]  <var_decl 0x7fb465c89000
_end>))
            (nil))))
i.e. set one pseudo to __ehdr_start, another to _end and do vec_concat on that.
But in the -fno-strict-aliasing case RA decides to spill __ehdr_start into
memory and load it from memory:
(insn 1102 13 1107 2 (set (reg/f:DI 0 ax [258])
        (symbol_ref:DI ("_end") [flags 0x42]  <var_decl 0x7f759e489000 _end>))
84 {*movdi_internal}
     (expr_list:REG_EQUIV (symbol_ref:DI ("_end") [flags 0x42]  <var_decl
0x7f759e489000 _end>)
        (nil)))
(insn 1107 1102 1109 2 (set (reg:DI 22 xmm2 [orig:243 _359 ] [243])
        (mem/u/c:DI (symbol_ref/u:DI ("*.LC31") [flags 0x2]) [0  S8 A64])) 84
{*movdi_internal}
     (nil))
(insn 1109 1107 14 2 (set (reg/f:DI 23 xmm3 [258])
        (reg/f:DI 0 ax [258])) 84 {*movdi_internal}
     (nil))
(insn 14 1109 1108 2 (set (reg:V2DI 22 xmm2 [orig:243 _359 ] [243])
        (vec_concat:V2DI (reg:DI 22 xmm2 [orig:243 _359 ] [243])
            (reg/f:DI 23 xmm3 [258]))) 7525 {vec_concatv2di}
     (expr_list:REG_EQUIV (vec_concat:V2DI (symbol_ref:DI ("__ehdr_start")
[flags 0x42]  <var_decl 0x7f759e43e1b0 __ehdr_start>)
            (symbol_ref:DI ("_end") [flags 0x42]  <var_decl 0x7f759e489000
_end>))
        (nil)))
while in the -fstrict-aliasing case it doesn't:
(insn 15 3 1129 2 (set (reg/f:DI 4 si [253])
        (symbol_ref:DI ("__ehdr_start") [flags 0x42]  <var_decl 0x7fb465c3e1b0
__ehdr_start>)) 84 {*movdi_internal}
     (expr_list:REG_EQUIV (symbol_ref:DI ("__ehdr_start") [flags 0x42] 
<var_decl 0x7fb465c3e1b0 __ehdr_start>)
        (nil)))
(insn 1129 15 1134 2 (set (reg/f:DI 0 ax [252])
        (symbol_ref:DI ("_end") [flags 0x42]  <var_decl 0x7fb465c89000 _end>))
84 {*movdi_internal}
     (expr_list:REG_EQUIV (symbol_ref:DI ("_end") [flags 0x42]  <var_decl
0x7fb465c89000 _end>)
        (nil)))
(insn 1134 1129 1136 2 (set (reg:DI 22 xmm2 [orig:235 _357 ] [235])
        (reg/f:DI 4 si [253])) 84 {*movdi_internal}
     (nil))
(insn 1136 1134 16 2 (set (reg/f:DI 23 xmm3 [252])
        (reg/f:DI 0 ax [252])) 84 {*movdi_internal}
     (nil))
(insn 16 1136 1135 2 (set (reg:V2DI 22 xmm2 [orig:235 _357 ] [235])
        (vec_concat:V2DI (reg:DI 22 xmm2 [orig:235 _357 ] [235])
            (reg/f:DI 23 xmm3 [252]))) 7525 {vec_concatv2di}
     (expr_list:REG_EQUIV (vec_concat:V2DI (symbol_ref:DI ("__ehdr_start")
[flags 0x42]  <var_decl 0x7fb465c3e1b0 __ehdr_start>)
            (symbol_ref:DI ("_end") [flags 0x42]  <var_decl 0x7fb465c89000
_end>))
        (nil)))

I don't really see anything wrong here on the GCC side.
Perhaps rtld.c should be compiled with -fno-tree-vectorize -fno-slp-vectorize
if the compiler supports those switches, or _dl_start and _dl_start_final
should use the optimize attribute to achieve the same effect, again if the
compiler supports that.

Reply via email to