https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121495
--- Comment #14 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
With early-ra enabled:
add x1, x0, 24
mov x2, 32
str x2, [x0], 8 // post-increments x0 by 8 before the add of 520 below; wrong spot?
adrp x2, .LC291
movi v24.2s, 0
fmov d31, x0 // ok
add x3, x0, 520 // x3 = (incoming x0)+8+520, since the str above already post-incremented x0; seems wrong.
ldr q28, [x2, #:lo12:.LC291]
fmov d30, d31
adrp x0, .LC290
adrp x2, .LC292
ldr q29, [x0, #:lo12:.LC290]
ins v30.d[1], x1
fmov x0, d31
ldr q27, [x2, #:lo12:.LC292]
Without (-mearly-ra=none):
mov x4, x0
mov x2, 32
add x3, x4, 520 // x0+520
add x1, x0, 24 // x1 = x0+24
adrp x0, .LC290
movi v30.2s, 0
str x2, [x4], 8 // x0+8
ldr q25, [x0, #:lo12:.LC290]
fmov d29, x4 // x0+8
adrp x0, .LC291
ins v29.d[1], x1
adrp x1, .LC292
ldr q26, [x0, #:lo12:.LC291]
mov x0, x4
ldr q27, [x1, #:lo12:.LC292]
So the bug is not in early-ra as far as I can tell. Let me look into why.
So in summary, it looks like there is an additional iteration through the loop.
Is there a way for you to patch the assembly and try again with the patched version?
E.g., compile with -S, apply s/520/512/ to the generated assembly, and see if that works?