https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118328

Diego Russo <Diego.Russo at arm dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |Diego.Russo at arm dot com

--- Comment #12 from Diego Russo <Diego.Russo at arm dot com> ---
Hello,

I was able to test Richard's patch and I'm glad to confirm that it brings the
expected benefit.
I built GCC with the patch and used it to compile the
https://github.com/Fidget-Spinner/cpython/tree/tail-call-gcc-2 branch, which
implements the tail-calling interpreter.
I've also compiled a modified version of that branch which doesn't use the
preserve_none attribute.

We noticed improvements in the code generation. This is the version without
preserve_none:


000000000060600c <_TAIL_CALL_BINARY_OP_ADD_INT>:
  60600c:       f85f0026        ldur    x6, [x1, #-16]
  606010:       aa0103e5        mov     x5, x1
  606014:       900018e1        adrp    x1, 922000 <PyList_Type+0x140>
  606018:       91114021        add     x1, x1, #0x450
  60601c:       aa0003e9        mov     x9, x0
  606020:       f94004c7        ldr     x7, [x6, #8]
  606024:       f9001c03        str     x3, [x0, #56]
  606028:       eb0100ff        cmp     x7, x1
  60602c:       540000a1        b.ne    606040
<_TAIL_CALL_BINARY_OP_ADD_INT+0x34>  // b.any
  606030:       f85f80a8        ldur    x8, [x5, #-8]
  606034:       f9400500        ldr     x0, [x8, #8]
  606038:       eb07001f        cmp     x0, x7
  60603c:       54000080        b.eq    60604c
<_TAIL_CALL_BINARY_OP_ADD_INT+0x40>  // b.none
  606040:       aa0503e1        mov     x1, x5
  606044:       aa0903e0        mov     x0, x9
  606048:       17fffd9a        b       6056b0 <_TAIL_CALL_BINARY_OP>
  60604c:       a9bb7bfd        stp     x29, x30, [sp, #-80]!
  606050:       aa0803e1        mov     x1, x8
  606054:       aa0603e0        mov     x0, x6
  606058:       910003fd        mov     x29, sp
  60605c:       a90153f3        stp     x19, x20, [sp, #16]
  606060:       91003073        add     x19, x3, #0xc
  606064:       aa0203f4        mov     x20, x2
  606068:       a90223e6        stp     x6, x8, [sp, #32]
  60606c:       a90327e3        stp     x3, x9, [sp, #48]
  606070:       f90023e5        str     x5, [sp, #64]
  606074:       97fb2ca4        bl      4d1304 <_PyLong_Add>
  606078:       a94223e6        ldp     x6, x8, [sp, #32]
  60607c:       aa0003e4        mov     x4, x0
  606080:       f94023e5        ldr     x5, [sp, #64]
  606084:       a94327e3        ldp     x3, x9, [sp, #48]
  606088:       b9400100        ldr     w0, [x8]
  60608c:       37f80340        tbnz    w0, #31, 6060f4
<_TAIL_CALL_BINARY_OP_ADD_INT+0xe8>
  606090:       51000400        sub     w0, w0, #0x1
  606094:       b9000100        str     w0, [x8]
  606098:       350002e0        cbnz    w0, 6060f4
<_TAIL_CALL_BINARY_OP_ADD_INT+0xe8>
  60609c:       90001a20        adrp    x0, 94a000 <stat_methods+0x78>
  6060a0:       91176000        add     x0, x0, #0x5d8
  6060a4:       f9544807        ldr     x7, [x0, #10384]
  6060a8:       b4000167        cbz     x7, 6060d4
<_TAIL_CALL_BINARY_OP_ADD_INT+0xc8>
  6060ac:       f9544c02        ldr     x2, [x0, #10392]
  6060b0:       a9021be8        stp     x8, x6, [sp, #32]
  6060b4:       aa0803e0        mov     x0, x8
  6060b8:       52800021        mov     w1, #0x1                        // #1
  6060bc:       f9001be4        str     x4, [sp, #48]
  6060c0:       f90027e3        str     x3, [sp, #72]
  6060c4:       d63f00e0        blr     x7
  6060c8:       a9421be8        ldp     x8, x6, [sp, #32]
  6060cc:       a94327e4        ldp     x4, x9, [sp, #48]
  6060d0:       a9440fe5        ldp     x5, x3, [sp, #64]
  6060d4:       aa0803e0        mov     x0, x8
  6060d8:       a90213e6        stp     x6, x4, [sp, #32]
  6060dc:       a90317e9        stp     x9, x5, [sp, #48]
  6060e0:       f90023e3        str     x3, [sp, #64]
  6060e4:       97fb2c71        bl      4d12a8 <_PyLong_ExactDealloc>
  6060e8:       a94213e6        ldp     x6, x4, [sp, #32]
  6060ec:       a94317e9        ldp     x9, x5, [sp, #48]
  6060f0:       f94023e3        ldr     x3, [sp, #64]
  6060f4:       b94000c0        ldr     w0, [x6]
  6060f8:       37f80300        tbnz    w0, #31, 606158
<_TAIL_CALL_BINARY_OP_ADD_INT+0x14c>
  6060fc:       51000400        sub     w0, w0, #0x1
  606100:       b90000c0        str     w0, [x6]
  606104:       350002a0        cbnz    w0, 606158
<_TAIL_CALL_BINARY_OP_ADD_INT+0x14c>
  606108:       90001a20        adrp    x0, 94a000 <stat_methods+0x78>
  60610c:       91176000        add     x0, x0, #0x5d8
  606110:       f9544807        ldr     x7, [x0, #10384]
  606114:       b4000167        cbz     x7, 606140
<_TAIL_CALL_BINARY_OP_ADD_INT+0x134>
  606118:       f9544c02        ldr     x2, [x0, #10392]
  60611c:       a90213e6        stp     x6, x4, [sp, #32]
  606120:       aa0603e0        mov     x0, x6
  606124:       a90317e9        stp     x9, x5, [sp, #48]
  606128:       52800021        mov     w1, #0x1                        // #1
  60612c:       f90023e3        str     x3, [sp, #64]
  606130:       d63f00e0        blr     x7
  606134:       a94213e6        ldp     x6, x4, [sp, #32]
  606138:       a94317e9        ldp     x9, x5, [sp, #48]
  60613c:       f94023e3        ldr     x3, [sp, #64]
  606140:       aa0603e0        mov     x0, x6
  606144:       a90227e4        stp     x4, x9, [sp, #32]
  606148:       a9030fe5        stp     x5, x3, [sp, #48]
  60614c:       97fb2c57        bl      4d12a8 <_PyLong_ExactDealloc>
  606150:       a94227e4        ldp     x4, x9, [sp, #32]
  606154:       a9430fe5        ldp     x5, x3, [sp, #48]
  606158:       b4000204        cbz     x4, 606198
<_TAIL_CALL_BINARY_OP_ADD_INT+0x18c>
  60615c:       f81f00a4        stur    x4, [x5, #-16]
  606160:       d0000c60        adrp    x0, 794000
<builtin___import____doc__+0x80>
  606164:       79401864        ldrh    w4, [x3, #12]
  606168:       910c0000        add     x0, x0, #0x300
  60616c:       aa1403e2        mov     x2, x20
  606170:       aa1303e3        mov     x3, x19
  606174:       12001c81        and     w1, w4, #0xff
  606178:       a94153f3        ldp     x19, x20, [sp, #16]
  60617c:       53087c84        lsr     w4, w4, #8
  606180:       f861d806        ldr     x6, [x0, w1, sxtw #3]
  606184:       d10020a1        sub     x1, x5, #0x8
  606188:       a8c57bfd        ldp     x29, x30, [sp], #80
  60618c:       aa0903e0        mov     x0, x9
  606190:       aa0603f0        mov     x16, x6
  606194:       d61f0200        br      x16
  606198:       aa1303e3        mov     x3, x19
  60619c:       aa1403e2        mov     x2, x20
  6061a0:       a94153f3        ldp     x19, x20, [sp, #16]
  6061a4:       d10040a1        sub     x1, x5, #0x10
  6061a8:       a8c57bfd        ldp     x29, x30, [sp], #80
  6061ac:       aa0903e0        mov     x0, x9
  6061b0:       17ffe575        b       5ff784 <_TAIL_CALL_error.isra.0>
  6061b4:       d503201f        nop
  6061b8:       d503201f        nop
  6061bc:       d503201f        nop

We can see the callee-save registers x19 and x20 spilled to the stack. A similar
thing happens with the caller-save registers (x3, x4, x5, x9):

...
  606144:       a90227e4        stp     x4, x9, [sp, #32]
  606148:       a9030fe5        stp     x5, x3, [sp, #48]
  60614c:       97fb2c57        bl      4d12a8 <_PyLong_ExactDealloc>
  606150:       a94227e4        ldp     x4, x9, [sp, #32]
  606154:       a9430fe5        ldp     x5, x3, [sp, #48]
...

This is the preserve_none output:

0000000000601be0 <_TAIL_CALL_BINARY_OP_ADD_INT>:
  601be0:       f85f0035        ldur    x21, [x1, #-16]
  601be4:       aa0303f4        mov     x20, x3
  601be8:       f9001c03        str     x3, [x0, #56]
  601bec:       aa0103f3        mov     x19, x1
  601bf0:       b0001901        adrp    x1, 922000 <PyList_Type+0x140>
  601bf4:       91114021        add     x1, x1, #0x450
  601bf8:       f94006a3        ldr     x3, [x21, #8]
  601bfc:       aa0003f7        mov     x23, x0
  601c00:       eb01007f        cmp     x3, x1
  601c04:       540000a1        b.ne    601c18
<_TAIL_CALL_BINARY_OP_ADD_INT+0x38>  // b.any
  601c08:       f85f8276        ldur    x22, [x19, #-8]
  601c0c:       f94006c0        ldr     x0, [x22, #8]
  601c10:       eb03001f        cmp     x0, x3
  601c14:       540000a0        b.eq    601c28
<_TAIL_CALL_BINARY_OP_ADD_INT+0x48>  // b.none
  601c18:       aa1403e3        mov     x3, x20
  601c1c:       aa1303e1        mov     x1, x19
  601c20:       aa1703e0        mov     x0, x23
  601c24:       17fffe0a        b       60144c <_TAIL_CALL_BINARY_OP>
  601c28:       a9bf7bfd        stp     x29, x30, [sp, #-16]!
  601c2c:       2a0403f9        mov     w25, w4
  601c30:       aa0203f8        mov     x24, x2
  601c34:       910003fd        mov     x29, sp
  601c38:       aa1603e1        mov     x1, x22
  601c3c:       aa1503e0        mov     x0, x21
  601c40:       97fb3db1        bl      4d1304 <_PyLong_Add>
  601c44:       aa0003fa        mov     x26, x0
  601c48:       b94002c0        ldr     w0, [x22]
  601c4c:       9100329b        add     x27, x20, #0xc
  601c50:       37f801c0        tbnz    w0, #31, 601c88
<_TAIL_CALL_BINARY_OP_ADD_INT+0xa8>
  601c54:       51000400        sub     w0, w0, #0x1
  601c58:       b90002c0        str     w0, [x22]
  601c5c:       35000160        cbnz    w0, 601c88
<_TAIL_CALL_BINARY_OP_ADD_INT+0xa8>
  601c60:       b0001a40        adrp    x0, 94a000 <stat_methods+0x78>
  601c64:       91176000        add     x0, x0, #0x5d8
  601c68:       f9544803        ldr     x3, [x0, #10384]
  601c6c:       b40000a3        cbz     x3, 601c80
<_TAIL_CALL_BINARY_OP_ADD_INT+0xa0>
  601c70:       f9544c02        ldr     x2, [x0, #10392]
  601c74:       52800021        mov     w1, #0x1                        // #1
  601c78:       aa1603e0        mov     x0, x22
  601c7c:       d63f0060        blr     x3
  601c80:       aa1603e0        mov     x0, x22
  601c84:       97fb3d89        bl      4d12a8 <_PyLong_ExactDealloc>
  601c88:       b94002a0        ldr     w0, [x21]
  601c8c:       37f801c0        tbnz    w0, #31, 601cc4
<_TAIL_CALL_BINARY_OP_ADD_INT+0xe4>
  601c90:       51000400        sub     w0, w0, #0x1
  601c94:       b90002a0        str     w0, [x21]
  601c98:       35000160        cbnz    w0, 601cc4
<_TAIL_CALL_BINARY_OP_ADD_INT+0xe4>
  601c9c:       b0001a40        adrp    x0, 94a000 <stat_methods+0x78>
  601ca0:       91176000        add     x0, x0, #0x5d8
  601ca4:       f9544803        ldr     x3, [x0, #10384]
  601ca8:       b40000a3        cbz     x3, 601cbc
<_TAIL_CALL_BINARY_OP_ADD_INT+0xdc>
  601cac:       f9544c02        ldr     x2, [x0, #10392]
  601cb0:       52800021        mov     w1, #0x1                        // #1
  601cb4:       aa1503e0        mov     x0, x21
  601cb8:       d63f0060        blr     x3
  601cbc:       aa1503e0        mov     x0, x21
  601cc0:       97fb3d7a        bl      4d12a8 <_PyLong_ExactDealloc>
  601cc4:       b40001fa        cbz     x26, 601d00
<_TAIL_CALL_BINARY_OP_ADD_INT+0x120>
  601cc8:       79401a84        ldrh    w4, [x20, #12]
  601ccc:       b0000c80        adrp    x0, 792000 <builtin_all__doc__+0x20>
  601cd0:       91110000        add     x0, x0, #0x440
  601cd4:       aa1b03e3        mov     x3, x27
  601cd8:       12001c81        and     w1, w4, #0xff
  601cdc:       aa1803e2        mov     x2, x24
  601ce0:       a8c17bfd        ldp     x29, x30, [sp], #16
  601ce4:       f81f027a        stur    x26, [x19, #-16]
  601ce8:       f861d805        ldr     x5, [x0, w1, sxtw #3]
  601cec:       53087c84        lsr     w4, w4, #8
  601cf0:       d1002261        sub     x1, x19, #0x8
  601cf4:       aa1703e0        mov     x0, x23
  601cf8:       aa0503f0        mov     x16, x5
  601cfc:       d61f0200        br      x16
  601d00:       a8c17bfd        ldp     x29, x30, [sp], #16
  601d04:       2a1903e4        mov     w4, w25
  601d08:       aa1b03e3        mov     x3, x27
  601d0c:       aa1803e2        mov     x2, x24
  601d10:       d1004261        sub     x1, x19, #0x10
  601d14:       aa1703e0        mov     x0, x23
  601d18:       17fff672        b       5ff6e0 <_TAIL_CALL_error>
  601d1c:       d503201f        nop

In the preserve_none output, the compiler can move things like x3-5 and x9 above
into call-preserved registers (x21+) without penalty.  So the stores before the
call become moves (which can be handled using renaming) and the code after the
call can use those registers directly (so the loads disappear entirely).

Thanks Richard for double checking the output and for the explanation.

I've also run the pyperformance benchmark suite against the two versions of
CPython. The preserve_none implementation brings a 4% performance improvement
(as geometric mean).
Almost all benchmarks show performance improvements, up to 16%.

Please schedule this feature to be fully implemented in GCC, as the CPython
project will need it (the PR for the tail-calling interpreter has been merged
already - just for clang for now).

Thanks

Reply via email to