https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91598

--- Comment #4 from Wilco <wilco at gcc dot gnu.org> ---
Changing vmull_lane_s16 and vmlal_lane_s16 so that they no longer use inline
assembler gives the following schedule, which runs 63% faster on Cortex-A53:

        ldr     d2, [x6, x0]
        ldr     d4, [x6, x3]
        ldr     d3, [x6, x2]
        smull   v2.4s, v2.4h, v0.4h[0]
        ldr     d1, [x6, x1]
        smull   v4.4s, v4.4h, v0.4h[1]
        ldr     d16, [x7, x3]
        smull   v3.4s, v3.4h, v0.4h[3]
        ldr     d7, [x7, x2]
        smull   v1.4s, v1.4h, v0.4h[0]
        ldr     d6, [x7, x1]
        ldr     d5, [x7, x0]
        smlal   v4.4s, v16.4h, v0.4h[3]
        ldr     d16, [x4, x3]
        smlal   v3.4s, v7.4h, v0.4h[2]
        ldr     d7, [x4, x2]
        smlal   v1.4s, v6.4h, v0.4h[1]
        ldr     d6, [x4, x1]
        smlal   v2.4s, v5.4h, v0.4h[1]
        ldr     d5, [x4, x0]
        smlal   v4.4s, v16.4h, v0.4h[3]
        ldr     d16, [x5, x3]
        smlal   v3.4s, v7.4h, v0.4h[0]
        ldr     d7, [x5, x2]
        smlal   v1.4s, v6.4h, v0.4h[2]
        ldr     d6, [x5, x1]
        smlal   v2.4s, v5.4h, v0.4h[2]
        ldr     d5, [x5, x0]
        smlal   v4.4s, v16.4h, v0.4h[3]
        smlal   v3.4s, v7.4h, v0.4h[3]
        smlal   v1.4s, v6.4h, v0.4h[2]
        smlal   v2.4s, v5.4h, v0.4h[0]

Reply via email to