Hi,

On Sat, 19 Oct 2024 at 19:20, Torbjörn SVENSSON
<torbjorn.svens...@foss.st.com> wrote:
>
> With r15-1618-g9f168b412f4, I get the following asm generated for the
> test case:
>
>         .align  1
>         .align  2
>         .global test5
>         .syntax unified
>         .thumb
>         .thumb_func
>         .type   test5, %function
> test5:
>         @ args = 4, pretend = 0, frame = 0
>         @ frame_needed = 0, uses_anonymous_args = 0
>         push    {r4, r5, r6, lr}
>         ldr     r4, [sp, #16]
>         cmp     r4, #0
>         ble     .L37
>         sub     ip, r4, #16
>         adds    r6, r2, r4
>         adds    r5, r1, r4
>         add     r0, r0, r4
>         dlstp.8 lr, r4
> .L39:
>         subs    r2, r5, r4
>         subs    r1, r0, r4
>         vldrb.8 q3, [r1]
>         vldrb.8 q2, [r2]
>         subs    r2, r6, r4
>         mov     r4, ip
>         sub     ip, ip, #16
>         vadd.i8 q3, q3, q2
>         vstrb.8 q3, [r2]
>         vstrb.8 q3, [r3]
>         letp    lr, .L39
> .L37:
>         pop     {r4, r5, r6, pc}
>         .size   test5, .-test5
>
> ...
>
>         .align  1
>         .align  2
>         .global test8
>         .syntax unified
>         .thumb
>         .thumb_func
>         .type   test8, %function
> test8:
>         @ args = 4, pretend = 0, frame = 0
>         @ frame_needed = 0, uses_anonymous_args = 0
>         push    {r4, lr}
>         ldr     r4, [sp, #8]
>         cmp     r3, #0
>         ble     .L59
>         dlstp.32        lr, r3
> .L61:
>         vldrw.32        q3, [r0], #16
>         vctp.32 r4
>         vpst
>         vldrwt.32       q2, [r1], #16
>         adds    r4, r4, #1
>         vadd.i32        q3, q3, q2
>         vstrw.32        q3, [r2], #16
>         letp    lr, .L61
> .L59:
>         pop     {r4, pc}
>         .size   test8, .-test8
>
>
>
>
> With r15-1619-g3b9b8d6cfdf, I instead get:
>
>         .align  1
>         .align  2
>         .global test5
>         .syntax unified
>         .thumb
>         .thumb_func
>         .type   test5, %function
> test5:
>         @ args = 4, pretend = 0, frame = 0
>         @ frame_needed = 0, uses_anonymous_args = 0
>         push    {r4, r5, r6, lr}
>         ldr     ip, [sp, #16]
>         cmp     ip, #0
>         ble     .L37
>         mov     r6, r3
>         sub     r3, ip, #16
>         add     r5, r2, ip
>         add     r4, r1, ip
>         add     r0, r0, ip
>         dlstp.8 lr, ip
> .L39:
>         sub     r2, r4, ip
>         sub     r1, r0, ip
>         vldrb.8 q3, [r1]
>         vldrb.8 q2, [r2]
>         sub     r2, r5, ip
>         mov     ip, r3
>         subs    r3, r3, #16
>         vadd.i8 q3, q3, q2
>         vstrb.8 q3, [r2]
>         vstrb.8 q3, [r6]
>         letp    lr, .L39
> .L37:
>         pop     {r4, r5, r6, pc}
>         .size   test5, .-test5
>
> ...
>
>         .align  1
>         .align  2
>         .global test8
>         .syntax unified
>         .thumb
>         .thumb_func
>         .type   test8, %function
> test8:
>         @ args = 4, pretend = 0, frame = 0
>         @ frame_needed = 0, uses_anonymous_args = 0
>         push    {lr}
>         ldr     ip, [sp, #4]
>         cmp     r3, #0
>         ble     .L59
>         dlstp.32        lr, r3
> .L61:
>         vldrw.32        q3, [r0], #16
>         vctp.32 ip
>         vpst
>         vldrwt.32       q2, [r1], #16
>         add     ip, ip, #1
>         vadd.i32        q3, q3, q2
>         vstrw.32        q3, [r2], #16
>         letp    lr, .L61
> .L59:
>         ldr     pc, [sp], #4
>         .size   test8, .-test8
>
>
>
> As can be seen, with r15-1619-g3b9b8d6cfdf, the compiler now uses ip in ways
> that it did not before. I think this part is fine.
> It also, for some reason, decides to move r3 into r6 in test5 and then uses
> r6 later for the second vstrb.8. While I suppose it does work, it costs one
> extra mov, so the generated code is slightly bigger.
>
> With below patch, I no longer see any failure reported for arm-none-eabi.
>
> Even with the slight size increase for test5, is it ok for trunk?
>
> --
>
> Since r15-1619-g3b9b8d6cfdf, test5 and test8 fail because "ip" might
> be used and r3 might be moved to another register for later
> dereference.
>
> gcc/testsuite/ChangeLog:
>
>         PR testsuite/116623
>         * gcc.target/arm/mve/dlstp-compile-asm-2.c: Align test5 and
>         test8 with changes in r15-1619-g3b9b8d6cfdf.
>

Thanks for looking into this. That commit (r15-1619-g3b9b8d6cfdf)
caused several regressions, including this one.

LGTM.

Christophe


> Signed-off-by: Torbjörn SVENSSON <torbjorn.svens...@foss.st.com>
> ---
>  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c | 10 ++++++----
>  1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
> index 84f4a2fc4f9..c62f592a60d 100644
> --- a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
> +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
> @@ -147,15 +147,17 @@ void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
>  /*
>  ** test5:
>  **...
> -**     dlstp.8 lr, r[0-9]+
> +**     (?:mov  (r[0-9]+), r3)?
> +**...
> +**     dlstp.8 lr, (?:r[0-9]+|ip)
>  **...
>  **     vldrb.8 q[0-9]+, \[r1\]
>  **     vldrb.8 q[0-9]+, \[r2\]
>  **...
>  **     vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
>  **...
> -**     vstrb.8 \1, \[r2\]
> -**     vstrb.8 \1, \[r3\]
> +**     vstrb.8 \2, \[r2\]
> +**     vstrb.8 \2, \[(r3|\1)\]
>  **     letp    lr, .*
>  **...
>  */
> @@ -247,7 +249,7 @@ void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
>  **...
>  **     dlstp.32        lr, r3
>  **     vldrw.32        q[0-9]+, \[r0\], #16
> -**     vctp.32 r4
> +**     vctp.32 (?:r4|ip)
>  **     vpst
>  **     vldrwt.32       q[0-9]+, \[r1\], #16
>  **...
> --
> 2.25.1
>

Reply via email to