Re: Modifying ARM code generator for elimination of 8bit writes - need help

Rask Ingemann Lambertsen Fri, 21 Jul 2006 06:27:13 -0700

On Thu, Jul 20, 2006 at 04:37:41PM +0200, Rask Ingemann Lambertsen wrote:
> ;; This is primarily a hack for the Nintendo DS external RAM.
> ;;
> ;; QImode move pattern, enabled when both TARGET_ARM and
> ;; TARGET_SWP_BYTE_WRITES hold and at least one of the two operands is a
> ;; register.  The five alternatives, in order, emit:
> ;;   0: mov   - register or immediate ("rI") into a register
> ;;   1: mvn   - constant "K" into a register, printed inverted via %B1
> ;;   2: ldrb  - memory into a register
> ;;   3: swpb  - register into "Q" memory; the scratch (operand 2) is tied
> ;;              to operand 1 ("1"), so the source register is overwritten
> ;;   4: swpb  - register into "Q" memory; the scratch is a separate
> ;;              earlyclobber register ("&r"), so the source survives
> ;; Alternatives 0-2 use "X" for the scratch and never reference %2.
> ;; Byte stores are thus always emitted as swpb rather than as a plain
> ;; byte store.
> (define_insn "_arm_movqi_insn_swp"
>   [(set (match_operand:QI 0 "reg_or_Qmem_operand" "=r,r,r,Q,Q")
>       (match_operand:QI 1 "general_operand" "rI,K,m,r,r"))
>         (clobber (match_scratch:QI 2 "=X,X,X,1,&r"))]
>   "TARGET_ARM && TARGET_SWP_BYTE_WRITES
>    && (   register_operand (operands[0], QImode)
>        || register_operand (operands[1], QImode))"
>   "@
>    mov%?\\t%0, %1
>    mvn%?\\t%0, #%B1
>    ldr%?b\\t%0, %1
>    swp%?b\\t%1, %1, [%|%m0]
>    swp%?b\\t%2, %1, [%|%m0]"
>   [(set_attr "type" "*,*,load1,store1,store1")
>    (set_attr "predicable" "yes")]
> )


I found that this peephole optimization improves the code a whole lot:

;; The register allocator often copies the value to be stored into a
;; fresh register just so that the store can clobber it.  Try to change
;;      mov     r2, r1
;;      swpb    r2, r2, [r0]
;; into
;;      swpb    r2, r1, [r0]
;; (and pretend it is just another way of allocating a scratch register).
;;
;; Operands:
;;   0 - the destination memory
;;   1 - the original source register
;;   2 - the copy of operand 1; the store clobbers it, so it is dead
;;       afterwards and can serve as the scratch of the combined insn
;;   3 - whatever scratch the copy insn carried (unused in the output)
;;
;; The address of operand 0 must mention neither register:
;;   * If it mentions operand 2, the original sequence addresses memory
;;     through the *copied* value, while the combined insn would address
;;     it through the old contents of that register.
;;   * If it mentions operand 1, the combined insn would use the same
;;     register as both the swpb source and its base register, which the
;;     original sequence did not do.
(define_peephole2
  [(parallel
    [(set (match_operand:QI 2 "register_operand")
          (match_operand:QI 1 "register_operand"))
     (clobber (match_scratch:QI 3))])
   (parallel
    [(set (match_operand:QI 0 "memory_operand") (match_dup 2))
     (clobber (match_dup 2))])]
  "TARGET_ARM && TARGET_SWP_BYTE_WRITES
   && !reg_overlap_mentioned_p (operands[2], operands[0])
   && !reg_overlap_mentioned_p (operands[1], operands[0])"
  [(parallel
    [(set (match_dup 0) (match_dup 1))
     (clobber (match_dup 2))])]
)

Another way of improving the code was to swap the order of the two last
alternatives of _arm_movqi_insn_swp. There are a few differences in the
generated code, shown with "1,&r" to the left and "&r,1" to the right:

.L92:                             .L92:
        ldr     r2, [fp, #-144] |         ldr     r1, [fp, #-144]
        ldr     r3, [fp, #-152]           ldr     r3, [fp, #-152]
        cmp     r2, #0          |         cmp     r1, #0
        add     r2, r3, #2                add     r2, r3, #2
        ldreq   r0, [fp, #-144] |         moveq   r0, r1

Above, the left-hand version reloads from memory [fp, #-144] for no apparent reason.

.L141:                                                          .L141:
        ldr     r0, [fp, #-152] |         ldr     r2, [fp, #-152]
        sub     r3, r0, #2      |         sub     r3, r2, #2
        cmp     r5, r3                    cmp     r5, r3
        beq     .L142                     beq     .L142
        cmp     r5, #0                    cmp     r5, #0
        movne   r2, r0          |         beq     .L144
        bne     .L146           |         b       .L146
        b       .L144           <

Some sort of register allocation mismatch: the left-hand version needs an
extra register copy (movne r2, r0).

        beq     .L160           |         beq     .L155
        cmp     r0, #44                   cmp     r0, #44
        cmpne   r0, #59                   cmpne   r0, #59
        beq     .L160           |         beq     .L155
        cmp     r0, #61                   cmp     r0, #61
        cmpne   r0, #43                   cmpne   r0, #43
        bne     .L158                     bne     .L158
                                > .L155:
                                >         mov     ip, #95
                                >         str     r8, [fp, #-120]
                                >         mov     r0, #1
                                >         swpb    r2, ip, [r6]
                                >         b       .L159
.L160:                            .L160:
        mov     r3, #95                   mov     r3, #95
        str     r8, [fp, #-120]           str     r8, [fp, #-120]
        mov     r0, #1                    mov     r0, #1
        swpb    r1, r3, [r6]              swpb    r1, r3, [r6]
        b       .L159                     b       .L159

Code duplication in the right-hand version (.L155 repeats .L160 with
different registers), presumably because of the different register allocation.

        ldr     lr, [fp, #-104]           ldr     lr, [fp, #-104]
        ldrb    r2, [r1, ip]              ldrb    r2, [r1, ip]
        add     r3, r1, lr                add     r3, r1, lr
        swpb    r2, r2, [r3]    |         swpb    lr, r2, [r3]
        ldr     r2, [fp, #-132]           ldr     r2, [fp, #-132]
        add     r1, r1, #1                add     r1, r1, #1
                                >         ldr     lr, [fp, #-104]
        add     r2, r2, #1                add     r2, r2, #1
        cmp     r1, r0                    cmp     r1, r0
        str     r2, [fp, #-132]           str     r2, [fp, #-132]
        add     r3, lr, r1                add     r3, lr, r1

Here, the register allocator is just plain stupid in not using the best
alternative. I suspect this is because only reload allocates scratch
registers and doesn't realize that the input register dies in this insn.

        ldr     r2, [fp, #-184] |         ldr     r5, [fp, #-184]
        strh    r3, [r4, #22]             strh    r3, [r4, #22]
        strh    r3, [r4, #14]             strh    r3, [r4, #14]
        mov     r0, r2, asr #16 <
        ldrh    r2, [fp, #-48]            ldrh    r2, [fp, #-48]
        mov     r1, #0                    mov     r1, #0
                                >         mov     r0, r5, asr #16
        add     r3, r4, #13               add     r3, r4, #13
        strh    r2, [r4, #24]             strh    r2, [r4, #24]
        strh    r2, [r4, #18]             strh    r2, [r4, #18]
        strh    r2, [r4, #16]             strh    r2, [r4, #16]
        swpb    ip, r1, [r3]    |         swpb    r6, r1, [r3]
        strh    r0, [r4, #20]             strh    r0, [r4, #20]
        str     r1, [r4, #28]             str     r1, [r4, #28]
        ldr     lr, [fp, #-184] |         strh    r5, [r4, #26]
        strh    lr, [r4, #26]   <

Again, the left-hand version needlessly reloads from memory [fp, #-184].
One more example omitted.

-- 
Rask Ingemann Lambertsen

Re: Modifying ARM code generator for elimination of 8bit writes - need help

Reply via email to