[avr-libc-dev] Even faster decimal code

George Spelvin Fri, 23 Dec 2016 14:12:04 -0800

I mentioned last time trying doing something like this, and now
I have it coded up (as utoa_bytes()).


Input   Decimal
bits    mem_toa mem_tod itoa    nibbles bytes
 8       269     220     217     141     143
16       664     462     451     321     273
24      1294     783     838     608     432
32      2059    1219    1167     948     666
40      3059    1775    1649    1395     941
48      4194    2373    2127    1895    1217
56      5477    3069    2733    2459    1551
64      6995    3822    3420    3130    1902

It's larger (120 bytes, vs, 90 for the nibbles code), and has a bit more
startup overhead, but is quite fast.  I also have a 122-byte variant
which saves 1 cycle per pair of digits.  (1895 cycles for 2^64-1.)

I'm interested in this both for speed, and because it's a good fit to my
suggestion of saving RAM space buffering the converted digits in BCD.

So now that we have several good candidates, how to proceed?
What size/speed tradeoff should be the final choice?

Personally, I'm not worried about 30 bytes of code on enhanced AVRs with
a multiplier, and although I haven't coded up the combined algorithm, I
believe the whole thing is smaller than the ultoa_invert code it replaces.

But this is a judgement call, and I'd appreciate some other voices.



#define __zero_reg__ r1
#define __tmp_reg__ r0

.macro DEFUN name
.section .text.\name,"ax",@progbits
.global \name
.func \name
\name:
.endm

.macro ENDF name
.size \name, .-\name
.endfunc
.endm

#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP  jmp
#else
#define XCALL rcall
#define XJMP  rjmp
#endif

.macro  wmov  r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
    movw \r_dest,   \r_src
#else
    mov \r_dest,    \r_src
    mov \r_dest+1,  \r_src+1
#endif
.endm

.text

#define pBuf    26
#define pNum    30
#define Length  r20

#define Rem     r19
#define Quot    r21
#define Count   r22
#define Num     r23

#define Ten     r16
#define Ox67    r18

;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]

DEFUN u64toa_nibbles

    wmov    pBuf, 24
    wmov    pNum, 22

    push    Ten
    ldi     Ten, 10
    ;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10,
    ;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
    ;; also works, and that saves us one "lsr r1" per division.
    ldi     Ox67, 0x67
    clr     Count

.LoopDigits:

    add     pNum, Length
    ;; Count is 0.
    adc     pNum+1, Count

    mov     Count, Length
    ldi     Length, -1
    clr     Rem

.LoopBytes:

    ld      Num, -Z

    ;; Rem:Num <<= 4
    ;; "andi Rem, 0xf" not needed because Rem < 10.
    eor     Rem, Num
    andi    Num, 0x0f
    eor     Rem, Num
    swap    Rem

    ;; Quot := Rem / 10
    mul     Rem, Ox67
    lsr     r1
    lsr     r1
    mov     Quot, r1

    ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
    mul     r1, Ten
    sub     Rem, r0

    ;; ...and (mostly) the same again for the low nibble.

    ;; Quot <<= 4
    swap    Quot

    ;; Rem:Num <<= 4
    swap    Rem
    eor     Rem, Num

    ;; Quot += Rem / 10
    mul     Rem, Ox67
    lsr     r1
    lsr     r1
    add     Quot, r1

    ;; Quot = 0  ==>  R1 = 0, hence we can also skip the MUL + SUB below.
    breq    1f
    sbrc    Length, 7
     ;; Current Length not yet known: Set it according to highest non-zero byte.
     mov     Length, Count

    ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
    mul     r1, Ten
    sub     Rem, r0
1:

    st      Z, Quot

    dec     Count
    brne    .LoopBytes

    subi    Rem, -'0'
    st      X+, Rem

    ;; Only continue if we determined Length (Length > 0)
    ;; Length < 0 means all bytes are zero, hence we are done then.
    sbrs    Length, 7
     rjmp    .LoopDigits

    pop     Ten
    clr     __zero_reg__

    st      X, __zero_reg__

    ;; pBuf survived in R25:R24
#if 0
        # XJMP  strrev
        wmov    ZL, 24
7:      ld      Count, -X
        ld      Num, Z
        st      Z+, Count
        st      X, Num
        /* Loop while Z < X */
        cp      ZL, XL
        cpc     ZH, XH
        brlo    7b
#endif
        ret
ENDF u64toa_nibbles

#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; This works simularly, but in base 100.  Every step, we take a
;; remainder <= 99, multiply by 256, and add the next byte (<=255).
;; The result is in the range 0 <= x < 25600.
;;
;; To divide by 100 with multiply:
;; x *   0x29 >> 12 == x/100 for 0 <= x <  1099
;; x *  0x51f >> 17 == x/100 for 0 <= x <  4699
;; x * 0x147b >> 19 == x/100 for 0 <= x < 43699
;; x * 0x28f6 >> 20 == x/100 for 0 <= x < 43699
;;
;; Using a multiplier one bit larger than strictly necessary gives us
;; a more convenient shift amount.  This is still small enough that we
;; can compute the high bits of the product we need in only two registers.
;;
;; The largest possible x = 0x63ff also has the largest possible
;; lsbyte, so it will give us the largest possible partial products.
;; Multiplying that by 0x28f6 requires four partial products:
;;
;;   FF *   F6 =     F50A
;;   FF * 28-- =   27D8--
;; 63-- *   F6 =   5F22--
;; 63-- * 28-- =  F78----
;;
;; The important thing is that the sum of the first three partial products
;; is 0x87ef0a, a 24-bit number.  So there is no need to propagate
;; carries into the msbyte.  We immediately discard the lsbyte of the
;; first partial product, and sum the others into a 2-byte register.
;; Then we throw away the lsbyte of that register and use it for the msbyte
;; of the final sum.

;; extern void utoa_bytes(char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
#define ZH      r31
#define ZL      r30
#define XH      r27
#define XL      r26

#define Q2      r23     /* Byte 2 of 4-byte product */
#define Q1      r22     /* Byte 1 (and 3) of 4-byte product */
#define Count   r21
#define Length  r20     /* Argument */
#define Rem     r19
#define Hundred r18     /* Constant 0x64 = 100 */
#define Khi     r17     /* Constant 0x28 =  40 */
#define Klo     r16     /* Constant 0xf6 = 246 */
#define Num     r15


DEFUN utoa_bytes
        movw    XL, r24 ; pBuf (output) to X
        movw    ZL, r22 ; pNum (input) to Z
        ldi     Hundred, 100
        push    Khi
        ldi     Khi, 0x28
        push    Klo
        ldi     Klo, 0xf6
        push    Num
        clr     Count

.LoopDigits2:
        add     ZL, Length
        adc     ZH, Count

        mov     Count, Length
        ldi     Length, -1
        clr     Rem

.LoopBytes2:
        ld      Num, -Z

        /* Multiply Rem:Num by Khi:Klo */
        mul     Num, Khi
        mov     Q1, r0
        mov     Q2, r1
        mul     Rem, Klo
        add     Q1, r0
        adc     Q2, r1          ; Cannot overflow
        mul     Num, Klo
        add     Q1, r1
        clr     Q1              ; We only need the carry bit out
        adc     Q2, Q1          ; This cannot overflow, either
        mul     Rem, Khi
        add     Q2, r0
        adc     Q1, r1

        ; We now have the high 12 bits of the 28-bit product in Q1:Q2.
        ; Shift down by 4 bits
        andi    Q2, 0xf0
        or      Q1, Q2
        swap    Q1
        ;; We now have the new quotient in "Q1".
        st      Z, Q1

        ;; If Q1 == 0, don't set length (and multiply is redundant, too)
        breq    1f
        sbrc    Length, 7       ;; Do we have a length yet?
         mov     Length, Count  ;; Remember position of highest non-zero Q

        mul     Q1, Hundred
        sub     Num, r0         ; New remainder
1:
        mov     Rem, Num
        dec     Count
        brne    .LoopBytes2

        ;; We now have the final base-100 remiander in Rem.
        ;; Break it apart into two base-10 digits in Q1:Rem
        ;; (We could use the multiplier again, but that woud require
        ;; even more registers for the constants and more instructions,
        ;; and this isn't the inner loop.)
        ldi     Q1, '0'
3:      subi    Q1, -3
        subi    Rem, 30
        brcc    3b
4:      dec     Q1
        subi    Rem, -10
        brcs    4b

        subi    Rem, -'0'
        st      X+, Rem
        st      X+, Q1

        ;; Only continue if we determined Length (Length > 0)
        ;; Length < 0 means all bytes are zero, hence we are done.
        sbrs    Length, 7
         rjmp   .LoopDigits2

        ;; Chop off redundant trailing zero
        clr     __zero_reg__
        cpi     Q1, '0'
        brne    5f
         st     -X, __zero_reg__        ; Could sbiw, but this is same speed
5:      st      X, __zero_reg__

        ; Restore registers
        pop     Num
        pop     Klo
        pop     Khi

        movw    r24, r26        ; Return end of output
        ret
ENDF utoa_bytes

#undef Num
#undef Klo
#undef Khi
#undef Hundred
#undef Rem
#undef Length
#undef Count
#undef Q1
#undef Q2
#undef XL
#undef XH
#undef ZL
#undef ZH

_______________________________________________
AVR-libc-dev mailing list
AVR-libc-dev@nongnu.org
https://lists.nongnu.org/mailman/listinfo/avr-libc-dev

[avr-libc-dev] Even faster decimal code

Reply via email to