On 08.12.2016 00:34, George Spelvin wrote:
Georg-Johann Lay <a...@gjlay.de> wrote:
The algo is rather slow because it always iterates over all
digits, i.e. it won't run faster for small numbers.

Have fun!

Code size is ~140 bytes.

Well, it's bigger (140 > 124), slower, and doesn't handle sizes *other*
than 64 bits, so that's not terribly useful.

I think you could shrink it a bit, replacing these 16 instructions of messy
digit output code (why are you looping incrementing DIGIT2 when you know it
is never more than 1?):

It's a transcript from antique C-code for 32-bit values, which
don't have this nice property.  I shouldn't write code to late in
the evening.

And I didn't actually intend nor expect to beat your code, just was
interested in how far it can be pushed...

        clr     DIGIT2
1:
        inc     DIGIT2
        subi    DIGIT, 10
        brcc    1b

        brts    2f
        ;; T = 0 is the first round.  Output the high digit if it's not '0'.
        set
        subi    DIGIT2, 1-'0'
        ;; Initialize nonZero.  We only output digits if we saw a digit != '0'.
        mov     nonZero, DIGIT2
        cpi     nonZero, '0'
        breq    2f
        st      X+, DIGIT2
2:
        ;; Output digits except the highest (except that for 10^19).
        subi    DIGIT, -10-'0'
        or      nonZero, DIGIT
        ;; We only output digits if we saw a digit != '0', i.e. strip leading 
'0's.
        cpi     nonZero, '0'
        breq    3f
        st      X+, DIGIT

With these 9 instructions:
        cpi     DIGIT, 10       ;; First "digit" can be as high as 18
        brcs    2f
        ldi     nonZero, '1'    ;; '1' is non-zero, which is perfect
        st      X+, nonZero
        subi    DIGIT, 10
2:
        or      nonZero, DIGIT
        breq    3f              ;; Don't print leading zeros
        subi    DIGIT, -10-'0'
        st      X+, DIGIT
3:

With this, you can also delete the leading clt.  It eliminates DIGIT2,
but unfortunately that doesn't save a spill.  You also have to adapt
the final "lone zero" printing code to print if nonZero == 0, but that's
the same size.

Also, this is just silly:
    dec     Count
    cpse    Count, Zero
    rjmp    .Loop

"dec" sets the zero flag, so that can just be "dec Count $ brnz .Loop".

And finally, your multiply loop is wasting two instructions:

    mul A0,Ten  $  mov A0,r0  $  add A0,Cy  $  mov Cy,r1  $  adc Cy,Zero
    mov __tmp_reg__,A0
    mov A0,A1   $  mov A1,A2  $  mov A2,A3  $  mov A3,A4
    mov A4,A5   $  mov A5,A6  $  mov A6,A7  $  mov A7,__tmp_reg__

"mov A0,r0" and "mov __tmp_reg__,A0" are cancelling each other out

Nice spotting!

and should both be deleted (with the "A0 += Cy" adjusted to add to r0,
of course).  Just make it:

    mul A0,Ten  $  mov A0,r0  $  add r0,Cy  $  adc r1,Zero  $  mov Cy,r1
    mov A0,A1   $  mov A1,A2  $  mov A2,A3  $  mov A3,A4
    mov A4,A5   $  mov A5,A6  $  mov A6,A7  $  mov A7,r0


That saves 22 bytes, leaving it 6 bytes smaller than mine.  Nice to have 
available!

The reworked version comes up with 110 bytes (still asserting MUL).

But I like your approach more, as it does not rely on special properties
of the numbers and comes with some nice ideas.

perf-metering with avrtest reveals a run time from ~3100 up to < 4800 ticks; high as expected.

Johann

#define __zero_reg__ r1

.macro DEFUN name
.global \name
.func \name
\name:
.endm

.macro ENDF name
.size \name, .-\name
.endfunc
.endm

.macro  wmov  r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
    movw \r_dest,   \r_src
#else
    mov \r_dest,    \r_src
    mov \r_dest+1,  \r_src+1
#endif
.endm

#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP  jmp
#else
#define XCALL rcall
#define XJMP  rjmp
#endif

.text

#define A0      18
#define A1      A0 + 1
#define A2      A0 + 2
#define A3      A0 + 3
#define A4      A0 + 4
#define A5      A0 + 5
#define A6      A0 + 6
#define A7      A0 + 7

#define BUF0    26
#define BUF1    BUF0 + 1

#define Digit   31

#define Carry   31
#define Ten     30
#define Count   29
#define nonZero 28

;;; extern void put64 (uint64_t A, char *buf);
;;; This function writes up to 21 characters (including final '\0') to buf.

DEFUN put64
    push    r28
    push    r29

    wmov    BUF0, 16
    ldi     Count, 19
    ldi     nonZero, '0'

.Loop:
    ;; A[] -= 1.000.000.000.000.000.000  as often as we can.
    ;; Digit will hold the net number of subtractions (plus '0').
    ldi     Digit, '0'-1
1:
    inc     Digit
    ;;  1.000.000.000.000.000.000 = 0DE0 B6B3 A764 0000
    subi    A2, 0x64
    sbci    A3, 0xa7
    sbci    A4, 0xb3
    sbci    A5, 0xb6
    sbci    A6, 0xe0
    sbci    A7, 0xd
    brcc    1b

    ;; -1.000.000.000.000.000.000 = F21F 494C 589C 0000
    ;; Undo the underflow from above
    subi    A2, 0x9c
    sbci    A3, 0x58
    sbci    A4, 0x4c
    sbci    A5, 0x49
    sbci    A6, 0x1f
    sbci    A7, 0xf2

    ;; As ffff ffff ffff ffff = 18.***.***.***.***.***.***  we can get
    ;; Digit up to 18+'0' in the first round.
    cpi     Digit, '0'+10
    brlo    2f

    ;; If that is the case, split it into 2 digits: a '1' and the
    ;; low part in Digit.
    ldi     nonZero, '1'
    st      X+, nonZero
    subi    Digit, 10
2:
    ;; nonZero "accumulates" digits: it holds a value != '0' if we ever
    ;; output a digit not equal to '0'.
    or      nonZero, Digit
    ;; We only output digits if we saw a digit != '0', i.e. strip leading '0's.
    cpi     nonZero, '0'
    breq    3f
    st      X+, Digit
3:
    ;; A[] *= 10
    ldi     Ten, 10
    clr     Carry
.Lrot:
    mul     A0, Ten
    add     r0, Carry
    mov     Carry, r1
    clr     __zero_reg__
    adc     Carry, __zero_reg__
    ;; A[] >>>= 8
    ;; Rotate 8 times so that we can loop the *= 10 over A[].
    ;; The current value of A0 resides in r0, hence no "mov r0,A0" needed
    ;; to start the rotation.
    mov A0,A1  $  mov A1,A2  $  mov A2,A3  $  mov A3,A4
    mov A4,A5  $  mov A5,A6  $  mov A6,A7  $  mov A7,r0

    ;; Use the upper 3 bits of Count as loop counter for 8 loops.
    subi    Count, -32
    brcs    .Lrot

    dec     Count
    brne    .Loop

    ;; If all digits were '0', we had A[] = 0:  Output one '0'.
    cpi     nonZero, '0'
    brne    4f
    st      X+, nonZero
4:
    ;; String end and epilogue.
    st      X+, __zero_reg__
    pop     r29
    pop     r28
    ret
ENDF put64

;;; extern void put64s (int64_t A, char *buf);
;;; Writes up to 21 characters (including '-' and final '\0') to buf.

DEFUN put64s
#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
    tst     A7
    brmi    0f
#else
    sbrs    A7, 7
#endif
    XJMP    put64
0:
    push    r16
    push    r17
    XCALL   __negdi2            ; Assumes avr-gcc 4.7+
    wmov    BUF0, 16
    ldi     Digit, '-'
    st      X+, Digit
    wmov    16, BUF0
    XCALL   put64
    pop     r17
    pop     r16
    ret
ENDF put64s
_______________________________________________
AVR-libc-dev mailing list
AVR-libc-dev@nongnu.org
https://lists.nongnu.org/mailman/listinfo/avr-libc-dev

Reply via email to