I mentioned last time trying doing something like this, and now I have it coded up (as utoa_bytes()).
Input Decimal bits mem_toa mem_tod itoa nibbles bytes 8 269 220 217 141 143 16 664 462 451 321 273 24 1294 783 838 608 432 32 2059 1219 1167 948 666 40 3059 1775 1649 1395 941 48 4194 2373 2127 1895 1217 56 5477 3069 2733 2459 1551 64 6995 3822 3420 3130 1902 It's larger (120 bytes, vs, 90 for the nibbles code), and has a bit more startup overhead, but is quite fast. I also have a 122-byte variant which saves 1 cycle per pair of digits. (1895 cycles for 2^64-1.) I'm interested in this both for speed, and because it's a good fit to my suggestion of saving RAM space buffering the converted digits in BCD. So now that we have several good candidates, how to proceed? What size/speed tradeoff should be the final choice? Personally, I'm not worried about 30 bytes of code on enhanced AVRs with a multiplier, and although I haven't coded up the combined algorithm, I believe the whole thing is smaller than the ultoa_invert code it replaces. But this is a judgement call, and I'd appreciate some other voices. #define __zero_reg__ r1 #define __tmp_reg__ r0 .macro DEFUN name .section .text.\name,"ax",@progbits .global \name .func \name \name: .endm .macro ENDF name .size \name, .-\name .endfunc .endm #if defined (__AVR_HAVE_JMP_CALL__) #define XCALL call #define XJMP jmp #else #define XCALL rcall #define XJMP rjmp #endif .macro wmov r_dest, r_src #if defined (__AVR_HAVE_MOVW__) movw \r_dest, \r_src #else mov \r_dest, \r_src mov \r_dest+1, \r_src+1 #endif .endm .text #define pBuf 26 #define pNum 30 #define Length r20 #define Rem r19 #define Quot r21 #define Count r22 #define Num r23 #define Ten r16 #define Ox67 r18 ;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length) ;; Length in [1, 127] DEFUN u64toa_nibbles wmov pBuf, 24 wmov pNum, 22 push Ten ldi Ten, 10 ;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10, ;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value ;; also works, and that saves us one "lsr r1" per division. ldi Ox67, 0x67 clr Count .LoopDigits: add pNum, Length ;; Count is 0. adc pNum+1, Count mov Count, Length ldi Length, -1 clr Rem .LoopBytes: ld Num, -Z ;; Rem:Num <<= 4 ;; "andi Rem, 0xf" not needed because Rem < 10. eor Rem, Num andi Num, 0x0f eor Rem, Num swap Rem ;; Quot := Rem / 10 mul Rem, Ox67 lsr r1 lsr r1 mov Quot, r1 ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10 mul r1, Ten sub Rem, r0 ;; ...and (mostly) the same again for the low nibble. ;; Quot <<= 4 swap Quot ;; Rem:Num <<= 4 swap Rem eor Rem, Num ;; Quot += Rem / 10 mul Rem, Ox67 lsr r1 lsr r1 add Quot, r1 ;; Quot = 0 ==> R1 = 0, hence we can also skip the MUL + SUB below. breq 1f sbrc Length, 7 ;; Current Length not yet known: Set it according to highest non-zero byte. mov Length, Count ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10 mul r1, Ten sub Rem, r0 1: st Z, Quot dec Count brne .LoopBytes subi Rem, -'0' st X+, Rem ;; Only continue if we determined Length (Length > 0) ;; Length < 0 means all bytes are zero, hence we are done then. sbrs Length, 7 rjmp .LoopDigits pop Ten clr __zero_reg__ st X, __zero_reg__ ;; pBuf survived in R25:R24 #if 0 # XJMP strrev wmov ZL, 24 7: ld Count, -X ld Num, Z st Z+, Count st X, Num /* Loop while Z < X */ cp ZL, XL cpc ZH, XH brlo 7b #endif ret ENDF u64toa_nibbles #undef Count #undef pNum #undef pBuf #undef Rem #undef Num #undef Quot #undef Ten #undef Ox67 #undef Length ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This works simularly, but in base 100. Every step, we take a ;; remainder <= 99, multiply by 256, and add the next byte (<=255). ;; The result is in the range 0 <= x < 25600. ;; ;; To divide by 100 with multiply: ;; x * 0x29 >> 12 == x/100 for 0 <= x < 1099 ;; x * 0x51f >> 17 == x/100 for 0 <= x < 4699 ;; x * 0x147b >> 19 == x/100 for 0 <= x < 43699 ;; x * 0x28f6 >> 20 == x/100 for 0 <= x < 43699 ;; ;; Using a multiplier one bit larger than strictly necessary gives us ;; a more convenient shift amount. This is still small enough that we ;; can compute the high bits of the product we need in only two registers. ;; ;; The largest possible x = 0x63ff also has the largest possible ;; lsbyte, so it will give us the largest possible partial products. ;; Multiplying that by 0x28f6 requires four partial products: ;; ;; FF * F6 = F50A ;; FF * 28-- = 27D8-- ;; 63-- * F6 = 5F22-- ;; 63-- * 28-- = F78---- ;; ;; The important thing is that the sum of the first three partial products ;; is 0x87ef0a, a 24-bit number. So there is no need to propagate ;; carries into the msbyte. We immediately discard the lsbyte of the ;; first partial product, and sum the others into a 2-byte register. ;; Then we throw away the lsbyte of that register and use it for the msbyte ;; of the final sum. ;; extern void utoa_bytes(char *pBuf, void *pNum, uint8_t Length) ;; Length in [1, 127] #define ZH r31 #define ZL r30 #define XH r27 #define XL r26 #define Q2 r23 /* Byte 2 of 4-byte product */ #define Q1 r22 /* Byte 1 (and 3) of 4-byte product */ #define Count r21 #define Length r20 /* Argument */ #define Rem r19 #define Hundred r18 /* Constant 0x64 = 100 */ #define Khi r17 /* Constant 0x28 = 40 */ #define Klo r16 /* Constant 0xf6 = 246 */ #define Num r15 DEFUN utoa_bytes movw XL, r24 ; pBuf (output) to X movw ZL, r22 ; pNum (input) to Z ldi Hundred, 100 push Khi ldi Khi, 0x28 push Klo ldi Klo, 0xf6 push Num clr Count .LoopDigits2: add ZL, Length adc ZH, Count mov Count, Length ldi Length, -1 clr Rem .LoopBytes2: ld Num, -Z /* Multiply Rem:Num by Khi:Klo */ mul Num, Khi mov Q1, r0 mov Q2, r1 mul Rem, Klo add Q1, r0 adc Q2, r1 ; Cannot overflow mul Num, Klo add Q1, r1 clr Q1 ; We only need the carry bit out adc Q2, Q1 ; This cannot overflow, either mul Rem, Khi add Q2, r0 adc Q1, r1 ; We now have the high 12 bits of the 28-bit product in Q1:Q2. ; Shift down by 4 bits andi Q2, 0xf0 or Q1, Q2 swap Q1 ;; We now have the new quotient in "Q1". st Z, Q1 ;; If Q1 == 0, don't set length (and multiply is redundant, too) breq 1f sbrc Length, 7 ;; Do we have a length yet? mov Length, Count ;; Remember position of highest non-zero Q mul Q1, Hundred sub Num, r0 ; New remainder 1: mov Rem, Num dec Count brne .LoopBytes2 ;; We now have the final base-100 remiander in Rem. ;; Break it apart into two base-10 digits in Q1:Rem ;; (We could use the multiplier again, but that woud require ;; even more registers for the constants and more instructions, ;; and this isn't the inner loop.) ldi Q1, '0' 3: subi Q1, -3 subi Rem, 30 brcc 3b 4: dec Q1 subi Rem, -10 brcs 4b subi Rem, -'0' st X+, Rem st X+, Q1 ;; Only continue if we determined Length (Length > 0) ;; Length < 0 means all bytes are zero, hence we are done. sbrs Length, 7 rjmp .LoopDigits2 ;; Chop off redundant trailing zero clr __zero_reg__ cpi Q1, '0' brne 5f st -X, __zero_reg__ ; Could sbiw, but this is same speed 5: st X, __zero_reg__ ; Restore registers pop Num pop Klo pop Khi movw r24, r26 ; Return end of output ret ENDF utoa_bytes #undef Num #undef Klo #undef Khi #undef Hundred #undef Rem #undef Length #undef Count #undef Q1 #undef Q2 #undef XL #undef XH #undef ZL #undef ZH _______________________________________________ AVR-libc-dev mailing list AVR-libc-dev@nongnu.org https://lists.nongnu.org/mailman/listinfo/avr-libc-dev