On Mon, Sep 24, 2012 at 2:35 PM, George Spelvin <li...@horizon.com> wrote:
>> Here is the comparison of the x86-32 assembly
>> of the fragment which does "x / 10000" thing,
>> before and after the patch:
>
>> -01 c6                  add    %eax,%esi
>> -b8 59 17 b7 d1         mov    $0xd1b71759,%eax
>> -f7 e6                  mul    %esi
>> -89 d3                  mov    %edx,%ebx
>> -89 f2                  mov    %esi,%edx
>> -c1 eb 0d               shr    $0xd,%ebx
>>
>> +01 c7                  add    %eax,%edi
>> +b8 d7 c5 6d 34         mov    $0x346dc5d7,%eax
>> +f7 e7                  mul    %edi
>> +89 55 e8               mov    %edx,-0x18(%ebp)
>> +8b 5d e8               mov    -0x18(%ebp),%ebx
>> +89 fa                  mov    %edi,%edx
>> +89 45 e4               mov    %eax,-0x1c(%ebp)
>> +c1 eb 0b               shr    $0xb,%ebx
>>
>> Poor gcc got confused, and generated somewhat
>> worse code (spilling and immediately reloading upper
>> part of 32x32->64 multiply).
>
>> Please test and benchmark your changes to this code
>> before submitting them.
>
> Thanks for the feedback!  It very much *was* intended to start a
> conversation with you, but the 7 week response delay somewhat interfered
> with that process.
>
> I was playing with it on ARM, where the results are a bit different.
>
> As you can see, it fell out of some other word which *did* make a
> useful difference.  I just hadn't tested this change in isolation,

Please find attached source of test program.

You need to touch test_new.c (or make a few copies of it
to experiment with different versions of code). test_header.c
and test_main.c contain benchmarking code and need not be modified.

It also includes a verification step, which would catch the bugs
you had in your patches.

Usage:

$ gcc [ --static] [-m32] -O2 -Wall test_new.c -otest_new
$ ./test_new
Conversions per second: 8:59964000 123:48272000 123456:37852000
12345678:34216000 123456789:23528000 2^32:23520000 2^64:17616000
Conversions per second: 8:60092000 123:48536000 123456:37836000
12345678:33924000 123456789:23580000 2^32:23372000 2^64:17608000
Conversions per second: 8:60084000 123:48396000 123456:37840000
12345678:34192000 123456789:23564000 2^32:23484000 2^64:17612000
Conversions per second: 8:60108000 123:48500000 123456:37872000
12345678:33996000 123456789:23576000 2^32:23524000 2^64:17612000
Tested 14680064      ^C
^^^^^^^^^^^^^^^^^^^^^^^^ tests correctness until user interrupts it
$
#include <features.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h> /* size_t */
#include <limits.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stddef.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <stdio.h> /* for FILE */
#include <sys/time.h>
#include <time.h>

#define noinline_for_stack __attribute__((noinline))
typedef uint8_t u8;
typedef int16_t s16;

/*
 * do_div() is NOT a C function. It wants to return
 * two values (the quotient and the remainder), but
 * since that doesn't work very well in C, what it
 * does is:
 *
 * - modifies the 64-bit dividend _in_place_
 * - returns the 32-bit remainder
 *
 * This ends up being the most efficient "calling
 * convention" on x86.
 */
#if LONG_MAX == ((1U << 31)-1)
#define do_div(n, base)						\
({								\
	unsigned long __upper, __low, __high, __mod, __base;	\
	__base = (base);					\
	{							\
		asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
		__upper = __high;				\
		if (__high) {					\
			__upper = __high % (__base);		\
			__high = __high / (__base);		\
		}						\
		asm("divl %2" : "=a" (__low), "=d" (__mod)	\
			: "rm" (__base), "0" (__low), "1" (__upper));	\
		asm("" : "=A" (n) : "a" (__low), "d" (__high));	\
	}							\
	__mod;							\
})
#elif LONG_MAX > ((1U << 31)-1)
#define do_div(n,base) ({                                       \
	uint32_t __base = (base);                               \
	uint32_t __rem;                                         \
	__rem = ((uint64_t)(n)) % __base;                       \
	(n) = ((uint64_t)(n)) / __base;                         \
	__rem;                                                  \
})
#endif
static const struct printf_spec dummy_spec = {
	.type = FORMAT_TYPE_LONG_LONG,
	.flags = 0,
	.base = 10,
	.qualifier = 0,
	.field_width = 0,
	.precision = 0,
};

#define REPS (4000)

static unsigned measure_number(time_t* tp, unsigned long long num)
{
	char buf[64];
	time_t t, n;
	int i;
	unsigned count;

	t = *tp;
	count = 0;
	while (t == (n = time(NULL))) {
		for (i = REPS; --i >= 0;)
			number(buf, buf + 63, num, dummy_spec)[0] = '\0';
		count += REPS;
	}
	*tp = n;
	return count;
}

static void measure()
{
	time_t t, n;
	unsigned count1, count2, count3, count4, count5, count6, count7;

	t = time(NULL);
	while (t == (n = time(NULL)))
		continue;
	t = n;

	count1 = measure_number(&t, 8);
	count2 = measure_number(&t, 123);
	count3 = measure_number(&t, 123456);
	count4 = measure_number(&t, 12345678);
	count5 = measure_number(&t, 123456789);
	count6 = measure_number(&t, (1ULL << 32) - 1);
	count7 = measure_number(&t, (0ULL - 1));

	printf("Conversions per second: 8:%d 123:%d 123456:%d 12345678:%d 123456789:%d 2^32:%d 2^64:%d\n",
		count1,
		count2,
		count3,
		count4,
		count5,
		count6,
		count7
	);
}

static void check(unsigned long long v)
{
	char buf1[64];
	char buf2[64];
	number(buf1, buf1 + 63, v, dummy_spec)[0] = '\0';
	sprintf(buf2, "%llu", v);
	if (strcmp(buf1, buf2) != 0) {
		printf("Error in formatting %llu:'%s'\n", v, buf1);
		exit(1);
	}
}

int main()
{
	measure();measure();
	measure();measure();
	//measure();measure();

	unsigned long long i;
	for (i = 0; ; i++) {
		check(i);
//		check(((1ULL << 32) - 1) + i);
//		check(((1ULL << 32) - 1) - i);
		check(0ULL - i);
//		check(i * i);
//		check(i * i * i);
		if (!(i & 0x3ffff)) {
			printf("\rTested %llu                               ", i);
			fflush(NULL);
		}
	}

	return 0;
}
#include "test_header.c"

/* Decimal conversion is by far the most typical, and is used
 * for /proc and /sys data. This directly impacts e.g. top performance
 * with many processes running. We optimize it for speed
 * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
 * (with permission from the author, Douglas W. Jones).
 */

#if LONG_MAX > ((1UL<<31)-1) || LLONG_MAX > ((1ULL<<63)-1)
/* Formats correctly any integer in [0, 999999999] */
static noinline_for_stack
char *put_dec_full9(char *buf, unsigned q)
{
	unsigned r;

	/* Possible ways to approx. divide by 10
	 * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
	 * (x * 0xcccd) >> 19     x <      81920 (x < 262149 when 64-bit mul)
	 * (x * 0x6667) >> 18     x <      43699
	 * (x * 0x3334) >> 17     x <      16389
	 * (x * 0x199a) >> 16     x <      16389
	 * (x * 0x0ccd) >> 15     x <      16389
	 * (x * 0x0667) >> 14     x <       2739
	 * (x * 0x0334) >> 13     x <       1029
	 * (x * 0x019a) >> 12     x <       1029
	 * (x * 0x00cd) >> 11     x <       1029 shorter code than * 0x67 (on i386)
	 * (x * 0x0067) >> 10     x <        179
	 * (x * 0x0034) >>  9     x <         69 same
	 * (x * 0x001a) >>  8     x <         69 same
	 * (x * 0x000d) >>  7     x <         69 same, shortest code (on i386)
	 * (x * 0x0007) >>  6     x <         19
	 * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
	 */
	r      = (q * (uint64_t)0x1999999a) >> 32;
	*buf++ = (q - 10 * r) + '0'; /* 1 */
	q      = (r * (uint64_t)0x1999999a) >> 32;
	*buf++ = (r - 10 * q) + '0'; /* 2 */
	r      = (q * (uint64_t)0x1999999a) >> 32;
	*buf++ = (q - 10 * r) + '0'; /* 3 */
	q      = (r * (uint64_t)0x1999999a) >> 32;
	*buf++ = (r - 10 * q) + '0'; /* 4 */
	r      = (q * (uint64_t)0x1999999a) >> 32;
	*buf++ = (q - 10 * r) + '0'; /* 5 */
	/* Now value is under 10000, can avoid 64-bit multiply */
	q      = (r * 0x199a) >> 16;
	*buf++ = (r - 10 * q)  + '0'; /* 6 */
	r      = (q * 0xcd) >> 11;
	*buf++ = (q - 10 * r)  + '0'; /* 7 */
	q      = (r * 0xcd) >> 11;
	*buf++ = (r - 10 * q) + '0'; /* 8 */
	*buf++ = q + '0'; /* 9 */
	return buf;
}
#endif

/* Similar to above but do not pad with zeros.
 * Code can be easily arranged to print 9 digits too, but our callers
 * always call put_dec_full9() instead when the number has 9 decimal digits.
 */
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
	unsigned q;

	/* Copy of previous function's body with added early returns */
	q      = (r * (uint64_t)0x1999999a) >> 32;
	*buf++ = (r - 10 * q) + '0'; /* 2 */
	if (q == 0) return buf;
	r      = (q * (uint64_t)0x1999999a) >> 32;
	*buf++ = (q - 10 * r) + '0'; /* 3 */
	if (r == 0) return buf;
	q      = (r * (uint64_t)0x1999999a) >> 32;
	*buf++ = (r - 10 * q) + '0'; /* 4 */
	if (q == 0) return buf;
	r      = (q * (uint64_t)0x1999999a) >> 32;
	*buf++ = (q - 10 * r) + '0'; /* 5 */
	if (r == 0) return buf;
	q      = (r * 0x199a) >> 16;
	*buf++ = (r - 10 * q)  + '0'; /* 6 */
	if (q == 0) return buf;
	r      = (q * 0xcd) >> 11;
	*buf++ = (q - 10 * r)  + '0'; /* 7 */
	if (r == 0) return buf;
	q      = (r * 0xcd) >> 11;
	*buf++ = (r - 10 * q) + '0'; /* 8 */
	if (q == 0) return buf;
	*buf++ = q + '0'; /* 9 */
	return buf;
}

/* There are two algorithms to print larger numbers.
 * One is generic: divide by 1000000000 and repeatedly print
 * groups of (up to) 9 digits. It's conceptually simple,
 * but requires a (unsigned long long) / 1000000000 division.
 *
 * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
 * manipulates them cleverly and generates groups of 4 decimal digits.
 * It so happens that it does NOT require long long division.
 *
 * If long is > 32 bits, division of 64-bit values is relatively easy,
 * and we will use the first algorithm.
 * If long long is > 64 bits (strange architecture with VERY large long long),
 * second algorithm can't be used, and we again use the first one.
 *
 * Else (if long is 32 bits and long long is 64 bits) we use second one.
 */

#if LONG_MAX > ((1UL<<31)-1) || LLONG_MAX > ((1ULL<<63)-1)

/* First algorithm: generic */

static
char *put_dec(char *buf, unsigned long long n)
{
	if (n >= 100*1000*1000) {
		while (n >= 1000*1000*1000)
			buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));
		if (n >= 100*1000*1000)
			return put_dec_full9(buf, n);
	}
	return put_dec_trunc8(buf, n);
}

#else

/* Second algorithm: valid only for 32-bit longs, 64-bit long longs */

static noinline_for_stack
char *put_dec_full4(char *buf, unsigned q)
{
	unsigned r;
	r      = (q * 0xcccd) >> 19;
	*buf++ = (q - 10 * r) + '0';
	q      = (r * 0x199a) >> 16;
	*buf++ = (r - 10 * q)  + '0';
	r      = (q * 0xcd) >> 11;
	*buf++ = (q - 10 * r)  + '0';
	*buf++ = r + '0';
	return buf;
}

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
	uint32_t d3, d2, d1, q, h;

	if (n < 100*1000*1000)
		return put_dec_trunc8(buf, n);

	d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
	h   = (n >> 32);
	d2  = (h      ) & 0xffff;
	d3  = (h >> 16); /* implicit "& 0xffff" */

	q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);

	buf = put_dec_full4(buf, q % 10000);
	q   = q / 10000;

	d1  = q + 7671 * d3 + 9496 * d2 + 6 * d1;
	buf = put_dec_full4(buf, d1 % 10000);
	q   = d1 / 10000;

	d2  = q + 4749 * d3 + 42 * d2;
	buf = put_dec_full4(buf, d2 % 10000);
	q   = d2 / 10000;

	d3  = q + 281 * d3;
	if (!d3)
		goto done;
	buf = put_dec_full4(buf, d3 % 10000);
	q   = d3 / 10000;
	if (!q)
		goto done;
	buf = put_dec_full4(buf, q);
 done:
	while (buf[-1] == '0')
		--buf;

	return buf;
}

#endif

/*
 * Convert passed number to decimal string.
 * Returns the length of string.  On buffer overflow, returns 0.
 *
 * If speed is not important, use snprintf(). It's easy to read the code.
 */
int num_to_str(char *buf, int size, unsigned long long num)
{
	char tmp[sizeof(num) * 3];
	int idx, len;

	/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
	if (num <= 9) {
		tmp[0] = '0' + num;
		len = 1;
	} else {
		len = put_dec(tmp, num) - tmp;
	}

	if (len > size)
		return 0;
	for (idx = 0; idx < len; ++idx)
		buf[idx] = tmp[len - idx - 1];
	return len;
}

#define ZEROPAD	1		/* pad with zero */
#define SIGN	2		/* unsigned/signed long */
#define PLUS	4		/* show plus */
#define SPACE	8		/* space if plus */
#define LEFT	16		/* left justified */
#define SMALL	32		/* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL	64		/* prefix hex with "0x", octal with "0" */

enum format_type {
	FORMAT_TYPE_NONE, /* Just a string part */
	FORMAT_TYPE_WIDTH,
	FORMAT_TYPE_PRECISION,
	FORMAT_TYPE_CHAR,
	FORMAT_TYPE_STR,
	FORMAT_TYPE_PTR,
	FORMAT_TYPE_PERCENT_CHAR,
	FORMAT_TYPE_INVALID,
	FORMAT_TYPE_LONG_LONG,
	FORMAT_TYPE_ULONG,
	FORMAT_TYPE_LONG,
	FORMAT_TYPE_UBYTE,
	FORMAT_TYPE_BYTE,
	FORMAT_TYPE_USHORT,
	FORMAT_TYPE_SHORT,
	FORMAT_TYPE_UINT,
	FORMAT_TYPE_INT,
	FORMAT_TYPE_NRCHARS,
	FORMAT_TYPE_SIZE_T,
	FORMAT_TYPE_PTRDIFF
};

struct printf_spec {
	u8	type;		/* format_type enum */
	u8	flags;		/* flags to number() */
	u8	base;		/* number base, 8, 10 or 16 only */
	u8	qualifier;	/* number qualifier, one of 'hHlLtzZ' */
	s16	field_width;	/* width of output field */
	s16	precision;	/* # of digits/chars */
};

static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
	     struct printf_spec spec)
{
	/* we are called with base 8, 10 or 16, only, thus don't need "G..."  */
	static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */

	char tmp[66];
	char sign;
	char locase;
	int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
	int i;

	/* locase = 0 or 0x20. ORing digits or letters with 'locase'
	 * produces same digits or (maybe lowercased) letters */
	locase = (spec.flags & SMALL);
	if (spec.flags & LEFT)
		spec.flags &= ~ZEROPAD;
	sign = 0;
	if (spec.flags & SIGN) {
		if ((signed long long)num < 0) {
			sign = '-';
			num = -(signed long long)num;
			spec.field_width--;
		} else if (spec.flags & PLUS) {
			sign = '+';
			spec.field_width--;
		} else if (spec.flags & SPACE) {
			sign = ' ';
			spec.field_width--;
		}
	}
	if (need_pfx) {
		spec.field_width--;
		if (spec.base == 16)
			spec.field_width--;
	}

	/* generate full string in tmp[], in reverse order */
	i = 0;
	if (num < 8)
		tmp[i++] = '0' + num;
	/* Generic code, for any base:
	else do {
		tmp[i++] = (digits[do_div(num,base)] | locase);
	} while (num != 0);
	*/
	else if (spec.base != 10) { /* 8 or 16 */
		int mask = spec.base - 1;
		int shift = 3;

		if (spec.base == 16)
			shift = 4;
		do {
			tmp[i++] = (digits[((unsigned char)num) & mask] | locase);
			num >>= shift;
		} while (num);
	} else { /* base 10 */
		i = put_dec(tmp, num) - tmp;
	}

	/* printing 100 using %2d gives "100", not "00" */
	if (i > spec.precision)
		spec.precision = i;
	/* leading space padding */
	spec.field_width -= spec.precision;
	if (!(spec.flags & (ZEROPAD+LEFT))) {
		while (--spec.field_width >= 0) {
			if (buf < end)
				*buf = ' ';
			++buf;
		}
	}
	/* sign */
	if (sign) {
		if (buf < end)
			*buf = sign;
		++buf;
	}
	/* "0x" / "0" prefix */
	if (need_pfx) {
		if (buf < end)
			*buf = '0';
		++buf;
		if (spec.base == 16) {
			if (buf < end)
				*buf = ('X' | locase);
			++buf;
		}
	}
	/* zero or space padding */
	if (!(spec.flags & LEFT)) {
		char c = (spec.flags & ZEROPAD) ? '0' : ' ';
		while (--spec.field_width >= 0) {
			if (buf < end)
				*buf = c;
			++buf;
		}
	}
	/* hmm even more zero padding? */
	while (i <= --spec.precision) {
		if (buf < end)
			*buf = '0';
		++buf;
	}
	/* actual digits of result */
	while (--i >= 0) {
		if (buf < end)
			*buf = tmp[i];
		++buf;
	}
	/* trailing space padding */
	while (--spec.field_width >= 0) {
		if (buf < end)
			*buf = ' ';
		++buf;
	}

	return buf;
}

#include "test_main.c"

Reply via email to