Re: Some optimisations for numeric division

Dean Rasheed Fri, 25 Feb 2022 02:46:01 -0800

On Wed, 23 Feb 2022 at 22:55, Tom Lane <t...@sss.pgh.pa.us> wrote:
>
> Dean Rasheed <dean.a.rash...@gmail.com> writes:
>
> > One thought that occurred to me was that it's a bit silly that
> > exp_var() and ln_var() have to use a NumericVar for what could just be
> > an int, if we had a div_var_int() function that could divide by an
> > int. Then both div_var() and div_var_fast() could hand off to it for
> > one and two digit divisors.
>
> Oooh, that seems like a good idea.
>


OK, I've replaced the 0003 patch with one that does that instead. The
div_var_int() API is slightly different in that it also accepts a
divisor weight argument, but the alternative would have been for the
callers to have to adjust both the result weight and rscale, which
would have been uglier.

There's a large block of code in div_var() that needs re-indenting,
but I think it would be better to leave that to a later pgindent run.

The performance results are quite pleasing. It's slightly faster than
the old one-digit div_var() code because it manages to avoid some
digit array copying, and for two digit divisors it's much faster:

CREATE TEMP TABLE div_test(x numeric, y numeric);
SELECT setseed(0);
INSERT INTO div_test
  SELECT (SELECT (((x%9)+1)||string_agg((random()*9)::int::text, ''))::numeric
          FROM generate_series(1,50)),
         (SELECT ('1.'||string_agg((random()*9)::int::text,
'')||(x%10)||'e3')::numeric
          FROM generate_series(1,6))
  FROM generate_series(1,5000) g(x);
select * from div_test limit 10;

SELECT sum(t1.x/t2.y) FROM div_test t1, div_test t2;

Time: 11600.034 ms  (HEAD)
Time: 9890.788 ms   (with 0002)
Time: 6202.851 ms   (with 0003)

And obviously it'll be a larger relative gain for div_var_fast(),
since that was slower to begin with in such cases.

This makes me think that it might also be worthwhile to follow this
with a similar div_var_int64() function on platforms with 128-bit
integers, which could then be used to handle 3- and 4-digit divisors,
which are probably quite common in practice.

Attached is the updated patch series (0001 and 0002 unchanged).

Regards,
Dean

From 481b7c0044afbe72adff1961624a1bb515791b96 Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Sun, 20 Feb 2022 17:15:49 +0000
Subject: [PATCH 2/3] Simplify the inner loop of numeric division in div_var().

In the standard numeric division algorithm, the inner loop multiplies
the divisor by the next quotient digit and subtracts that from the
working dividend. As suggested by the original code comment, the
separate "carry" and "borrow" variables (from the multiplication and
subtraction steps respectively) can be folded together into a single
variable. Doing so significantly improves performance, as well as
simplifying the code.
---
 src/backend/utils/adt/numeric.c | 36 ++++++++++++++-------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index fc43d2a456..909f4eed74 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -8605,31 +8605,25 @@ div_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 			/* As above, need do nothing more when quotient digit is 0 */
 			if (qhat > 0)
 			{
+				NumericDigit *dividend_j = &dividend[j];
+
 				/*
 				 * Multiply the divisor by qhat, and subtract that from the
-				 * working dividend.  "carry" tracks the multiplication,
-				 * "borrow" the subtraction (could we fold these together?)
+				 * working dividend.  The multiplication and subtraction are
+				 * folded together here, noting that qhat <= NBASE (since it
+				 * might be one too large), and so the intermediate result
+				 * "tmp_result" is in the range [-NBASE^2, NBASE - 1], and
+				 * "borrow" is in the range [0, NBASE].
 				 */
-				carry = 0;
 				borrow = 0;
 				for (i = var2ndigits; i >= 0; i--)
 				{
-					carry += divisor[i] * qhat;
-					borrow -= carry % NBASE;
-					carry = carry / NBASE;
-					borrow += dividend[j + i];
-					if (borrow < 0)
-					{
-						dividend[j + i] = borrow + NBASE;
-						borrow = -1;
-					}
-					else
-					{
-						dividend[j + i] = borrow;
-						borrow = 0;
-					}
+					int			tmp_result;
+
+					tmp_result = dividend_j[i] - borrow - divisor[i] * qhat;
+					borrow = (NBASE - 1 - tmp_result) / NBASE;
+					dividend_j[i] = tmp_result + borrow * NBASE;
 				}
-				Assert(carry == 0);
 
 				/*
 				 * If we got a borrow out of the top dividend digit, then
@@ -8645,15 +8639,15 @@ div_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 					carry = 0;
 					for (i = var2ndigits; i >= 0; i--)
 					{
-						carry += dividend[j + i] + divisor[i];
+						carry += dividend_j[i] + divisor[i];
 						if (carry >= NBASE)
 						{
-							dividend[j + i] = carry - NBASE;
+							dividend_j[i] = carry - NBASE;
 							carry = 1;
 						}
 						else
 						{
-							dividend[j + i] = carry;
+							dividend_j[i] = carry;
 							carry = 0;
 						}
 					}
-- 
2.26.2

From 41732ad9a44dcd12e52d823fb59cb23cce4fe217 Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Sun, 20 Feb 2022 12:45:01 +0000
Subject: [PATCH 1/3] Apply auto-vectorization to the inner loop of
 div_var_fast().

This loop is basically the same as the inner loop of mul_var(), which
was auto-vectorized in commit 8870917623, but the compiler will only
consider auto-vectorizing the div_var_fast() loop if the assignment
target div[qi + i] is replaced by div_qi[i], where div_qi = &div[qi].

Additionally, since the compiler doesn't know that qdigit is
guaranteed to fit in a 16-bit NumericDigit, cast it to NumericDigit
before multiplying to make the resulting auto-vectorized code more
efficient.
---
 src/backend/utils/adt/numeric.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index 3208789f75..fc43d2a456 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -8908,13 +8908,22 @@ div_var_fast(const NumericVar *var1, const NumericVar *var2,
 			 * which would make the new value simply div[qi] mod vardigits[0].
 			 * The lower-order terms in qdigit can change this result by not
 			 * more than about twice INT_MAX/NBASE, so overflow is impossible.
+			 *
+			 * This inner loop is the performance bottleneck for division, so
+			 * code it in the same way as the inner loop of mul_var() so that
+			 * it can be auto-vectorized.  We cast qdigit to NumericDigit
+			 * before multiplying to allow the compiler to generate more
+			 * efficient code (using 16-bit multiplication), which is safe
+			 * since we know that the quotient digit is off by at most one, so
+			 * there is no overflow risk.
 			 */
 			if (qdigit != 0)
 			{
 				int			istop = Min(var2ndigits, div_ndigits - qi + 1);
+				int		   *div_qi = &div[qi];
 
 				for (i = 0; i < istop; i++)
-					div[qi + i] -= qdigit * var2digits[i];
+					div_qi[i] -= ((NumericDigit) qdigit) * var2digits[i];
 			}
 		}
 
-- 
2.26.2

From 43061400317d48c95689961a4e8c5050ab581410 Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Sun, 20 Feb 2022 18:10:25 +0000
Subject: [PATCH 3/3] Optimise numeric division for one and two base-NBASE
 digit divisors.

Formerly div_var() had "fast path" short division code that was
significantly faster when the divisor was just one base-NBASE digit,
but otherwise used long division.

This commit adds a new function div_var_int() that divides by an
arbitrary 32-bit integer, using the fast short division algorithm, and
updates both div_var() and div_var_fast() to use it for one and two
digit divisors. In the case of div_var(), this is slightly faster in
the one-digit case, because it avoids some digit array copying, and is
much faster in the two-digit case where it replaces long division. For
div_var_fast(), it is much faster in both cases because div_var_fast()
is optimised for larger inputs.

Additionally, optimise exp() and ln() by using div_var_int(), allowing
a NumericVar to be replaced by an int variable in a couple of places,
most notably in the Taylor series code. This produces a significant
speedup of exp(), ln() and the numeric-big regression test.
---
 src/backend/utils/adt/numeric.c | 223 ++++++++++++++++++++++++++------
 1 file changed, 180 insertions(+), 43 deletions(-)

diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index 909f4eed74..5b8c009f44 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -551,6 +551,8 @@ static void div_var(const NumericVar *var1, const NumericVar *var2,
 					int rscale, bool round);
 static void div_var_fast(const NumericVar *var1, const NumericVar *var2,
 						 NumericVar *result, int rscale, bool round);
+static void div_var_int(const NumericVar *var, int ival, int ival_weight,
+						NumericVar *result, int rscale, bool round);
 static int	select_div_scale(const NumericVar *var1, const NumericVar *var2);
 static void mod_var(const NumericVar *var1, const NumericVar *var2,
 					NumericVar *result);
@@ -8451,8 +8453,33 @@ div_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 				 errmsg("division by zero")));
 
 	/*
-	 * Now result zero check
+	 * If the divisor has just one or two digits, delegate to div_var_int(),
+	 * which uses fast short division.
 	 */
+	if (var2ndigits <= 2)
+	{
+		int			idivisor;
+		int			idivisor_weight;
+
+		idivisor = var2->digits[0];
+		idivisor_weight = var2->weight;
+		if (var2ndigits == 2)
+		{
+			idivisor = idivisor * NBASE + var2->digits[1];
+			idivisor_weight--;
+		}
+		if (var2->sign == NUMERIC_NEG)
+			idivisor = -idivisor;
+
+		div_var_int(var1, idivisor, idivisor_weight, result, rscale, round);
+		return;
+	}
+
+	/*
+	 * Otherwise, perform full long division.
+	 */
+
+	/* Result zero check */
 	if (var1ndigits == 0)
 	{
 		zero_var(result);
@@ -8510,23 +8537,6 @@ div_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 	alloc_var(result, res_ndigits);
 	res_digits = result->digits;
 
-	if (var2ndigits == 1)
-	{
-		/*
-		 * If there's only a single divisor digit, we can use a fast path (cf.
-		 * Knuth section 4.3.1 exercise 16).
-		 */
-		divisor1 = divisor[1];
-		carry = 0;
-		for (i = 0; i < res_ndigits; i++)
-		{
-			carry = carry * NBASE + dividend[i + 1];
-			res_digits[i] = carry / divisor1;
-			carry = carry % divisor1;
-		}
-	}
-	else
-	{
 		/*
 		 * The full multiple-place algorithm is taken from Knuth volume 2,
 		 * Algorithm 4.3.1D.
@@ -8659,7 +8669,6 @@ div_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 			/* And we're done with this quotient digit */
 			res_digits[j] = qhat;
 		}
-	}
 
 	pfree(dividend);
 
@@ -8735,8 +8744,33 @@ div_var_fast(const NumericVar *var1, const NumericVar *var2,
 				 errmsg("division by zero")));
 
 	/*
-	 * Now result zero check
+	 * If the divisor has just one or two digits, delegate to div_var_int(),
+	 * which uses fast short division.
 	 */
+	if (var2ndigits <= 2)
+	{
+		int			idivisor;
+		int			idivisor_weight;
+
+		idivisor = var2->digits[0];
+		idivisor_weight = var2->weight;
+		if (var2ndigits == 2)
+		{
+			idivisor = idivisor * NBASE + var2->digits[1];
+			idivisor_weight--;
+		}
+		if (var2->sign == NUMERIC_NEG)
+			idivisor = -idivisor;
+
+		div_var_int(var1, idivisor, idivisor_weight, result, rscale, round);
+		return;
+	}
+
+	/*
+	 * Otherwise, perform full long division.
+	 */
+
+	/* Result zero check */
 	if (var1ndigits == 0)
 	{
 		zero_var(result);
@@ -9008,6 +9042,118 @@ div_var_fast(const NumericVar *var1, const NumericVar *var2,
 }
 
 
+/*
+ * div_var_int() -
+ *
+ *	Divide a numeric variable by a 32-bit integer with the specified weight.
+ *	The quotient var / (ival * NBASE^ival_weight) is stored in result.
+ */
+static void
+div_var_int(const NumericVar *var, int ival, int ival_weight,
+			NumericVar *result, int rscale, bool round)
+{
+	NumericDigit *var_digits = var->digits;
+	int			var_ndigits = var->ndigits;
+	int			res_sign;
+	int			res_weight;
+	int			res_ndigits;
+	NumericDigit *res_buf;
+	NumericDigit *res_digits;
+	uint		divisor;
+	int			i;
+
+	/* Guard against division by zero */
+	if (ival == 0)
+		ereport(ERROR,
+				errcode(ERRCODE_DIVISION_BY_ZERO),
+				errmsg("division by zero"));
+
+	/* Result zero check */
+	if (var_ndigits == 0)
+	{
+		zero_var(result);
+		result->dscale = rscale;
+		return;
+	}
+
+	/*
+	 * Determine the result sign, weight and number of digits to calculate.
+	 * The weight figured here is correct if the emitted quotient has no
+	 * leading zero digits; otherwise strip_var() will fix things up.
+	 */
+	if (var->sign == NUMERIC_POS)
+		res_sign = ival > 0 ? NUMERIC_POS : NUMERIC_NEG;
+	else
+		res_sign = ival > 0 ? NUMERIC_NEG : NUMERIC_POS;
+	res_weight = var->weight - ival_weight;
+	/* The number of accurate result digits we need to produce: */
+	res_ndigits = res_weight + 1 + (rscale + DEC_DIGITS - 1) / DEC_DIGITS;
+	/* ... but always at least 1 */
+	res_ndigits = Max(res_ndigits, 1);
+	/* If rounding needed, figure one more digit to ensure correct result */
+	if (round)
+		res_ndigits++;
+
+	res_buf = digitbuf_alloc(res_ndigits + 1);
+	res_buf[0] = 0;				/* spare digit for later rounding */
+	res_digits = res_buf + 1;
+
+	/*
+	 * Now compute the quotient digits.  This is the short division algorithm
+	 * described in Knuth volume 2, section 4.3.1 exercise 16, except that we
+	 * allow the divisor to exceed the internal base.
+	 *
+	 * In this algorithm, the carry from one digit to the next is at most
+	 * divisor - 1.  Therefore, while processing the next digit, carry may
+	 * become as large as divisor * NBASE - 1, and so it requires a 64-bit
+	 * integer if this exceeds UINT_MAX.
+	 */
+	divisor = Abs(ival);
+
+	if (divisor <= UINT_MAX / NBASE)
+	{
+		/* carry cannot overflow 32 bits */
+		uint32		carry = 0;
+
+		for (i = 0; i < res_ndigits; i++)
+		{
+			carry = carry * NBASE + (i < var_ndigits ? var_digits[i] : 0);
+			res_digits[i] = (NumericDigit) (carry / divisor);
+			carry = carry % divisor;
+		}
+	}
+	else
+	{
+		/* carry may exceed 32 bits */
+		uint64		carry = 0;
+
+		for (i = 0; i < res_ndigits; i++)
+		{
+			carry = carry * NBASE + (i < var_ndigits ? var_digits[i] : 0);
+			res_digits[i] = (NumericDigit) (carry / divisor);
+			carry = carry % divisor;
+		}
+	}
+
+	/* Store the quotient in result */
+	digitbuf_free(result->buf);
+	result->ndigits = res_ndigits;
+	result->buf = res_buf;
+	result->digits = res_digits;
+	result->weight = res_weight;
+	result->sign = res_sign;
+
+	/* Round or truncate to target rscale (and set result->dscale) */
+	if (round)
+		round_var(result, rscale);
+	else
+		trunc_var(result, rscale);
+
+	/* Strip leading/trailing zeroes */
+	strip_var(result);
+}
+
+
 /*
  * Default scale selection for division
  *
@@ -9783,7 +9929,7 @@ exp_var(const NumericVar *arg, NumericVar *result, int rscale)
 {
 	NumericVar	x;
 	NumericVar	elem;
-	NumericVar	ni;
+	int			ni;
 	double		val;
 	int			dweight;
 	int			ndiv2;
@@ -9792,7 +9938,6 @@ exp_var(const NumericVar *arg, NumericVar *result, int rscale)
 
 	init_var(&x);
 	init_var(&elem);
-	init_var(&ni);
 
 	set_var_from_var(arg, &x);
 
@@ -9820,15 +9965,13 @@ exp_var(const NumericVar *arg, NumericVar *result, int rscale)
 
 	/*
 	 * Reduce x to the range -0.01 <= x <= 0.01 (approximately) by dividing by
-	 * 2^n, to improve the convergence rate of the Taylor series.
+	 * 2^ndiv2, to improve the convergence rate of the Taylor series.
+	 *
+	 * Note that the overflow check above ensures that Abs(x) < 6000, which
+	 * means that ndiv2 <= 20 here.
 	 */
 	if (Abs(val) > 0.01)
 	{
-		NumericVar	tmp;
-
-		init_var(&tmp);
-		set_var_from_var(&const_two, &tmp);
-
 		ndiv2 = 1;
 		val /= 2;
 
@@ -9836,13 +9979,10 @@ exp_var(const NumericVar *arg, NumericVar *result, int rscale)
 		{
 			ndiv2++;
 			val /= 2;
-			add_var(&tmp, &tmp, &tmp);
 		}
 
 		local_rscale = x.dscale + ndiv2;
-		div_var_fast(&x, &tmp, &x, local_rscale, true);
-
-		free_var(&tmp);
+		div_var_int(&x, 1 << ndiv2, 0, &x, local_rscale, true);
 	}
 	else
 		ndiv2 = 0;
@@ -9870,16 +10010,16 @@ exp_var(const NumericVar *arg, NumericVar *result, int rscale)
 	add_var(&const_one, &x, result);
 
 	mul_var(&x, &x, &elem, local_rscale);
-	set_var_from_var(&const_two, &ni);
-	div_var_fast(&elem, &ni, &elem, local_rscale, true);
+	ni = 2;
+	div_var_int(&elem, ni, 0, &elem, local_rscale, true);
 
 	while (elem.ndigits != 0)
 	{
 		add_var(result, &elem, result);
 
 		mul_var(&elem, &x, &elem, local_rscale);
-		add_var(&ni, &const_one, &ni);
-		div_var_fast(&elem, &ni, &elem, local_rscale, true);
+		ni++;
+		div_var_int(&elem, ni, 0, &elem, local_rscale, true);
 	}
 
 	/*
@@ -9899,7 +10039,6 @@ exp_var(const NumericVar *arg, NumericVar *result, int rscale)
 
 	free_var(&x);
 	free_var(&elem);
-	free_var(&ni);
 }
 
 
@@ -9993,7 +10132,7 @@ ln_var(const NumericVar *arg, NumericVar *result, int rscale)
 {
 	NumericVar	x;
 	NumericVar	xx;
-	NumericVar	ni;
+	int			ni;
 	NumericVar	elem;
 	NumericVar	fact;
 	int			nsqrt;
@@ -10012,7 +10151,6 @@ ln_var(const NumericVar *arg, NumericVar *result, int rscale)
 
 	init_var(&x);
 	init_var(&xx);
-	init_var(&ni);
 	init_var(&elem);
 	init_var(&fact);
 
@@ -10073,13 +10211,13 @@ ln_var(const NumericVar *arg, NumericVar *result, int rscale)
 	set_var_from_var(result, &xx);
 	mul_var(result, result, &x, local_rscale);
 
-	set_var_from_var(&const_one, &ni);
+	ni = 1;
 
 	for (;;)
 	{
-		add_var(&ni, &const_two, &ni);
+		ni += 2;
 		mul_var(&xx, &x, &xx, local_rscale);
-		div_var_fast(&xx, &ni, &elem, local_rscale, true);
+		div_var_int(&xx, ni, 0, &elem, local_rscale, true);
 
 		if (elem.ndigits == 0)
 			break;
@@ -10095,7 +10233,6 @@ ln_var(const NumericVar *arg, NumericVar *result, int rscale)
 
 	free_var(&x);
 	free_var(&xx);
-	free_var(&ni);
 	free_var(&elem);
 	free_var(&fact);
 }
-- 
2.26.2

Re: Some optimisations for numeric division

Reply via email to