[issue8692] Use divide-and-conquer for faster factorials

Alexander Belopolsky Thu, 13 May 2010 19:24:28 -0700

Alexander Belopolsky <belopol...@users.sourceforge.net> added the comment:

On Thu, May 13, 2010 at 5:58 PM, Mark Dickinson <rep...@bugs.python.org> wrote:
>Â Optimizations that speed up, say, factorial(n) for n <= 1000 would seem more 
>valuable.

I am attaching a variant of my patch which precomputes partial
products that fit in 32 bit unsigned int.  This results in speed up
over Daniel's code which varies from 1.8x for 20! down to 7% for 100!
and no measurable improvement for 1000!.

This optimization is orthogonal to the choice of partial_product
algorithm and can be easily extended on platforms with long long to
precompute 64 bit products.

----------
Added file: http://bugs.python.org/file17325/factorial-precompute-partials.patch

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue8692>
_______________________________________

Index: Modules/mathmodule.c
===================================================================
--- Modules/mathmodule.c        (revision 81150)
+++ Modules/mathmodule.c        (working copy)
@@ -1129,56 +1129,183 @@
 Return an accurate floating point sum of values in the iterable.\n\
 Assumes IEEE-754 floating point arithmetic.");
 
+/* Divide-and-conquer factorial algorithm
+ *
+ * Based on the formula and psuedo-code provided at:
+ * http://www.luschny.de/math/factorial/binarysplitfact.html
+ *
+ * Faster algorithms exist, but they're more complicated and depend on
+ * a fast prime factoriazation algorithm.
+ */
+
+static unsigned long 
+precomputed[] = {3, 15, 5, 35, 315, 63, 693, 9009, 1287, 19305, 328185, 36465, 
692835,
+                14549535, 1322685, 30421755, 760543875, 58503375, 
1579591125ul, 3053876175ul};
+/* i-th odd number */
+#define ODD(i) (((i) << 1) | 1) 
+/* Compute product(ODD(i) for i in range(first, last+1)) */
 static PyObject *
+factorial_partial_product(unsigned long first, unsigned long last)
+{
+    PyObject *result;
+    PyObject *a, *x, *y;
+    Py_ssize_t n, i;
+    
+    if (last < sizeof(precomputed)/sizeof(*precomputed)) {
+       size_t index = first + last - 2;
+       if (index <  sizeof(precomputed)/sizeof(*precomputed)) {
+           return PyLong_FromUnsignedLong(precomputed[index]);
+       }
+    }
+   
+    n = last - first + 1;
+    if (n <= 0)
+       return PyLong_FromLong(1L);
+    if (n == 1)
+       return PyLong_FromUnsignedLong(ODD(first));
+
+    /* a = [ODD(i) for i in range(first, last + 1)] */
+    a = PyList_New(last - first + 1);
+    if (a == NULL)
+       return NULL;
+    for  (i = 0; i < n; ++i) {
+       result = PyLong_FromUnsignedLong(ODD(first + i));
+       if (result == NULL)
+           goto done;
+       PyList_SET_ITEM(a, i, result);
+    }
+    while (n > 1) {
+       for (i = (n >> 1) - 1; i >= 0; --i) {
+           x = PyList_GET_ITEM(a, i);
+           y = PyList_GET_ITEM(a, n - i - 1);
+           result = PyNumber_Multiply(x, y);
+           if (result == NULL)
+               goto done;
+           PyList_SET_ITEM(a, i, result);
+           PyList_SET_ITEM(a, n - i - 1, NULL);
+           Py_DECREF(x);
+           Py_DECREF(y);
+       }
+       n = (n >> 1) + (n & 1);
+    }
+    /* Release reference held by the list */
+    PyList_SET_ITEM(a, 0, NULL);
+ done:
+    Py_DECREF(a);
+    return result;
+}
+
+static unsigned long
+bit_length(unsigned long n)
+{
+    unsigned long len = 0;
+    while (n != 0) {
+       ++len;
+       n >>= 1;
+    }
+    return len;
+}
+
+static unsigned long
+bit_count(unsigned long n)
+{
+    unsigned long count = 0;
+    while (n != 0) {
+       ++count;
+       n &= n - 1; /* clear least significant bit */
+    }
+    return count;
+}
+
+static PyObject *
+factorial_loop(unsigned long n)
+{
+    long i, v;
+    PyObject *p, *r;
+    PyObject *part, *tmp;
+
+    p = PyLong_FromLong(1L);
+    if (p == NULL)
+        return NULL;
+    Py_INCREF(p);
+    r = p;
+
+    for (i = bit_length(n) - 2; i >= 0; --i) {
+        v = n >> i;
+        if (v > 2) {
+           part = factorial_partial_product(((v >> 1) + 1) >> 1, (v - 1) >> 1);
+           if (part == NULL)
+               goto error;
+
+           tmp = PyNumber_Multiply(p, part);
+           Py_DECREF(part);
+           if (tmp == NULL)
+               goto error;
+           Py_DECREF(p);
+           p = tmp;
+
+           tmp = PyNumber_Multiply(r, p);
+           if (tmp == NULL)
+               goto error;
+           Py_DECREF(r);
+           r = tmp;
+       }
+    }
+    Py_DECREF(p);
+    return r;
+ error:
+    Py_DECREF(p);
+    Py_DECREF(r);
+    return NULL;
+}
+
+static PyObject *
 math_factorial(PyObject *self, PyObject *arg)
 {
-    long i, x;
-    PyObject *result, *iobj, *newresult;
+    PyObject *result;
 
+    PyObject *r; /* result without trailing zeros */
+    PyObject *ntz; /* number of trailing zeros */
+    long n;
+
     if (PyFloat_Check(arg)) {
-        PyObject *lx;
         double dx = PyFloat_AS_DOUBLE((PyFloatObject *)arg);
         if (!(Py_IS_FINITE(dx) && dx == floor(dx))) {
             PyErr_SetString(PyExc_ValueError,
                 "factorial() only accepts integral values");
             return NULL;
         }
-        lx = PyLong_FromDouble(dx);
-        if (lx == NULL)
+        n = (long) dx;
+    } else {
+        n = PyLong_AsLong(arg);
+        if (n == -1 && PyErr_Occurred())
             return NULL;
-        x = PyLong_AsLong(lx);
-        Py_DECREF(lx);
     }
-    else
-        x = PyLong_AsLong(arg);
 
-    if (x == -1 && PyErr_Occurred())
-        return NULL;
-    if (x < 0) {
+    if (n < 0) {
         PyErr_SetString(PyExc_ValueError,
-            "factorial() not defined for negative values");
+                        "factorial() not defined for negative values");
         return NULL;
     }
 
-    result = (PyObject *)PyLong_FromLong(1);
-    if (result == NULL)
+    if (n <= 12) {
+        static const long lookup[] = {1, 1, 2, 6, 24, 120, 720, 5040, 40320,
+                                      362880, 3628800, 39916800, 479001600};
+        return PyLong_FromLong(lookup[n]);
+    }
+
+    r = factorial_loop(n);
+    if (r == NULL)
         return NULL;
-    for (i=1 ; i<=x ; i++) {
-        iobj = (PyObject *)PyLong_FromLong(i);
-        if (iobj == NULL)
-            goto error;
-        newresult = PyNumber_Multiply(result, iobj);
-        Py_DECREF(iobj);
-        if (newresult == NULL)
-            goto error;
-        Py_DECREF(result);
-        result = newresult;
+
+    ntz = PyLong_FromLong(n - bit_count(n));
+    if (ntz != NULL) {
+       result = PyNumber_Lshift(r, ntz);
+       Py_DECREF(ntz);
     }
+
+    Py_DECREF(r);
     return result;
-
-error:
-    Py_DECREF(result);
-    return NULL;
 }
 
 PyDoc_STRVAR(math_factorial_doc,

_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

[issue8692] Use divide-and-conquer for faster factorials

Reply via email to