commit f3b969b6991f154a1fde1ea6b4488320ed0b486f
Author:     Mattias Andrée <[email protected]>
AuthorDate: Tue Mar 15 11:40:46 2016 +0100
Commit:     Mattias Andrée <[email protected]>
CommitDate: Tue Mar 15 11:40:46 2016 +0100

    Optimise zsetup, zgcd, zmul, and zsqr and add -flto
    
    Signed-off-by: Mattias Andrée <[email protected]>

diff --git a/TODO b/TODO
index 23edfbf..603fabb 100644
--- a/TODO
+++ b/TODO
@@ -8,3 +8,10 @@ Add zranddist value based on % for fitness to bound
 Test big endian
 Test always having used > 0 for zero
   Test negative/non-negative instead of sign
+
+Test optimisation of zmul:
+  bc = [(Hb * Hc) << (m2 << 1)]
+     + [(Hb * Hc) << m2]
+     - [(Hb - Lb)(Hc - Lc) << m2]
+     + [(Lb * Lc) << m2]
+     + (Lb * Lc)
diff --git a/bench/benchmark.c b/bench/benchmark.c
index 7b987e2..d756db8 100644
--- a/bench/benchmark.c
+++ b/bench/benchmark.c
@@ -85,14 +85,18 @@ main(int argc, char *argv[])
        BENCHMARK(zlsh(c, a, 76), 1);
        BENCHMARK(zrsh(c, a, 76), 1);
        BENCHMARK(ztrunc(c, a, 76), 1);
+       BENCHMARK(ztrunc(c, c, 76), 1);
        BENCHMARK(zsplit(c, d, a, 76), 1);
        BENCHMARK(zcmpmag(a, b), 1);
        BENCHMARK(zcmp(a, b), 1);
        BENCHMARK(zcmpi(a, 1000000000LL), 1);
        BENCHMARK(zcmpu(a, 1000000000ULL), 1);
        BENCHMARK(zbset(c, a, 76, 1), 1);
+       BENCHMARK(zbset(a, a, 76, 1), 1);
        BENCHMARK(zbset(c, a, 76, 0), 1);
+       BENCHMARK(zbset(c, c, 76, 0), 1);
        BENCHMARK(zbset(c, a, 76, -1), 1);
+       BENCHMARK(zbset(a, a, 76, -1), 1);
        BENCHMARK(zbtest(a, 76), 1);
        BENCHMARK(zgcd(c, a, b), 0);
        BENCHMARK(zmul(c, a, b), 0);
diff --git a/config.mk b/config.mk
index cad12d8..1e43a43 100644
--- a/config.mk
+++ b/config.mk
@@ -14,5 +14,5 @@ RANLIB = ranlib
 # you have to add -DSECURE_RANDOM_PATHNAME=... to CPPFLAGS.
 
 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700
-CFLAGS   = -std=c99 -O3 -Wall -pedantic
+CFLAGS   = -std=c99 -flto -O3 -Wall -pedantic
 LDFLAGS  = -s
diff --git a/src/internals.h b/src/internals.h
index c859ce1..456a2df 100644
--- a/src/internals.h
+++ b/src/internals.h
@@ -45,29 +45,29 @@
 #endif
 
 #define LIST_TEMPS\
-       X(libzahl_tmp_cmp)\
-       X(libzahl_tmp_str_num)\
-       X(libzahl_tmp_str_mag)\
-       X(libzahl_tmp_str_div)\
-       X(libzahl_tmp_str_rem)\
-       X(libzahl_tmp_gcd_u)\
-       X(libzahl_tmp_gcd_v)\
-       X(libzahl_tmp_sub)\
-       X(libzahl_tmp_modmul)\
-       X(libzahl_tmp_div)\
-       X(libzahl_tmp_mod)\
-       X(libzahl_tmp_pow_b)\
-       X(libzahl_tmp_pow_c)\
-       X(libzahl_tmp_pow_d)\
-       X(libzahl_tmp_modsqr)\
-       X(libzahl_tmp_divmod_a)\
-       X(libzahl_tmp_divmod_b)\
-       X(libzahl_tmp_divmod_d)\
-       X(libzahl_tmp_ptest_x)\
-       X(libzahl_tmp_ptest_a)\
-       X(libzahl_tmp_ptest_d)\
-       X(libzahl_tmp_ptest_n1)\
-       X(libzahl_tmp_ptest_n4)
+       X(libzahl_tmp_cmp, 1)\
+       X(libzahl_tmp_str_num, 0)\
+       X(libzahl_tmp_str_mag, 0)\
+       X(libzahl_tmp_str_div, 0)\
+       X(libzahl_tmp_str_rem, 0)\
+       X(libzahl_tmp_gcd_u, 0)\
+       X(libzahl_tmp_gcd_v, 0)\
+       X(libzahl_tmp_sub, 0)\
+       X(libzahl_tmp_modmul, 0)\
+       X(libzahl_tmp_div, 0)\
+       X(libzahl_tmp_mod, 0)\
+       X(libzahl_tmp_pow_b, 0)\
+       X(libzahl_tmp_pow_c, 0)\
+       X(libzahl_tmp_pow_d, 0)\
+       X(libzahl_tmp_modsqr, 0)\
+       X(libzahl_tmp_divmod_a, 0)\
+       X(libzahl_tmp_divmod_b, 0)\
+       X(libzahl_tmp_divmod_d, 0)\
+       X(libzahl_tmp_ptest_x, 0)\
+       X(libzahl_tmp_ptest_a, 0)\
+       X(libzahl_tmp_ptest_d, 0)\
+       X(libzahl_tmp_ptest_n1, 0)\
+       X(libzahl_tmp_ptest_n4, 0)
 
 #define LIST_CONSTS\
        X(libzahl_const_1e19, zsetu, 10000000000000000000ULL) /* The largest 
power of 10 < 2⁶⁴. */\
@@ -75,7 +75,7 @@
        X(libzahl_const_2,    zsetu, 2)\
        X(libzahl_const_4,    zsetu, 4)
 
-#define X(x)  extern z_t x;
+#define X(x, s)  extern z_t x;
 LIST_TEMPS
 #undef X
 #define X(x, f, v)  extern z_t x;
@@ -185,3 +185,50 @@ libzahl_add_overflow(zahl_char_t *rp, zahl_char_t a, 
zahl_char_t b)
        return carry;
 }
 #endif
+
+static inline void
+zrsh_taint(z_t a, size_t bits)
+{
+       size_t i, chars, cbits;
+
+       if (unlikely(!bits))
+               return;
+       if (unlikely(zzero(a)))
+               return;
+
+       chars = FLOOR_BITS_TO_CHARS(bits);
+
+       if (unlikely(chars >= a->used || zbits(a) <= bits)) {
+               SET_SIGNUM(a, 0);
+               return;
+       }
+
+       bits = BITS_IN_LAST_CHAR(bits);
+       cbits = BITS_PER_CHAR - bits;
+
+       if (likely(chars)) {
+               a->used -= chars;
+               a->chars += chars;
+       }
+
+       if (unlikely(bits)) { /* This if statement is very important in C. */
+               a->chars[0] >>= bits;
+               for (i = 1; i < a->used; i++) {
+                       a->chars[i - 1] |= a->chars[i] << cbits;
+                       a->chars[i] >>= bits;
+               }
+               TRIM_NONZERO(a);
+       }
+}
+
+static inline void
+zswap_tainted_unsigned(z_t a, z_t b)
+{
+       z_t t;
+       t->used = b->used;
+       b->used = a->used;
+       a->used = t->used;
+       t->chars = b->chars;
+       b->chars = a->chars;
+       a->chars = t->chars;
+}
diff --git a/src/zadd.c b/src/zadd.c
index 557ec6f..b730b81 100644
--- a/src/zadd.c
+++ b/src/zadd.c
@@ -72,6 +72,33 @@ zadd_unsigned(z_t a, z_t b, z_t c)
 }
 
 void
+zadd_unsigned_assign(z_t a, z_t b)
+{
+       size_t size, n;
+
+       if (unlikely(zzero(a))) {
+               zabs(a, b);
+               return;
+       } else if (unlikely(zzero(b))) {
+               return;
+       }
+
+       size = MAX(a->used, b->used);
+       n = a->used + b->used - size;
+
+       ENSURE_SIZE(a, size + 1);
+       a->chars[size] = 0;
+
+       if (a->used < b->used) {
+               n = b->used;
+               zmemset(a->chars + a->used, 0, n - a->used);
+       }
+       zadd_impl(a, b, n);
+
+       SET_SIGNUM(a, 1);
+}
+
+void
 zadd(z_t a, z_t b, z_t c)
 {
        if (unlikely(zzero(b))) {
diff --git a/src/zgcd.c b/src/zgcd.c
index 7c6f2bc..502a779 100644
--- a/src/zgcd.c
+++ b/src/zgcd.c
@@ -12,14 +12,11 @@ zgcd(z_t a, z_t b, z_t c)
         * Binary GCD algorithm.
         */
 
-       size_t shifts = 0, i = 0, min;
-       zahl_char_t uv, bit;
-       int neg;
+       size_t shifts;
+       zahl_char_t *u_orig, *v_orig;
+       size_t u_lsb, v_lsb;
+       int neg, cmpmag;
 
-       if (unlikely(!zcmp(b, c))) {
-               SET(a, b);
-               return;
-       }
        if (unlikely(zzero(b))) {
                SET(a, c);
                return;
@@ -29,37 +26,30 @@ zgcd(z_t a, z_t b, z_t c)
                return;
        }
 
-       zabs(u, b);
-       zabs(v, c);
        neg = znegative2(b, c);
 
-       min = MIN(u->used, v->used);
-       for (; i < min; i++) {
-               uv = u->chars[i] | v->chars[i];
-               for (bit = 1; bit; bit <<= 1, shifts++)
-                       if (uv & bit)
-                               goto loop_done;
+       u_lsb = zlsb(b);
+       v_lsb = zlsb(c);
+       shifts = MIN(u_lsb, v_lsb);
+       zrsh(u, b, u_lsb);
+       zrsh(v, c, v_lsb);
+
+       u_orig = u->chars;
+       v_orig = v->chars;
+
+       for (;;) {
+               if (likely((cmpmag = zcmpmag(u, v)) >= 0)) {
+                       if (unlikely(cmpmag == 0))
+                               break;
+                       zswap_tainted_unsigned(u, v);
+               }
+               zsub_positive_assign(v, u);
+               zrsh_taint(v, zlsb(v));
        }
-       for (; i < u->used; i++)
-               for (bit = 1; bit; bit <<= 1, shifts++)
-                       if (u->chars[i] & bit)
-                               goto loop_done;
-       for (; i < v->used; i++)
-               for (bit = 1; bit; bit <<= 1, shifts++)
-                       if (v->chars[i] & bit)
-                               goto loop_done;
-loop_done:
-       zrsh(u, u, shifts);
-       zrsh(v, v, shifts);
-
-       zrsh(u, u, zlsb(u));
-       do {
-               zrsh(v, v, zlsb(v));
-               if (zcmpmag(u, v) > 0) /* Both are non-negative. */
-                       zswap(u, v);
-               zsub_unsigned(v, v, u);
-       } while (!zzero(v));
 
        zlsh(a, u, shifts);
        SET_SIGNUM(a, neg ? -1 : 1);
+
+       u->chars = u_orig;
+       v->chars = v_orig;
 }
diff --git a/src/zmul.c b/src/zmul.c
index ae02844..ab41213 100644
--- a/src/zmul.c
+++ b/src/zmul.c
@@ -57,39 +57,22 @@ zmul(z_t a, z_t b, z_t c)
        zsplit(b_high, b_low, b, m2);
        zsplit(c_high, c_low, c, m2);
 
-#if 1
+
        zmul(z0, b_low, c_low);
        zmul(z2, b_high, c_high);
-       zadd(b_low, b_low, b_high);
-       zadd(c_low, c_low, c_high);
+       zadd_unsigned_assign(b_low, b_high);
+       zadd_unsigned_assign(c_low, c_high);
        zmul(z1, b_low, c_low);
 
-       zsub(z1, z1, z0);
-       zsub(z1, z1, z2);
+       zsub_nonnegative_assign(z1, z0);
+       zsub_nonnegative_assign(z1, z2);
 
        zlsh(z1, z1, m2);
        m2 <<= 1;
-       zlsh(z2, z2, m2);
-
-       zadd(a, z2, z1);
-       zadd(a, a, z0);
-#else
-       zmul(z0, b_low, c_low);
-       zmul(z2, b_high, c_high);
-       zsub(b_low, b_high, b_low);
-       zsub(c_low, c_high, c_low);
-       zmul(z1, b_low, c_low);
-
-       zlsh(z0, z0, m2 + 1);
-       zlsh(z1, z1, m2);
        zlsh(a, z2, m2);
-       m2 <<= 1;
-       zlsh(z2, z2, m2);
-       zadd(z2, z2, a);
+       zadd_unsigned_assign(a, z1);
+       zadd_unsigned_assign(a, z0);
 
-       zsub(a, z2, z1);
-       zadd(a, a, z0);
-#endif
 
        zfree(z0);
        zfree(z1);
diff --git a/src/zsetup.c b/src/zsetup.c
index 921e509..10fc5f5 100644
--- a/src/zsetup.c
+++ b/src/zsetup.c
@@ -1,7 +1,7 @@
 /* See LICENSE file for copyright and license details. */
 #include "internals.h"
 
-#define X(x)  z_t x;
+#define X(x, s)  z_t x;
 LIST_TEMPS
 #undef X
 #define X(x, f, v)  z_t x;
@@ -30,8 +30,8 @@ zsetup(jmp_buf env)
                memset(libzahl_pool_n, 0, sizeof(libzahl_pool_n));
                memset(libzahl_pool_alloc, 0, sizeof(libzahl_pool_alloc));
 
-#define X(x)\
-               zinit(x), zsetu(x, 1);
+#define X(x, s)\
+               zinit(x); if (s) zsetu(x, 1);
                LIST_TEMPS;
 #undef X
 #define X(x, f, v)\
diff --git a/src/zsqr.c b/src/zsqr.c
index bd03128..68480ba 100644
--- a/src/zsqr.c
+++ b/src/zsqr.c
@@ -42,32 +42,17 @@ zsqr(z_t a, z_t b)
 
        zsplit(high, low, b, m2);
 
-#if 1
+
        zsqr(z0, low);
        zsqr(z2, high);
        zmul(z1, low, high);
 
        zlsh(z1, z1, m2 + 1);
        m2 <<= 1;
-       zlsh(z2, z2, m2);
-
-       zadd(a, z2, z1);
-       zadd(a, a, z0);
-#else
-       zsqr(z0, low);
-       zsqr(z2, high);
-       zmul(z1, low, low);
-
-       zlsh(z0, z0, m2 + 1);
-       zlsh(z1, z1, m2 + 1);
        zlsh(a, z2, m2);
-       m2 <<= 1;
-       zlsh(z2, z2, m2);
-       zadd(z2, z2, a);
+       zadd_unsigned_assign(a, z1);
+       zadd_unsigned_assign(a, z0);
 
-       zsub(a, z2, z1);
-       zadd(a, a, z0);
-#endif
 
        zfree(z0);
        zfree(z1);
diff --git a/src/zsub.c b/src/zsub.c
index b3f12f2..259526f 100644
--- a/src/zsub.c
+++ b/src/zsub.c
@@ -46,7 +46,7 @@ libzahl_zsub_unsigned(z_t a, z_t b, z_t c)
                        SET_SIGNUM(a, 0);
                        return;
                }
-               n = MIN(b->used, c->used);
+               n = b->used;
                if (a == b) {
                        zset(libzahl_tmp_sub, b);
                        SET(a, c);
@@ -56,7 +56,7 @@ libzahl_zsub_unsigned(z_t a, z_t b, z_t c)
                        zsub_impl(a, b, n);
                }
        } else {
-               n = MIN(b->used, c->used);
+               n = c->used;
                if (unlikely(a == c)) {
                        zset(libzahl_tmp_sub, c);
                        SET(a, b);
@@ -77,6 +77,24 @@ zsub_unsigned(z_t a, z_t b, z_t c)
 }
 
 void
+zsub_nonnegative_assign(z_t a, z_t b)
+{
+       if (unlikely(zzero(b))) {
+               zabs(a, a);
+       } else if (unlikely(!zcmpmag(a, b))) {
+               SET_SIGNUM(a, 0);
+       } else {
+               zsub_impl(a, b, b->used);
+       }
+}
+
+void
+zsub_positive_assign(z_t a, z_t b)
+{
+       zsub_impl(a, b, b->used);
+}
+
+void
 zsub(z_t a, z_t b, z_t c)
 {
        if (unlikely(zzero(b))) {
diff --git a/src/zunsetup.c b/src/zunsetup.c
index 9926354..ba84dd7 100644
--- a/src/zunsetup.c
+++ b/src/zunsetup.c
@@ -8,7 +8,7 @@ zunsetup(void)
        size_t i;
        if (libzahl_set_up) {
                libzahl_set_up = 0;
-#define X(x)\
+#define X(x, s)\
                free(x->chars);
                LIST_TEMPS;
 #undef X
diff --git a/zahl.h b/zahl.h
index e700dc7..cd41b7b 100644
--- a/zahl.h
+++ b/zahl.h
@@ -119,6 +119,9 @@ void zmodpowu(z_t, z_t, unsigned long long int, z_t);
 /* These are used internally and may be removed in a future version. */
 void zadd_unsigned(z_t, z_t, z_t);      /* a := |b| + |c| */
 void zsub_unsigned(z_t, z_t, z_t);      /* a := |b| - |c| */
+void zadd_unsigned_assign(z_t, z_t);    /* a := |a| + |b| */
+void zsub_nonnegative_assign(z_t, z_t); /* a := a - b, assuming a ≥ b ≥ 0 
*/
+void zsub_positive_assign(z_t, z_t);    /* a := a - b, assuming a > b > 0 */
 
 
 /* Bitwise operations. */

Reply via email to