Author: cem
Date: Sat Apr 13 04:42:17 2019
New Revision: 346175
URL: https://svnweb.freebsd.org/changeset/base/346175

Log:
  sort(1): Memoize MD5 computation to reduce repeated computation
  
  Experimentally, reduces sort -R time of a 148160 line corpus from about
  3.15s to about 0.93s on this particular system.
  
  There's probably room for improvement using some digest other than md5, but
  I don't want to look at sort(1) anymore.  Some discussion of other possible
  improvements in the Test Plan section of the Differential.
  
  PR:           230792
  Reviewed by:  jhb (earlier version)
  Differential Revision:        https://reviews.freebsd.org/D19885

Modified:
  head/usr.bin/sort/coll.c
  head/usr.bin/sort/coll.h
  head/usr.bin/sort/sort.c

Modified: head/usr.bin/sort/coll.c
==============================================================================
--- head/usr.bin/sort/coll.c    Sat Apr 13 04:03:18 2019        (r346174)
+++ head/usr.bin/sort/coll.c    Sat Apr 13 04:42:17 2019        (r346175)
@@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2,
        return (numcoll_impl(kv1, kv2, offset, true));
 }
 
+/* Use hint space to memoize md5 computations, at least. */
+static void
+randomcoll_init_hint(struct key_value *kv, void *hash)
+{
+
+       memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
+       kv->hint->status = HS_INITIALIZED;
+}
+
 /*
  * Implements random sort (-R).
  */
@@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv
        struct bwstring *s1, *s2;
        MD5_CTX ctx1, ctx2;
        unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
+       int cmp;
 
        s1 = kv1->k;
        s2 = kv2->k;
@@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv
        if (s1 == s2)
                return (0);
 
+       if (kv1->hint->status == HS_INITIALIZED &&
+           kv2->hint->status == HS_INITIALIZED) {
+               cmp = memcmp(kv1->hint->v.Rh.cached,
+                   kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
+               if (cmp != 0)
+                       return (cmp);
+       }
+
        memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
        memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
 
@@ -1011,6 +1029,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv
 
        MD5Final(hash1, &ctx1);
        MD5Final(hash2, &ctx2);
+
+       if (kv1->hint->status == HS_UNINITIALIZED)
+               randomcoll_init_hint(kv1, hash1);
+       if (kv2->hint->status == HS_UNINITIALIZED)
+               randomcoll_init_hint(kv2, hash2);
 
        return (memcmp(hash1, hash2, sizeof(hash1)));
 }

Modified: head/usr.bin/sort/coll.h
==============================================================================
--- head/usr.bin/sort/coll.h    Sat Apr 13 04:03:18 2019        (r346174)
+++ head/usr.bin/sort/coll.h    Sat Apr 13 04:42:17 2019        (r346175)
@@ -65,6 +65,17 @@ struct M_hint
 };
 
 /*
+ * Sort hint data for -R
+ *
+ * This stores the first 12 bytes of the digest rather than the full output to
+ * avoid increasing the size of the 'key_hint' object via the 'v' union.
+ */
+struct R_hint
+{
+       unsigned char            cached[12];
+};
+
+/*
  * Status of a sort hint object
  */
 typedef enum
@@ -83,6 +94,7 @@ struct key_hint
                struct n_hint           nh;
                struct g_hint           gh;
                struct M_hint           Mh;
+               struct R_hint           Rh;
        }                       v;
 };
 

Modified: head/usr.bin/sort/sort.c
==============================================================================
--- head/usr.bin/sort/sort.c    Sat Apr 13 04:03:18 2019        (r346174)
+++ head/usr.bin/sort/sort.c    Sat Apr 13 04:42:17 2019        (r346175)
@@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
                break;
        case 'R':
                sm->Rflag = true;
+               need_hint = true;
                need_random = true;
                break;
        case 'M':
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to