Changeset: b8a26dda67ae for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/b8a26dda67ae
Modified Files:
        gdk/gdk_string.c
Branch: ascii-flag
Log Message:

Use the casefold tables to compare strings in GDKstrcasecmp and GDKstrncasecmp.


diffs (298 lines):

diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -6850,97 +6850,151 @@ GDKstrncasecmp(const char *str1, const c
 {
        const uint8_t *s1 = (const uint8_t *) str1;
        const uint8_t *s2 = (const uint8_t *) str2;
+       const uint8_t *x1 = NULL, *x2 = NULL;
        int n1, n2;
        int v1, v2;
 
        for (;;) {
                /* check for the end */
-               if (l1 == 0 || l2 == 0)
+               if ((x1 == NULL && l1 == 0) || (x2 == NULL && l2 == 0))
                        return 0;
-               if (*s1 == 0)
-                       return -(*s2 != 0);
-               if (*s2 == 0)
+               if (x1 == NULL && *s1 == 0)
+                       return -(x2 != NULL || *s2 != 0);
+               if (x2 == NULL && *s2 == 0)
                        return 1;
-               v1 = lowercase[*s1++];
-               n1 = 1;
-               l1--;
-               while (v1 && l1 > 0 && (*s1 & 0xC0) == 0x80) {
-                       assert(n1 < 4);
-                       v1 = lowercase[v1 + *s1++];
-                       n1++;
+
+               /* get next character from str1 */
+               if (x1 == NULL) {
+                       v1 = casefold[*s1++];
+                       n1 = 1;
                        l1--;
-               }
-               if (v1 == 0) {
-                       while (l1 > 0 && (*s1 & 0xC0) == 0x80) {
+                       while (v1 && l1 > 0 && (*s1 & 0xC0) == 0x80) {
                                assert(n1 < 4);
+                               v1 = casefold[v1 + *s1++];
                                n1++;
-                               s1++;
                                l1--;
                        }
+                       if (v1 == 0) {
+                               while (l1 > 0 && (*s1 & 0xC0) == 0x80) {
+                                       assert(n1 < 4);
+                                       n1++;
+                                       s1++;
+                                       l1--;
+                               }
+                       } else if (v1 < 0) {
+                               x1 = (const uint8_t *) specialcase[-v1];
+                               v1 = 0;
+                       }
                }
-               v2 = lowercase[*s2++];
-               n2 = 1;
-               l2--;
-               while (v2 && l2 > 0 && (*s2 & 0xC0) == 0x80) {
-                       assert(n2 < 4);
-                       v2 = lowercase[v2 + *s2++];
-                       n2++;
+               if (x1 != NULL) {
+                       n1 = 1;
+                       while ((*++x1 & 0xC0) == 0x80)
+                               n1++;
+               }
+
+               /* get next character from str2 */
+               if (x2 == NULL) {
+                       v2 = casefold[*s2++];
+                       n2 = 1;
                        l2--;
-               }
-               if (v2 == 0) {
-                       while (l2 > 0 && (*s2 & 0xC0) == 0x80) {
+                       while (v2 && l2 > 0 && (*s2 & 0xC0) == 0x80) {
                                assert(n2 < 4);
+                               v2 = casefold[v2 + *s2++];
                                n2++;
-                               s2++;
                                l2--;
                        }
+                       if (v2 == 0) {
+                               while (l2 > 0 && (*s2 & 0xC0) == 0x80) {
+                                       assert(n2 < 4);
+                                       n2++;
+                                       s2++;
+                                       l2--;
+                               }
+                       } else if (v2 < 0) {
+                               x2 = (const uint8_t *) specialcase[-v2];
+                               v2 = 0;
+                       }
                }
+               if (x2 != NULL) {
+                       n2 = 1;
+                       while ((*++x2 & 0xC0) == 0x80)
+                               n2++;
+               }
+
+               /* At this point, if x1 != NULL (then v1 == 0), it
+                * points to the end of a sequence of length n1 that is
+                * (part of) the first string to be compared and if *x1
+                * != 0, it points to the next character to be compared
+                * (in the next iteration, else we continue with s1);
+                * else if v1 == 0, s1 points to the end of a sequence
+                * of length n1 that is (part of) the string to be
+                * compared; else v1 is the codepoint to be compared.
+                * In any case, s1 points to the start of the next
+                * character to be compared (after x1 is exhausted).
+                * The value in l1 is the remaining length of the first
+                * string (i.e. what s1 points to).  The same for x2,
+                * s2, n2, l2, and v2. */
+
+               /* compare */
                if (v1 == 0) {
                        if (v2 == 0) {
                                /* neither converted */
                                if (n1 == n2) {
                                        /* at least the same length, so simple 
strncmp */
-                                       n1 = strncmp((const char *) s1 - n1, 
(const char *) s2 - n2, n1);
+                                       n1 = strncmp((const char *) (x1?x1:s1) 
- n1, (const char *) (x2?x2:s2) - n2, n1);
                                        if (n1 != 0)
                                                return n1;
                                        /* still equal */
-                               } else if (n1 < n2) {
-                                       /* sequence in s1 is shorter, so s1 < 
s2 */
-                                       return -1;
                                } else {
-                                       /* sequence in s1 is longer, so s1 > s2 
*/
-                                       return 1;
+                                       /* length is leading: shorter
+                                        * sequences come before longer
+                                        * ones */
+                                       return n1 - n2;
                                }
                        } else {
                                switch (n1) {
                                case 1:
-                                       if (v2 >= 0x7F)
+                                       if (v2 >= 0x80)
                                                return -1;
-                                       if (s1[-1] != v2)
-                                               return (s1[-1] > v2) - (s1[-1] 
< v2);
+                                       if (x1) {
+                                               if (x1[-1] != v2)
+                                                       return (x1[-1] > v2) - 
(x1[-1] < v2);
+                                       } else {
+                                               if (s1[-1] != v2)
+                                                       return (s1[-1] > v2) - 
(s1[-1] < v2);
+                                       }
                                        break;
                                case 2:
                                        if (v2 < 0x80)
                                                return 1;
-                                       else if (v2 >= 0x7FF)
+                                       else if (v2 >= 0x800)
                                                return -1;
-                                       v1 = ((s1[-2] & 0x1F) << 6) | (s1[-1] & 
0x3F);
+                                       if (x1)
+                                               v1 = ((x1[-2] & 0x1F) << 6) | 
(x1[-1] & 0x3F);
+                                       else
+                                               v1 = ((s1[-2] & 0x1F) << 6) | 
(s1[-1] & 0x3F);
                                        if (v1 != v2)
                                                return (v1 > v2) - (v1 < v2);
                                        break;
                                case 3:
                                        if (v2 < 0x800)
                                                return 1;
-                                       else if (v2 >= 0xFFFF)
+                                       else if (v2 >= 0x10000)
                                                return -1;
-                                       v1 = ((s1[-3] & 0x0F) << 12) | ((s1[-2] 
& 0x3F) << 6) | (s1[-1] & 0x3F);
+                                       if (x1)
+                                               v1 = ((x1[-3] & 0x0F) << 12) | 
((x1[-2] & 0x3F) << 6) | (x1[-1] & 0x3F);
+                                       else
+                                               v1 = ((s1[-3] & 0x0F) << 12) | 
((s1[-2] & 0x3F) << 6) | (s1[-1] & 0x3F);
                                        if (v1 != v2)
                                                return (v1 > v2) - (v1 < v2);
                                        break;
                                case 4:
                                        if (v2 < 0x10000)
                                                return 1;
-                                       v1 = ((s1[-4] & 0x07) << 18) | ((s1[-3] 
& 0x3F) << 12) | ((s1[-2] & 0x3F) << 6) | (s1[-1] & 0x3F);
+                                       if (x1)
+                                               v1 = ((x1[-4] & 0x07) << 18) | 
((x1[-3] & 0x3F) << 12) | ((x1[-2] & 0x3F) << 6) | (x1[-1] & 0x3F);
+                                       else
+                                               v1 = ((s1[-4] & 0x07) << 18) | 
((s1[-3] & 0x3F) << 12) | ((s1[-2] & 0x3F) << 6) | (s1[-1] & 0x3F);
                                        if (v1 != v2)
                                                return (v1 > v2) - (v1 < v2);
                                        break;
@@ -6948,50 +7002,66 @@ GDKstrncasecmp(const char *str1, const c
                                        MT_UNREACHABLE();
                                }
                        }
-               } else {
-                       if (v2 == 0) {
-                               switch (n2) {
-                               case 1:
-                                       if (v1 >= 0x7F)
-                                               return 1;
+               } else if (v2 == 0) {
+                       switch (n2) {
+                       case 1:
+                               if (v1 >= 0x80)
+                                       return 1;
+                               if (x2) {
+                                       if (x2[-1] != v1)
+                                               return (v1 > x2[-1]) - (v1 < 
x2[-1]);
+                               } else {
                                        if (s2[-1] != v1)
                                                return (v1 > s2[-1]) - (v1 < 
s2[-1]);
-                                       break;
-                               case 2:
-                                       if (v1 < 0x80)
-                                               return -1;
-                                       else if (v1 >= 0x7FF)
-                                               return 1;
+                               }
+                               break;
+                       case 2:
+                               if (v1 < 0x80)
+                                       return -1;
+                               else if (v1 >= 0x800)
+                                       return 1;
+                               if (x2)
+                                       v2 = ((x2[-2] & 0x1F) << 6) | (x2[-1] & 
0x3F);
+                               else
                                        v2 = ((s2[-2] & 0x1F) << 6) | (s2[-1] & 
0x3F);
-                                       if (v1 != v2)
-                                               return (v1 > v2) - (v1 < v2);
-                                       break;
-                               case 3:
-                                       if (v1 < 0x800)
-                                               return -1;
-                                       else if (v1 >= 0xFFFF)
-                                               return 1;
+                               if (v1 != v2)
+                                       return (v1 > v2) - (v1 < v2);
+                               break;
+                       case 3:
+                               if (v1 < 0x800)
+                                       return -1;
+                               else if (v1 >= 0x10000)
+                                       return 1;
+                               if (x2)
+                                       v2 = ((x2[-3] & 0x0F) << 12) | ((x2[-2] 
& 0x3F) << 6) | (x2[-1] & 0x3F);
+                               else
                                        v2 = ((s2[-3] & 0x0F) << 12) | ((s2[-2] 
& 0x3F) << 6) | (s2[-1] & 0x3F);
-                                       if (v1 != v2)
-                                               return (v1 > v2) - (v1 < v2);
-                                       break;
-                               case 4:
-                                       if (v1 < 0x10000)
-                                               return -1;
+                               if (v1 != v2)
+                                       return (v1 > v2) - (v1 < v2);
+                               break;
+                       case 4:
+                               if (v1 < 0x10000)
+                                       return -1;
+                               if (x2)
+                                       v2 = ((x2[-4] & 0x07) << 18) | ((x2[-3] 
& 0x3F) << 12) | ((x2[-2] & 0x3F) << 6) | (x2[-1] & 0x3F);
+                               else
                                        v2 = ((s2[-4] & 0x07) << 18) | ((s2[-3] 
& 0x3F) << 12) | ((s2[-2] & 0x3F) << 6) | (s2[-1] & 0x3F);
-                                       if (v1 != v2)
-                                               return (v1 > v2) - (v1 < v2);
-                                       break;
-                               default:
-                                       MT_UNREACHABLE();
-                               }
-                       } else {
-                               if (v1 != v2) {
-                                       /* both converted and they're not equal 
*/
+                               if (v1 != v2)
                                        return (v1 > v2) - (v1 < v2);
-                               }
+                               break;
+                       default:
+                               MT_UNREACHABLE();
+                       }
+               } else {
+                       if (v1 != v2) {
+                               /* both converted and they're not equal */
+                               return (v1 > v2) - (v1 < v2);
                        }
                }
+               if (x1 != NULL && *x1 == 0)
+                       x1 = NULL;
+               if (x2 != NULL && *x2 == 0)
+                       x2 = NULL;
        }
 }
 
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to