Changeset: d9dd394c0bde for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d9dd394c0bde
Modified Files:
        monetdb5/modules/mal/tablet.c
        sql/backends/monet5/sql_scenario.c
        sql/storage/bat/bat_storage.c
        
sql/test/BugTracker-2010/Tests/incomplete-utf8-sequence.Bug-2575.stable.err
        
sql/test/BugTracker-2018/Tests/sqlitelogictest-having-not-null-not-in.Bug-6557.stable.out
Branch: default
Log Message:

Merge with Apr2019 branch.


diffs (truncated from 590 to 300 lines):

diff --git a/monetdb5/modules/mal/tablet.c b/monetdb5/modules/mal/tablet.c
--- a/monetdb5/modules/mal/tablet.c
+++ b/monetdb5/modules/mal/tablet.c
@@ -1226,6 +1226,39 @@ SQLworkdivider(READERtask *task, READERt
  * If we end up with unfinished records, then the rowlimit will terminate the 
process.
  */
 
+typedef unsigned char (*dfa_t)[256];
+
+static dfa_t
+mkdfa(const unsigned char *sep, size_t seplen)
+{
+       dfa_t dfa;
+       size_t i, j, k;
+
+       dfa = GDKzalloc(seplen * sizeof(*dfa));
+       if (dfa == NULL)
+               return NULL;
+       /* Each character in the separator string advances the state by
+        * one.  If state reaches seplen, the separator was recognized.
+        *
+        * The first loop and the nested loop make sure that if in any
+        * state we encounter an invalid character, but part of what we've
+        * matched so far is a prefix of the separator, we go to the
+        * appropriate state. */
+       for (i = 0; i < seplen; i++)
+               dfa[i][sep[0]] = 1;
+       for (j = 0; j < seplen; j++) {
+               dfa[j][sep[j]] = j + 1;
+               for (k = 0; k < j; k++) {
+                       for (i = 0; i < j - k; i++)
+                               if (sep[k + i] != sep[i])
+                                       break;
+                       if (i == j - k && dfa[j][sep[i]] <= i)
+                               dfa[j][sep[i]] = i + 1;
+               }
+       }
+       return dfa;
+}
+
 static void
 SQLproducer(void *p)
 {
@@ -1239,12 +1272,21 @@ SQLproducer(void *p)
        const char *rsep = task->rsep;
        size_t rseplen = strlen(rsep), partial = 0;
        char quote = task->quote;
+       dfa_t rdfa;
+       lng rowno = 1;
 
        MT_sema_down(&task->producer);
        if (task->id < 0) {
                return;
        }
 
+       rdfa = mkdfa((const unsigned char *) rsep, rseplen);
+       if (rdfa == NULL) {
+               tablet_error(task, lng_nil, int_nil, "cannot allocate memory", 
"");
+               ateof[cur] = true;
+               goto reportlackofinput;
+       }
+
 #ifdef _DEBUG_TABLET_CNTRL
        mnstr_printf(GDKout, "#SQLproducer started size %zu len %zu\n",
                                 task->b->size, task->b->len);
@@ -1268,7 +1310,7 @@ SQLproducer(void *p)
                // warn the consumers
                if (ateof[cur] && partial) {
                        if (partial) {
-                               tablet_error(task, lng_nil, int_nil, 
"incomplete record at end of file", s);
+                               tablet_error(task, rowno, int_nil, "incomplete 
record at end of file", s);
                                task->b->pos += partial;
                        }
                        goto reportlackofinput;
@@ -1276,7 +1318,7 @@ SQLproducer(void *p)
 
                if (task->errbuf && task->errbuf[0]) {
                        if (GDKerrbuf && GDKerrbuf[0]) {
-                               tablet_error(task, lng_nil, int_nil, GDKerrbuf, 
"SQLload_file");
+                               tablet_error(task, rowno, int_nil, GDKerrbuf, 
"SQLload_file");
 #ifdef _DEBUG_TABLET_CNTRL
                                mnstr_printf(GDKout, "#bailout on SQLload 
%s\n", msg);
 #endif
@@ -1301,7 +1343,7 @@ SQLproducer(void *p)
                        /* the input buffer should be extended, but 'base' is 
not shared
                           between the threads, which we can not now update.
                           Mimick an ateof instead; */
-                       tablet_error(task, lng_nil, int_nil, "record too long", 
"");
+                       tablet_error(task, rowno, int_nil, "record too long", 
"");
                        ateof[cur] = true;
 #ifdef _DEBUG_TABLET_CNTRL
                        mnstr_printf(GDKout, "#bailout on SQLload confronted 
with too large record\n");
@@ -1335,111 +1377,67 @@ SQLproducer(void *p)
                         * user should supply the correct number of fields.
                         * In the first phase we simply break the lines at the
                         * record boundary. */
-                       if (quote == 0) {
-                               switch (rseplen) {
-                               case 1:
-                                       for (; *e; e++) {
-                                               if (*e == '\\') {
-                                                       if (*++e == 0)
-                                                               break;
-                                                       continue;
-                                               }
-                                               if (*e == *rsep)
-                                                       break;
-                                       }
-                                       break;
-                               case 2:
-                                       for (; *e; e++) {
-                                               if (*e == '\\') {
-                                                       if (*++e == 0)
-                                                               break;
-                                                       continue;
-                                               }
-                                               if (*e == *rsep && e[1] == 
rsep[1])
-                                                       break;
-                                       }
-                                       break;
-                               default:
-                                       for (; *e; e++) {
-                                               if (*e == '\\') {
-                                                       if (*++e == 0)
-                                                               break;
-                                                       continue;
-                                               }
-                                               if (*e == *rsep && strncmp(e, 
rsep, rseplen) == 0)
-                                                       break;
-                                       }
-                               }
-                               if (*e == 0) {
-                                       partial = e - s;
-                                       e = 0;  /* nonterminated record, we 
need more */
+                       int nutf = 0;
+                       int m = 0;
+                       bool bs = false;
+                       char q = 0;
+                       size_t i = 0;
+                       while (*e) {
+                               /* check for correctly encoded UTF-8 */
+                               if (nutf > 0) {
+                                       if ((*e & 0xC0) != 0x80)
+                                               goto badutf8;
+                                       if (m != 0 && (*e & m) == 0)
+                                               goto badutf8;
+                                       m = 0;
+                                       nutf--;
+                               } else if ((*e & 0xE0) == 0xC0) {
+                                       nutf = 1;
+                                       if ((e[0] & 0x1E) == 0)
+                                               goto badutf8;
+                               } else if ((*e & 0xF0) == 0xE0) {
+                                       nutf = 2;
+                                       if ((e[0] & 0x0F) == 0)
+                                               m = 0x20;
+                               } else if ((*e & 0xF8) == 0xF0) {
+                                       nutf = 3;
+                                       if ((e[0] & 0x07) == 0)
+                                               m = 0x30;
+                               } else if ((*e & 0x80) != 0) {
+                                       goto badutf8;
                                }
-                       } else {
-                               char q = 0;
-
-                               switch (rseplen) {
-                               case 1:
-                                       for (; *e; e++) {
-                                               if (*e == q)
-                                                       q = 0;
-                                               else if (*e == quote)
-                                                       q = *e;
-                                               else if (*e == '\\') {
-                                                       if (*++e == 0)
-                                                               break;
-                                               } else if (!q && *e == *rsep)
-                                                       break;
-                                       }
-                                       if (*e == 0) {
-                                               partial = e - s;
-                                               e = 0;  /* nonterminated 
record, we need more */
-                                       }
-                                       break;
-                               case 2:
-                                       for (; *e; e++) {
-                                               if (*e == q)
-                                                       q = 0;
-                                               else if (*e == quote)
-                                                       q = *e;
-                                               else if (*e == '\\') {
-                                                       if (e[1])
-                                                               e++;
-                                               } else if (!q && e[0] == 
rsep[0] && e[1] == rsep[1])
-                                                       break;
-                                       }
-                                       if (*e == 0) {
-                                               partial = e - s;
-                                               e = 0;  /* nonterminated 
record, we need more */
-                                       }
-                                       break;
-                               default:
-                                       for (; *e; e++) {
-                                               if (*e == q)
-                                                       q = 0;
-                                               else if (*e == quote)
-                                                       q = *e;
-                                               else if (*e == '\\') {
-                                                       if (*++e == 0)
-                                                               break;
-                                               } else if (!q && *e == *rsep && 
strncmp(e, rsep, rseplen) == 0)
-                                                       break;
-                                       }
-                                       if (*e == 0) {
-                                               partial = e - s;
-                                               e = 0;  /* nonterminated 
record, we need more */
-                                       }
+                               /* check for quoting and the row separator */
+                               if (bs) {
+                                       bs = false;
+                               } else if (*e == '\\') {
+                                       bs = true;
+                                       i = 0;
+                               } else if (*e == q) {
+                                       q = 0;
+                               } else if (*e == quote) {
+                                       q = quote;
+                                       i = 0;
+                               } else if (q == 0) {
+                                       i = rdfa[i][(unsigned char) *e];
+                                       if (i == rseplen)
+                                               break;
                                }
+                               e++;
+                       }
+                       if (*e == 0) {
+                               partial = e - s;
+                               e = NULL;               /* nonterminated 
record, we need more */
                        }
                        /* check for incomplete line and end of buffer 
condition */
                        if (e) {
+                               rowno++;
                                /* found a complete record, do we need to skip 
it? */
                                if (--task->skip < 0 && cnt < task->maxrow) {
                                        task->lines[cur][task->top[cur]++] = s;
                                        cnt++;
                                }
-                               *e = '\0';
-                               s = e + rseplen;
-                               e = s;
+                               *(e + 1 - rseplen) = 0;
+                               s = ++e;
                                task->b->pos += (size_t) (e - base);
                                base = e;
                                if (task->top[cur] == task->limit)
@@ -1448,7 +1446,7 @@ SQLproducer(void *p)
                                /* found an incomplete record, saved for next 
round */
                                if (s+partial < end) {
                                        /* found a EOS in the input */
-                                       tablet_error(task, lng_nil, int_nil, 
"record too long (EOS found)", "");
+                                       tablet_error(task, rowno, int_nil, 
"record too long (EOS found)", "");
                                        ateof[cur] = true;
                                        goto reportlackofinput;
                                }
@@ -1470,6 +1468,7 @@ SQLproducer(void *p)
                        /* then wait until it is done */
                        MT_sema_down(&task->producer);
                        if (cnt == task->maxrow) {
+                               GDKfree(rdfa);
                                return;
                        }
                } else {
@@ -1483,6 +1482,7 @@ SQLproducer(void *p)
                                MT_sema_down(&task->producer);
                                blocked[(cur + 1) % MAXBUFFERS] = false;
                                if (task->state == ENDOFCOPY) {
+                                       GDKfree(rdfa);
                                        return;
                                }
                        }
@@ -1507,6 +1507,7 @@ SQLproducer(void *p)
 #ifdef _DEBUG_TABLET_CNTRL
                                mnstr_printf(GDKout, "#Producer delivered 
all\n");
 #endif
+                               GDKfree(rdfa);
                                return;
                        }
                }
@@ -1518,6 +1519,7 @@ SQLproducer(void *p)
 #ifdef _DEBUG_TABLET_CNTRL
                        mnstr_printf(GDKout, "#Producer encountered eof\n");
 #endif
+                       GDKfree(rdfa);
                        return;
                }
                /* consumers ask us to stop? */
@@ -1527,6 +1529,7 @@ SQLproducer(void *p)
                                mnstr_printf(GDKout, "#SQL producer early exit 
%.63s\n",
                                                         task->b->buf + 
task->b->pos);
 #endif
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to