MonetDB: default - Implemented Unicode character string literals...

Sjoerd Mullender Fri, 23 Nov 2018 07:12:01 -0800

Changeset: 9f1b1d94b70d for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9f1b1d94b70d
Modified Files:
        sql/ChangeLog
        sql/server/sql_parser.y
        sql/server/sql_scan.c
Branch: default
Log Message:


Implemented Unicode character string literals and Unicode delimited identifiers.
I.e. U&'...' '...'... UESCAPE '...' and U&"..." UESCAPE '...'.
In the former case, the extra '...' strings still undergo C-style
backslash interpretation (incorrectly).


diffs (truncated from 330 to 300 lines):

diff --git a/sql/ChangeLog b/sql/ChangeLog
--- a/sql/ChangeLog
+++ b/sql/ChangeLog
@@ -2,6 +2,12 @@
 # This file is updated with Maddlog
 
 * Fri Nov 23 2018 Sjoerd Mullender <sjo...@acm.org>
+- Implemented U&'...' Unicode character string literals and
+  U&"..." Unicode delimited identifiers, including UESCAPE.  For the
+  string literals, you can have U&'...' '...' '...' UESCAPE '...' where
+  the escape must be as single character and the other '...' strings
+  are also Unicode character string literals.  For now, these latter
+  strings also undergo C-style backslash interpretation.
 - Implemented PostgreSQL-like E'...' strings.  The strings can contain
   C-style backslash escapes.  The old format strings '...' currently
   still also accept C-style escapes, but that feature will be removed
diff --git a/sql/server/sql_parser.y b/sql/server/sql_parser.y
--- a/sql/server/sql_parser.y
+++ b/sql/server/sql_parser.y
@@ -93,6 +93,64 @@ UTF8_strlen(const char *val)
        return pos;
 }
 
+
+static char *
+uescape_xform(char *restrict s, const char *restrict esc)
+{
+       size_t i, j;
+
+       for (i = j = 0; s[i]; i++) {
+               if (s[i] == *esc) {
+                       if (s[i + 1] == *esc) {
+                               s[j++] = *esc;
+                               i++;
+                       } else {
+                               int c = 0;
+                               int n;
+                               if (s[i + 1] == '+') {
+                                       n = 6;
+                                       i++;
+                               } else {
+                                       n = 4;
+                               }
+                               do {
+                                       i++;
+                                       c <<= 4;
+                                       if ('0' <= s[i] && s[i] <= '9')
+                                               c |= s[i] - '0';
+                                       else if ('a' <= s[i] && s[i] <= 'f')
+                                               c |= s[i] - 'a' + 10;
+                                       else if ('A' <= s[i] && s[i] <= 'F')
+                                               c |= s[i] - 'A' + 10;
+                                       else
+                                               return NULL;
+                               } while (--n > 0);
+                               if (c == 0 || c > 0x10FFFF || (c & 0xFFF800) == 
0xD800)
+                                       return NULL;
+                               if (c < 0x80) {
+                                       s[j++] = c;
+                               } else {
+                                       if (c < 0x800) {
+                                               s[j++] = 0xC0 | (c >> 6);
+                                       } else {
+                                               if (c < 0x10000) {
+                                                       s[j++] = 0xE0 | (c >> 
12);
+                                               } else {
+                                                       s[j++] = 0xF0 | (c >> 
18);
+                                                       s[j++] = 0x80 | ((c >> 
12) & 0x3F);
+                                               }
+                                               s[j++] = 0x80 | ((c >> 6) & 
0x3F);
+                                       }
+                                       s[j++] = 0x80 | (c & 0x3F);
+                               }
+                       }
+               } else {
+                       s[j++] = s[i];
+               }
+       }
+       s[j] = 0;
+       return s;
+}
 %}
 /* KNOWN NOT DONE OF sql'99
  *
@@ -319,6 +377,7 @@ int yydebug=1;
        opt_constraint_name
        non_reserved_word
        ident
+       ident_or_uident
        calc_ident
        authorization_identifier
        func_ident
@@ -332,6 +391,8 @@ int yydebug=1;
        opt_using
        opt_null_string
        string
+       sstring
+       ustring
        type_alias
        varchar
        clob
@@ -347,6 +408,7 @@ int yydebug=1;
        XML_namespace_prefix
        XML_PI_target
        function_body
+       opt_uescape
 
 %type <l>
        passwd_schema
@@ -530,12 +592,12 @@ int yydebug=1;
        opt_nulls_first_last
        tz
 
-%right <sval> STRING
+%right <sval> STRING USTRING
 %right <sval> X_BODY
 
 /* sql prefixes to avoid name clashes on various architectures */
 %token <sval>
-       IDENT aTYPE ALIAS AGGR AGGR2 RANK sqlINT OIDNUM HEXADECIMAL INTNUM 
APPROXNUM 
+       IDENT UIDENT aTYPE ALIAS AGGR AGGR2 RANK sqlINT OIDNUM HEXADECIMAL 
INTNUM APPROXNUM 
        USING 
        GLOBAL CAST CONVERT
        CHARACTER VARYING LARGE OBJECT VARCHAR CLOB sqlTEXT BINARY sqlBLOB
@@ -558,7 +620,7 @@ int yydebug=1;
 %token  UNCOMMITTED COMMITTED sqlREPEATABLE SERIALIZABLE DIAGNOSTICS sqlSIZE 
STORAGE
 
 %token <sval> ASYMMETRIC SYMMETRIC ORDER ORDERED BY IMPRINTS
-%token <operation> EXISTS ESCAPE HAVING sqlGROUP sqlNULL
+%token <operation> EXISTS ESCAPE UESCAPE HAVING sqlGROUP sqlNULL
 %token <operation> FROM FOR MATCH
 
 %token <operation> EXTRACT
@@ -4990,7 +5052,7 @@ literal:
                        YYABORT;
                  }
                }
- | IDENT string
+ | ident_or_uident string
                {
                  sql_type *t = mvc_bind_type(m, $1);
                  atom *a;
@@ -5363,7 +5425,7 @@ data_type:
                                sql_find_subtype(&$$, $1, $3, $5);
                          }
                        }
- | IDENT               {
+ | ident_or_uident     {
                          sql_type *t = mvc_bind_type(m, $1);
                          if (!t) {
                                char *msg = sql_message(SQLSTATE(22000) "Type 
(%s) unknown", $1);
@@ -5377,7 +5439,7 @@ data_type:
                          }
                        }
 
- | IDENT '(' nonzero ')'
+ | ident_or_uident '(' nonzero ')'
                        {
                          sql_type *t = mvc_bind_type(m, $1);
                          if (!t) {
@@ -5533,6 +5595,8 @@ authid:           restricted_ident ;
 
 calc_restricted_ident:
     IDENT      { $$ = $1; }
+ |  UIDENT opt_uescape
+               { $$ = uescape_xform($1, $2); }
  |  aTYPE      { $$ = $1; }
  |  ALIAS      { $$ = $1; }
  |  AGGR       { $$ = $1; }    /* without '(' */
@@ -5555,6 +5619,8 @@ restricted_ident:
 
 calc_ident:
     IDENT      { $$ = $1; }
+ |  UIDENT opt_uescape
+               { $$ = uescape_xform($1, $2); }
  |  aTYPE      { $$ = $1; }
  |  FILTER_FUNC        { $$ = $1; }
  |  ALIAS      { $$ = $1; }
@@ -5666,6 +5732,7 @@ non_reserved_word:
 |  NULLS       { $$ = sa_strdup(SA, "nulls"); }
 |  LAST                { $$ = sa_strdup(SA, "last"); }
 |  FIRST       { $$ = sa_strdup(SA, "first"); }
+|  UESCAPE     { $$ = sa_strdup(SA, "uescape"); }
 ;
 
 name_commalist:
@@ -5697,6 +5764,11 @@ lngval:
                  }
                }
 
+ident_or_uident:
+       IDENT                   { $$ = $1; }
+    |  UIDENT opt_uescape      { $$ = uescape_xform($1, $2); }
+    ;
+
 intval:
        sqlINT  
                {
@@ -5719,7 +5791,7 @@ intval:
                        YYABORT;
                  }
                }
- |     IDENT   {
+ |     ident_or_uident {
                  char *name = $1;
                  sql_subtype *tpe;
 
@@ -5755,16 +5827,50 @@ intval:
                }
  ;
 
-string:
+opt_uescape:
+/* empty */    { $$ = "\\"; }
+| UESCAPE string
+               { char *s = $2;
+                 if (strlen(s) != 1 || strchr("\"'0123456789abcdefABCDEF+ 
\t\n\r\f", *s) != NULL) {
+                       yyerror(m, SQLSTATE(22019) "UESCAPE must be one 
character");
+                       $$ = NULL;
+                       YYABORT;
+                 } else {
+                       $$ = s;
+                 }
+               }
+
+ustring:
+    USTRING
+               { $$ = $1; }
+ |  USTRING sstring
+               { char *s = strconcat($1,$2);
+                 $$ = sa_strdup(SA, s);
+                 _DELETE(s);
+               }
+ ;
+
+sstring:
     STRING
                { $$ = $1; }
- |  STRING string
-               { char *s = strconcat($1,$2); 
-                 $$ = sa_strdup(SA, s);        
+ |  STRING sstring
+               { char *s = strconcat($1,$2);
+                 $$ = sa_strdup(SA, s);
                  _DELETE(s);
                }
  ;
 
+string:
+   sstring     { $$ = $1; }
+ | ustring opt_uescape
+               { $$ = uescape_xform($1, $2);
+                 if ($$ == NULL) {
+                       yyerror(m, SQLSTATE(22019) "Bad Unicode string");
+                       YYABORT;
+                 }
+               }
+ ;
+
 exec:
      execute exec_ref
                {
diff --git a/sql/server/sql_scan.c b/sql/server/sql_scan.c
--- a/sql/server/sql_scan.c
+++ b/sql/server/sql_scan.c
@@ -174,6 +174,7 @@ scanner_init_keywords(void)
        failed += keywords_insert("DROP", DROP);
        failed += keywords_insert("ESCAPE", ESCAPE);
        failed += keywords_insert("EXISTS", EXISTS);
+       failed += keywords_insert("UESCAPE", UESCAPE);
        failed += keywords_insert("EXTRACT", EXTRACT);
        failed += keywords_insert("FLOAT", sqlFLOAT);
        failed += keywords_insert("FOR", FOR);
@@ -1142,10 +1143,17 @@ tokenize(mvc * c, int cur)
                } else if (iswdigit(cur)) {
                        return number(c, cur);
                } else if (iswalpha(cur) || cur == '_') {
-                       if (cur == 'E' &&
+                       if ((cur == 'E' || cur == 'e') &&
                            lc->rs->buf[lc->rs->pos + lc->yycur] == '\'') {
                                return scanner_string(c, scanner_getc(lc), 
true);
                        }
+                       if ((cur == 'U' || cur == 'u') &&
+                           lc->rs->buf[lc->rs->pos + lc->yycur] == '&' &&
+                           (lc->rs->buf[lc->rs->pos + lc->yycur + 1] == '\'' ||
+                            lc->rs->buf[lc->rs->pos + lc->yycur + 1] == '"')) {
+                               cur = scanner_getc(lc); /* '&' */
+                               return scanner_string(c, scanner_getc(lc), 
false);
+                       }
                        return keyword_or_ident(c, cur);
                } else if (iswpunct(cur)) {
                        return scanner_symbol(c, cur);
@@ -1230,7 +1238,7 @@ sql_get_next_token(YYSTYPE *yylval, void
        else if (token == STRING) {
                char quote = *yylval->sval;
                char *str = sa_alloc( c->sa, (lc->yycur-lc->yysval-2)*2 + 1 );
-               assert(quote == '"' || quote == '\'' || quote == 'E');
+               assert(quote == '"' || quote == '\'' || quote == 'E' || quote 
== 'e' || quote == 'U' || quote == 'u');
 
                lc->rs->buf[lc->rs->pos + lc->yycur - 1] = 0;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: default - Implemented Unicode character string literals...

Reply via email to