The GLSL Language Specification (version 4.30.6) is quite clear about the GLSL character set and the expected behavior for other characters:
Section 3.1 Character Set The source character set used for the OpenGL shading languages, outside of comments, is a subset of UTF-8. It includes the following characters: The letters a-z, A-Z, and the underscore ( _ ). The numbers 0-9. The symbols period (.), plus (+), dash (-), slash (/), asterisk (*), percent (%), angled brackets (< and >), square brackets ( [ and ] ), parentheses ( ( and ) ), braces ( { and } ), caret (^), vertical bar (|), ampersand (&), tilde (~), equals (=), exclamation point (!), colon (:), semicolon (;), comma (,), and question mark (?). The number sign (#) for preprocessor use. The backslash (\) as the line-continuation character when used as the last character of a line, just before a new line. White space: the space character, horizontal tab, vertical tab, form feed, carriage-return, and line-feed. A compile-time error will be given if any other character is used outside a comment. By taking the set of all possible 8-bit characters, and subtracting the above, we have the set of illegal characters: 0x00 - 0x08 (^@ - ^H) 0x0E - 0x1F (^N - ^Z, ^[, ^\, ^], ^^, ^_) 0x22 (") 0x24 ($) 0x27 (') 0x40 (@) 0x60 (`) 0x7F (DEL or ^?) 0x80 - 0xFF (non-ASCII) As well as (#) outside of uses defined by the preprocessor (not starting a directive, nor as part of a legal paste operator in a replacement list), and (\) appearing anywhere but at the end of a line. So instead of the previous whitelist we had for "OTHER" characters, we now add a blacklist for "ILLEGAL" characters based on the above, and then use a simple regular expression of "." to catch any characters that get past the blacklist. This approach also means the internal-error rule with "." can no longer be matched, so it goes away now. v2: Instead of emitting the error as soon as the illegal character is lexed, we now emit an ILLEGAL token to the parser. 
This allows the parser to allow the character as part of the replacement list of a macro, (since these are specified to allow any character). However, if such a macro is actually instantiated, the parser will emit an error when it goes to print the illegal character as part of the preprocessed output. --- src/glsl/glcpp/glcpp-lex.l | 32 +++++++++++--------------------- src/glsl/glcpp/glcpp-parse.y | 25 ++++++++++++++++++------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/glsl/glcpp/glcpp-lex.l b/src/glsl/glcpp/glcpp-lex.l index 0dbdab0..0482c4e 100644 --- a/src/glsl/glcpp/glcpp-lex.l +++ b/src/glsl/glcpp/glcpp-lex.l @@ -175,15 +175,7 @@ HASH # IDENTIFIER [_a-zA-Z][_a-zA-Z0-9]* PP_NUMBER [.]?[0-9]([._a-zA-Z0-9]|[eEpP][-+])* PUNCTUATION [][(){}.&*~!/%<>^|;,=+-] - -/* The OTHER class is simply a catch-all for things that the CPP -parser just doesn't care about. Since flex regular expressions that -match longer strings take priority over those matching shorter -strings, we have to be careful to avoid OTHER matching and hiding -something that CPP does care about. So we simply exclude all -characters that appear in any other expressions. */ - -OTHER [^][_#[:space:]#a-zA-Z0-9(){}.&*~!/%<>^|;,=+-] +ILLEGAL [\x00-\x08\x0E-\x1F"$'@`\x7F\x80-\xFF\\] DIGITS [0-9][0-9]* DECIMAL_INTEGER [1-9][0-9]*[uU]? @@ -276,9 +268,10 @@ HEXADECIMAL_INTEGER 0[xX][0-9a-fA-F]+[uU]? * token. */ if (parser->first_non_space_token_this_line) { BEGIN HASH; + RETURN_TOKEN_NEVER_SKIP (HASH_TOKEN); + } else { + RETURN_STRING_TOKEN (ILLEGAL); } - - RETURN_TOKEN_NEVER_SKIP (HASH_TOKEN); } <HASH>version{HSPACE}+ { @@ -505,8 +498,8 @@ HEXADECIMAL_INTEGER 0[xX][0-9a-fA-F]+[uU]? RETURN_TOKEN (yytext[0]); } -{OTHER}+ { - RETURN_STRING_TOKEN (OTHER); +{ILLEGAL} { + RETURN_STRING_TOKEN (ILLEGAL); } {HSPACE} { @@ -539,14 +532,7 @@ HEXADECIMAL_INTEGER 0[xX][0-9a-fA-F]+[uU]? 
RETURN_TOKEN (NEWLINE); } - /* This is a catch-all to avoid the annoying default flex action which - * matches any character and prints it. If any input ever matches this - * rule, then we have made a mistake above and need to fix one or more - * of the preceding patterns to match that input. */ - -<*>. { - glcpp_error(yylloc, yyextra, "Internal compiler error: Unexpected character: %s", yytext); - +<UNREACHABLE>. { /* We don't actually use the UNREACHABLE start condition. We only have this block here so that we can pretend to call some generated functions, (to avoid "defined but not used" @@ -557,6 +543,10 @@ HEXADECIMAL_INTEGER 0[xX][0-9a-fA-F]+[uU]? } } +<*>. { + RETURN_STRING_TOKEN (OTHER); +} + %% void diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y index e2e8aca..afca990 100644 --- a/src/glsl/glcpp/glcpp-parse.y +++ b/src/glsl/glcpp/glcpp-parse.y @@ -171,11 +171,11 @@ add_builtin_define(glcpp_parser_t *parser, const char *name, int value); /* We use HASH_TOKEN, DEFINE_TOKEN and VERSION_TOKEN (as opposed to * HASH, DEFINE, and VERSION) to avoid conflicts with other symbols, * (such as the <HASH> and <DEFINE> start conditions in the lexer). 
*/ -%token COMMA_FINAL DEFINED ELIF_EXPANDED HASH_TOKEN DEFINE_TOKEN FUNC_IDENTIFIER OBJ_IDENTIFIER ELIF ELSE ENDIF ERROR_TOKEN IF IFDEF IFNDEF LINE PRAGMA UNDEF VERSION_TOKEN GARBAGE IDENTIFIER IF_EXPANDED INTEGER INTEGER_STRING LINE_EXPANDED NEWLINE OTHER PLACEHOLDER SPACE PLUS_PLUS MINUS_MINUS +%token COMMA_FINAL DEFINED ELIF_EXPANDED HASH_TOKEN DEFINE_TOKEN FUNC_IDENTIFIER OBJ_IDENTIFIER ELIF ELSE ENDIF ERROR_TOKEN IF IFDEF IFNDEF LINE PRAGMA UNDEF VERSION_TOKEN GARBAGE IDENTIFIER IF_EXPANDED ILLEGAL INTEGER INTEGER_STRING LINE_EXPANDED NEWLINE OTHER PLACEHOLDER SPACE PLUS_PLUS MINUS_MINUS %token PASTE %type <ival> INTEGER operator SPACE integer_constant %type <expression_value> expression -%type <str> IDENTIFIER FUNC_IDENTIFIER OBJ_IDENTIFIER INTEGER_STRING OTHER ERROR_TOKEN PRAGMA +%type <str> IDENTIFIER FUNC_IDENTIFIER OBJ_IDENTIFIER ILLEGAL INTEGER_STRING OTHER ERROR_TOKEN PRAGMA %type <string_list> identifier_list %type <token> preprocessing_token conditional_token %type <token_list> pp_tokens replacement_list text_line conditional_tokens @@ -692,6 +692,12 @@ preprocessing_token: $$ = _token_create_str (parser, IDENTIFIER, $1); $$->location = yylloc; } + /* We allow illegal characters at this point, but gripe + * if these ever actually get printed in the output. */ +| ILLEGAL { + $$ = _token_create_str (parser, ILLEGAL, $1); + $$->location = yylloc; + } | INTEGER_STRING { $$ = _token_create_str (parser, INTEGER_STRING, $1); $$->location = yylloc; @@ -1118,7 +1124,7 @@ _token_list_equal_ignoring_space (token_list_t *a, token_list_t *b) } static void -_token_print (char **out, size_t *len, token_t *token) +_token_print (glcpp_parser_t *parser, char **out, size_t *len, token_t *token) { if (token->type < 256) { ralloc_asprintf_rewrite_tail (out, len, "%c", token->type); @@ -1176,6 +1182,9 @@ _token_print (char **out, size_t *len, token_t *token) case PLACEHOLDER: /* Nothing to print. 
*/ break; + case ILLEGAL: + glcpp_error (&token->location, parser, "Illegal character '%s'", token->value.str); + break; default: assert(!"Error: Don't know how to print token."); break; @@ -1303,9 +1312,9 @@ _token_paste (glcpp_parser_t *parser, token_t *token, token_t *other) FAIL: glcpp_error (&token->location, parser, ""); ralloc_asprintf_rewrite_tail (&parser->info_log, &parser->info_log_length, "Pasting \""); - _token_print (&parser->info_log, &parser->info_log_length, token); + _token_print (parser, &parser->info_log, &parser->info_log_length, token); ralloc_asprintf_rewrite_tail (&parser->info_log, &parser->info_log_length, "\" and \""); - _token_print (&parser->info_log, &parser->info_log_length, other); + _token_print (parser, &parser->info_log, &parser->info_log_length, other); ralloc_asprintf_rewrite_tail (&parser->info_log, &parser->info_log_length, "\" does not give a valid preprocessing token.\n"); return token; @@ -1319,8 +1328,10 @@ _token_list_print (glcpp_parser_t *parser, token_list_t *list) if (list == NULL) return; - for (node = list->head; node; node = node->next) - _token_print (&parser->output, &parser->output_length, node->token); + for (node = list->head; node; node = node->next) { + _token_print (parser, &parser->output, + &parser->output_length, node->token); + } } void -- 2.0.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev