On Sun, Nov 12, 2023 at 08:02:43PM -1000, Iain Sandoe wrote: > This adds the ability to defer the validation of numeric attribute > arguments until the sequence is parsed if the attribute being > handled is one known to be 'clang form'. > > We do this by considering the arguments to be strings regardless > of content and defer the interpretation of those strings until the > argument processing. > > Since the C++ front end lexes tokens separately (and would report > non-compliant numeric tokens before we can intercept them), we need > to implement a small state machine to track the lexing of attributes. > > PR c++/109877 > > gcc/cp/ChangeLog: > > * parser.cc (enum clang_attr_state): New. > (cp_lexer_attribute_state): New. > (cp_lexer_new_main): Set initial attribute lexing state. > (cp_lexer_get_preprocessor_token): Handle lexing of clang- > form attributes. > (cp_parser_clang_attribute): Handle clang-form attributes. > (cp_parser_gnu_attribute_list): Switch to clang-form parsing > where needed. > * parser.h : New flag to signal lexing clang-form attributes. > > Signed-off-by: Iain Sandoe <i...@sandoe.co.uk> > --- > gcc/cp/parser.cc | 230 +++++++++++++++++++++++++++++++++++++++++++++-- > gcc/cp/parser.h | 3 + > 2 files changed, 227 insertions(+), 6 deletions(-) > > diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc > index 5116bcb78f6..c12f473f2e3 100644 > --- a/gcc/cp/parser.cc > +++ b/gcc/cp/parser.cc > @@ -699,6 +699,91 @@ cp_lexer_handle_early_pragma (cp_lexer *lexer) > static cp_parser *cp_parser_new (cp_lexer *); > static GTY (()) cp_parser *the_parser; > > +/* Context-sensitive parse-checking for clang-style attributes. */ > + > +enum clang_attr_state { > + CA_NONE = 0, > + CA_ATTR, > + CA_BR1, CA_BR2, > + CA_LIST, > + CA_LIST_ARGS, > + CA_IS_CA, > + CA_CA_ARGS, > + CA_LIST_CONT > +}; > + > +/* State machine tracking context of attribute lexing. */ > + > +static enum clang_attr_state > +cp_lexer_attribute_state (cp_token& token, enum clang_attr_state attr_state) > +{ > + /* Implement a context-sensitive parser for clang attributes. > + We detect __attribute__((clang_style_attribute (ARGS))) and lex the > + args ARGS with the following differences from GNU attributes: > + (a) number-like values are lexed as strings [this allows lexing XX.YY.ZZ > + version numbers]. > + (b) we concatenate strings, since clang attributes allow this too. */ > + switch (attr_state) > + { > + case CA_NONE: > + if (token.type == CPP_KEYWORD > + && token.keyword == RID_ATTRIBUTE) > + attr_state = CA_ATTR; > + break; > + case CA_ATTR: > + if (token.type == CPP_OPEN_PAREN) > + attr_state = CA_BR1; > + else > + attr_state = CA_NONE; > + break; > + case CA_BR1: > + if (token.type == CPP_OPEN_PAREN) > + attr_state = CA_BR2; > + else > + attr_state = CA_NONE; > + break; > + case CA_BR2: > + if (token.type == CPP_NAME) > + { > + tree identifier = (token.type == CPP_KEYWORD) > + /* For keywords, use the canonical spelling, not the > + parsed identifier. */ > + ? ridpointers[(int) token.keyword] > + : token.u.value; > + identifier = canonicalize_attr_name (identifier); > + if (attribute_clang_form_p (identifier)) > + attr_state = CA_IS_CA; > + else > + attr_state = CA_LIST; > + } > + else > + attr_state = CA_NONE; > + break; > + case CA_IS_CA: > + case CA_LIST: > + if (token.type == CPP_COMMA) > + attr_state = CA_BR2; /* Back to the list outer. */ > + else if (token.type == CPP_OPEN_PAREN) > + attr_state = attr_state == CA_IS_CA ? CA_CA_ARGS > + : CA_LIST_ARGS; > + else > + attr_state = CA_NONE; > + break; > + case CA_CA_ARGS: /* We will special-case args in this state. */ > + case CA_LIST_ARGS: > + if (token.type == CPP_CLOSE_PAREN) > + attr_state = CA_LIST_CONT; > + break; > + case CA_LIST_CONT: > + if (token.type == CPP_COMMA) > + attr_state = CA_BR2; /* Back to the list outer. */ > + else > + attr_state = CA_NONE; > + break; > + } > + return attr_state; > +} > + > /* Create a new main C++ lexer, the lexer that gets tokens from the > preprocessor, and also create the main parser. */ > > @@ -715,6 +800,9 @@ cp_lexer_new_main (void) > c_common_no_more_pch (); > > cp_lexer *lexer = cp_lexer_alloc (); > + lexer->lex_number_as_string_p = false; > + enum clang_attr_state attr_state = CA_NONE; > + > /* Put the first token in the buffer. */ > cp_token *tok = lexer->buffer->quick_push (token); > > @@ -738,8 +826,20 @@ cp_lexer_new_main (void) > if (tok->type == CPP_PRAGMA_EOL) > cp_lexer_handle_early_pragma (lexer); > > + attr_state = cp_lexer_attribute_state (*tok, attr_state); > tok = vec_safe_push (lexer->buffer, cp_token ()); > - cp_lexer_get_preprocessor_token (C_LEX_STRING_NO_JOIN, tok); > + unsigned int flags = C_LEX_STRING_NO_JOIN; > + if (attr_state == CA_CA_ARGS) > + { > + /* We are handling a clang attribute; > + 1. do not try to diagnose x.y.z as a bad number, it could be a > + version. > + 2. join strings. */ > + flags = C_LEX_NUMBER_AS_STRING; > + lexer->lex_number_as_string_p = true; > + } > + cp_lexer_get_preprocessor_token (flags, tok); > + lexer->lex_number_as_string_p = false; > } > > lexer->next_token = lexer->buffer->address (); > @@ -956,7 +1056,15 @@ cp_lexer_get_preprocessor_token (unsigned flags, > cp_token *token) > { > static int is_extern_c = 0; > > - /* Get a new token from the preprocessor. */ > + /* Get a new token from the preprocessor. > + int lex_flags = 0; > + if (lexer != NULL) > + { > + lex_flags = C_LEX_STRING_NO_JOIN; > + if (lexer->lex_number_as_string_p) > + lex_flags |= C_LEX_NUMBER_AS_STRING; > + } > + */
Why is this commented out? > token->type > = c_lex_with_flags (&token->u.value, &token->location, &token->flags, > flags); > @@ -29302,6 +29410,113 @@ cp_parser_gnu_attributes_opt (cp_parser* parser) > return attributes; > } > > +/* Parse the arguments list for a clang attribute. */ > +static tree > +cp_parser_clang_attribute (cp_parser *parser, tree/*attr_id*/) > +{ > + /* Each entry can be : > + identifier > + identifier=N.MM.Z > + identifier="string" > + followed by ',' or ) for the last entry*/ > + > + matching_parens parens; > + if (!parens.require_open (parser)) > + return NULL; > + > + bool save_translate_strings_p = parser->translate_strings_p; > + parser->translate_strings_p = false; This could be auto ts = make_temp_override (parser->translate_strings_p, false); and then... > + tree attr_args = NULL_TREE; > + cp_token *token = cp_lexer_peek_token (parser->lexer); > + if (token->type == CPP_NAME > + && cp_lexer_peek_nth_token (parser->lexer, 2)->type == CPP_COMMA) > + { > + tree platf = token->u.value; > + cp_lexer_consume_token (parser->lexer); > + attr_args = tree_cons (NULL_TREE, platf, NULL_TREE); > + } > + else > + { > + error_at (token->location, > + "expected a platform name followed by %<,%>"); > + cp_parser_skip_to_closing_parenthesis (parser, > + /*recovering=*/true, > + /*or_comma=*/false, > + /*consume_paren=*/true); > + parser->translate_strings_p = save_translate_strings_p; you don't need this, and the one below. > + return error_mark_node; > + } > + > + cp_lexer_consume_token (parser->lexer); /* consume the ',' */ > + do > + { > + tree name = NULL_TREE; > + tree value = NULL_TREE; > + > + token = cp_lexer_peek_token (parser->lexer); > + if (token->type == CPP_NAME) > + { > + name = token->u.value; > + cp_lexer_consume_token (parser->lexer); > + } > + else > + { > + /* FIXME: context-sensitive for that attrib. */ > + error_at (token->location, "expected an attribute keyword"); > + cp_parser_skip_to_closing_parenthesis (parser, > + /*recovering=*/true, > + /*or_comma=*/false, > + /*consume_paren=*/true); > + attr_args = error_mark_node; > + break; > + } > + > + if (cp_lexer_next_token_is (parser->lexer, CPP_EQ)) > + { > + cp_lexer_consume_token (parser->lexer); /* eat the '=' */ > + token = cp_lexer_peek_token (parser->lexer); > + if (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN) > + && cp_lexer_next_token_is_not (parser->lexer, CPP_COMMA)) > + { > + value = token->u.value; > + /* ???: check for error mark and early-return? */ > + cp_lexer_consume_token (parser->lexer); > + } > + else > + { > + error_at (token->location, "expected a value"); > + cp_parser_skip_to_closing_parenthesis (parser, > + /*recovering=*/true, > + /*or_comma=*/false, > + /*consume_paren=*/true); > + attr_args = error_mark_node; > + break; > + } > + } > + else if (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN) > + && cp_lexer_next_token_is_not (parser->lexer, CPP_COMMA)) > + { > + error_at (token->location, "expected %<,%>, %<=%> or %<)%>"); > + cp_parser_skip_to_closing_parenthesis (parser, > + /*recovering=*/true, > + /*or_comma=*/false, > + /*consume_paren=*/true); > + attr_args = error_mark_node; > + break; > + } > + if (cp_lexer_next_token_is (parser->lexer, CPP_COMMA)) > + cp_lexer_consume_token (parser->lexer); > + tree t = tree_cons (value, name, NULL_TREE); > + attr_args = chainon (attr_args, t); attr_chainon > + } while (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN)); I think we format do-while as do { } while (cond); > + > + parser->translate_strings_p = save_translate_strings_p; > + if (!parens.require_close (parser)) > + return error_mark_node; > + > + return attr_args; > +} > + > /* Parse a GNU attribute-list. > > attribute-list: > @@ -29361,9 +29576,12 @@ cp_parser_gnu_attribute_list (cp_parser* parser, > bool exactly_one /* = false */) > > /* Peek at the next token. */ > token = cp_lexer_peek_token (parser->lexer); > - /* If it's an `(', then parse the attribute arguments. */ > - if (token->type == CPP_OPEN_PAREN) > + if (token->type == CPP_OPEN_PAREN > + && attribute_clang_form_p (identifier)) > + arguments = cp_parser_clang_attribute (parser, identifier); > + else if (token->type == CPP_OPEN_PAREN) I think it'd be nice to eliminate checking token->type == CPP_OPEN_PAREN twice here, and I'd say it's worth it wrapping attribute_clang_form_p (...) in UNLIKELY (), since it should be extremely rare to be true? Marek