On Sun, Nov 12, 2023 at 08:02:43PM -1000, Iain Sandoe wrote:
> This adds the ability to defer the validation of numeric attribute
> arguments until the sequence is parsed if the attribute being
> handled is one known to be 'clang form'.
> 
> We do this by considering the arguments to be strings regardless
> of content and defer the interpretation of those strings until the
> argument processing.
> 
> Since the C++ front end lexes tokens separately (and would report
> non-compliant numeric tokens before we can intercept them), we need
> to implement a small state machine to track the lexing of attributes.
> 
>       PR c++/109877
> 
> gcc/cp/ChangeLog:
> 
>       * parser.cc (enum clang_attr_state): New.
>       (cp_lexer_attribute_state): New.
>       (cp_lexer_new_main): Set initial attribute lexing state.
>       (cp_lexer_get_preprocessor_token): Handle lexing of clang-
>       form attributes.
>       (cp_parser_clang_attribute): Handle clang-form attributes.
>       (cp_parser_gnu_attribute_list): Switch to clang-form parsing
>       where needed.
>       * parser.h : New flag to signal lexing clang-form attributes.
> 
> Signed-off-by: Iain Sandoe <i...@sandoe.co.uk>
> ---
>  gcc/cp/parser.cc | 230 +++++++++++++++++++++++++++++++++++++++++++++--
>  gcc/cp/parser.h  |   3 +
>  2 files changed, 227 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
> index 5116bcb78f6..c12f473f2e3 100644
> --- a/gcc/cp/parser.cc
> +++ b/gcc/cp/parser.cc
> @@ -699,6 +699,91 @@ cp_lexer_handle_early_pragma (cp_lexer *lexer)
>  static cp_parser *cp_parser_new (cp_lexer *);
>  static GTY (()) cp_parser *the_parser;
>  
> +/* Context-sensitive parse-checking for clang-style attributes.  */
> +
> +enum clang_attr_state {
> +  CA_NONE = 0,
> +  CA_ATTR,
> +  CA_BR1, CA_BR2,
> +  CA_LIST,
> +  CA_LIST_ARGS,
> +  CA_IS_CA,
> +  CA_CA_ARGS,
> +  CA_LIST_CONT
> +};
> +
> +/* State machine tracking context of attribute lexing.  */
> +
> +static enum clang_attr_state
> +cp_lexer_attribute_state (cp_token& token, enum clang_attr_state attr_state)
> +{
> +  /* Implement a context-sensitive parser for clang attributes.
> +     We detect __attribute__((clang_style_attribute (ARGS))) and lex the
> +     args ARGS with the following differences from GNU attributes:
> +     (a) number-like values are lexed as strings [this allows lexing XX.YY.ZZ
> +        version numbers].
> +     (b) we concatenate strings, since clang attributes allow this too.  */
> +  switch (attr_state)
> +    {
> +    case CA_NONE:
> +      if (token.type == CPP_KEYWORD
> +       && token.keyword == RID_ATTRIBUTE)
> +     attr_state = CA_ATTR;
> +      break;
> +    case CA_ATTR:
> +      if (token.type == CPP_OPEN_PAREN)
> +     attr_state = CA_BR1;
> +      else
> +     attr_state = CA_NONE;
> +      break;
> +    case CA_BR1:
> +      if (token.type == CPP_OPEN_PAREN)
> +     attr_state = CA_BR2;
> +      else
> +     attr_state = CA_NONE;
> +      break;
> +    case CA_BR2:
> +      if (token.type == CPP_NAME)
> +     {
> +       tree identifier = (token.type == CPP_KEYWORD)
> +         /* For keywords, use the canonical spelling, not the
> +            parsed identifier.  */
> +         ? ridpointers[(int) token.keyword]
> +         : token.u.value;
> +       identifier = canonicalize_attr_name (identifier);
> +       if (attribute_clang_form_p (identifier))
> +         attr_state = CA_IS_CA;
> +       else
> +         attr_state = CA_LIST;
> +     }
> +      else
> +     attr_state = CA_NONE;
> +      break;
> +    case CA_IS_CA:
> +    case CA_LIST:
> +      if (token.type == CPP_COMMA)
> +     attr_state = CA_BR2; /* Back to the list outer.  */
> +      else if (token.type == CPP_OPEN_PAREN)
> +     attr_state = attr_state == CA_IS_CA ? CA_CA_ARGS
> +                                         : CA_LIST_ARGS;
> +      else
> +     attr_state = CA_NONE;
> +      break;
> +    case CA_CA_ARGS: /* We will special-case args in this state.  */
> +    case CA_LIST_ARGS:
> +      if (token.type == CPP_CLOSE_PAREN)
> +     attr_state = CA_LIST_CONT;
> +      break;
> +    case CA_LIST_CONT:
> +      if (token.type == CPP_COMMA)
> +     attr_state = CA_BR2; /* Back to the list outer.  */
> +      else
> +     attr_state = CA_NONE;
> +      break;
> +    }
> +  return attr_state;
> +}
> +
>  /* Create a new main C++ lexer, the lexer that gets tokens from the
>     preprocessor, and also create the main parser.  */
>  
> @@ -715,6 +800,9 @@ cp_lexer_new_main (void)
>    c_common_no_more_pch ();
>  
>    cp_lexer *lexer = cp_lexer_alloc ();
> +  lexer->lex_number_as_string_p = false;
> +  enum clang_attr_state attr_state = CA_NONE;
> +
>    /* Put the first token in the buffer.  */
>    cp_token *tok = lexer->buffer->quick_push (token);
>  
> @@ -738,8 +826,20 @@ cp_lexer_new_main (void)
>        if (tok->type == CPP_PRAGMA_EOL)
>       cp_lexer_handle_early_pragma (lexer);
>  
> +      attr_state = cp_lexer_attribute_state (*tok, attr_state);
>        tok = vec_safe_push (lexer->buffer, cp_token ());
> -      cp_lexer_get_preprocessor_token (C_LEX_STRING_NO_JOIN, tok);
> +      unsigned int flags = C_LEX_STRING_NO_JOIN;
> +      if (attr_state == CA_CA_ARGS)
> +     {
> +       /* We are handling a clang attribute;
> +          1. do not try to diagnose x.y.z as a bad number, it could be a
> +             version.
> +          2. join strings.  */
> +          flags = C_LEX_NUMBER_AS_STRING;
> +       lexer->lex_number_as_string_p = true;
> +     }
> +      cp_lexer_get_preprocessor_token (flags, tok);
> +      lexer->lex_number_as_string_p = false;
>      }
>  
>    lexer->next_token = lexer->buffer->address ();
> @@ -956,7 +1056,15 @@ cp_lexer_get_preprocessor_token (unsigned flags, 
> cp_token *token)
>  {
>    static int is_extern_c = 0;
>  
> -   /* Get a new token from the preprocessor.  */
> +   /* Get a new token from the preprocessor.
> +  int lex_flags = 0;
> +  if (lexer != NULL)
> +    {
> +      lex_flags = C_LEX_STRING_NO_JOIN;
> +      if (lexer->lex_number_as_string_p)
> +     lex_flags |= C_LEX_NUMBER_AS_STRING;
> +    }
> +  */

Why is this commented out?

>    token->type
>      = c_lex_with_flags (&token->u.value, &token->location, &token->flags,
>                       flags);
> @@ -29302,6 +29410,113 @@ cp_parser_gnu_attributes_opt (cp_parser* parser)
>    return attributes;
>  }
>  
> +/* Parse the arguments list for a clang attribute.   */
> +static tree
> +cp_parser_clang_attribute (cp_parser *parser, tree/*attr_id*/)
> +{
> +  /* Each entry can be :
> +     identifier
> +     identifier=N.MM.Z
> +     identifier="string"
> +     followed by ',' or ) for the last entry*/
> +
> +  matching_parens parens;
> +  if (!parens.require_open (parser))
> +    return NULL;
> +
> +  bool save_translate_strings_p = parser->translate_strings_p;
> +  parser->translate_strings_p = false;

This could be

  auto ts = make_temp_override (parser->translate_strings_p, false);

and then...

> +  tree attr_args = NULL_TREE;
> +  cp_token *token = cp_lexer_peek_token (parser->lexer);
> +  if (token->type == CPP_NAME
> +      && cp_lexer_peek_nth_token (parser->lexer, 2)->type == CPP_COMMA)
> +    {
> +      tree platf = token->u.value;
> +      cp_lexer_consume_token (parser->lexer);
> +      attr_args = tree_cons (NULL_TREE, platf, NULL_TREE);
> +    }
> +  else
> +    {
> +      error_at (token->location,
> +             "expected a platform name followed by %<,%>");
> +      cp_parser_skip_to_closing_parenthesis (parser,
> +                                          /*recovering=*/true,
> +                                          /*or_comma=*/false,
> +                                          /*consume_paren=*/true);
> +      parser->translate_strings_p = save_translate_strings_p;

you don't need this, and the one below.

> +      return error_mark_node;
> +    }
> +
> +  cp_lexer_consume_token (parser->lexer); /* consume the ',' */
> +  do
> +    {
> +      tree name = NULL_TREE;
> +      tree value = NULL_TREE;
> +
> +      token = cp_lexer_peek_token (parser->lexer);
> +      if (token->type == CPP_NAME)
> +     {
> +       name = token->u.value;
> +       cp_lexer_consume_token (parser->lexer);
> +     }
> +      else
> +     {
> +       /* FIXME: context-sensitive for that attrib.  */
> +       error_at (token->location, "expected an attribute keyword");
> +       cp_parser_skip_to_closing_parenthesis (parser,
> +                                              /*recovering=*/true,
> +                                              /*or_comma=*/false,
> +                                              /*consume_paren=*/true);
> +       attr_args = error_mark_node;
> +       break;
> +     }
> +
> +      if (cp_lexer_next_token_is (parser->lexer, CPP_EQ))
> +     {
> +       cp_lexer_consume_token (parser->lexer); /* eat the '=' */
> +       token = cp_lexer_peek_token (parser->lexer);
> +       if (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN)
> +           && cp_lexer_next_token_is_not (parser->lexer, CPP_COMMA))
> +         {
> +           value = token->u.value;
> +           /* ???: check for error mark and early-return?  */
> +           cp_lexer_consume_token (parser->lexer);
> +         }
> +       else
> +         {
> +           error_at (token->location, "expected a value");
> +           cp_parser_skip_to_closing_parenthesis (parser,
> +                                                  /*recovering=*/true,
> +                                                  /*or_comma=*/false,
> +                                                  /*consume_paren=*/true);
> +           attr_args = error_mark_node;
> +           break;
> +         }
> +     }
> +      else if (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN)
> +            && cp_lexer_next_token_is_not (parser->lexer, CPP_COMMA))
> +     {
> +       error_at (token->location, "expected %<,%>, %<=%> or %<)%>");
> +       cp_parser_skip_to_closing_parenthesis (parser,
> +                                              /*recovering=*/true,
> +                                              /*or_comma=*/false,
> +                                              /*consume_paren=*/true);
> +       attr_args = error_mark_node;
> +       break;
> +     }
> +    if (cp_lexer_next_token_is (parser->lexer, CPP_COMMA))
> +      cp_lexer_consume_token (parser->lexer);
> +    tree t = tree_cons (value, name, NULL_TREE);
> +    attr_args = chainon (attr_args, t);

attr_chainon

> +  } while (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN));

I think we format do-while as

  do
    {
    }
  while (cond);

> +
> +  parser->translate_strings_p = save_translate_strings_p;
> +  if (!parens.require_close (parser))
> +    return error_mark_node;
> +
> +  return attr_args;
> +}
> +
>  /* Parse a GNU attribute-list.
>  
>     attribute-list:
> @@ -29361,9 +29576,12 @@ cp_parser_gnu_attribute_list (cp_parser* parser, 
> bool exactly_one /* = false */)
>  
>         /* Peek at the next token.  */
>         token = cp_lexer_peek_token (parser->lexer);
> -       /* If it's an `(', then parse the attribute arguments.  */
> -       if (token->type == CPP_OPEN_PAREN)
> +       if (token->type == CPP_OPEN_PAREN
> +           && attribute_clang_form_p (identifier))
> +         arguments = cp_parser_clang_attribute (parser, identifier);
> +       else if (token->type == CPP_OPEN_PAREN)

I think it'd be nice to eliminate checking token->type == CPP_OPEN_PAREN
twice here, and I'd say it's worth it wrapping attribute_clang_form_p (...)
in UNLIKELY (), since it should be extremely rare to be true?

Marek

Reply via email to