Donát Nagy <donat.n...@ericsson.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/135...@github.com>
================
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3
+# A tool to automatically generate documentation for the config options of the
+# clang static analyzer by reading `AnalyzerOptions.def`.
+
+import argparse
+from collections import namedtuple
+from enum import Enum, auto
+import re
+import sys
+import textwrap
+
+
+# The following code implements a trivial parser for the narrow subset of C++
+# which is used in AnalyzerOptions.def. This supports the following features:
+# - ignores preprocessor directives, even if they are continued with \ at EOL
+# - ignores comments: both /* ... */ and // ...
+# - parses string literals (even if they contain \" escapes)
+# - concatenates adjacent string literals
+# - parses numbers even if they contain ' as a thousands separator
+# - recognizes MACRO(arg1, arg2, ..., argN) calls
+
+
+class TT(Enum):
+    "Token type enum."
+    number = auto()
+    ident = auto()
+    string = auto()
+    punct = auto()
+
+
+TOKENS = [
+    (re.compile(r"-?[0-9']+"), TT.number),
+    (re.compile(r"\w+"), TT.ident),
+    (re.compile(r'"([^\\"]|\\.)*"'), TT.string),
+    (re.compile(r"[(),]"), TT.punct),
+    (re.compile(r"/\*((?!\*/).)*\*/", re.S), None),  # C-style comment
+    (re.compile(r"//.*\n"), None),  # C++ style oneline comment
+    (re.compile(r"#.*(\\\n.*)*(?<!\\)\n"), None),  # preprocessor directive
+    (re.compile(r"\s+"), None),  # whitespace
+]
+
+Token = namedtuple("Token", "kind code")
+
+
+class ErrorHandler:
+    def __init__(self):
+        self.seen_errors = False
+
+        # This script uses some heuristical tweaks to modify the documentation
+        # of some analyzer options. As this code is fragile, we record the use
+        # of these tweaks and report them if they become obsolete:
+        self.unused_tweaks = [
+            "ctu-max-nodes-*",
+            "accepted values",
+            "example file content",
+        ]
+
+    def record_use_of_tweak(self, tweak_name):
+        try:
+            self.unused_tweaks.remove(tweak_name)
+        except ValueError:
+            pass
+
+    def report_error(self, msg):
+        print("Error:", msg, file=sys.stderr)
+        self.seen_errors = True
+
+    def report_unexpected_char(self, s, pos):
+        lines = (s[:pos] + "X").split("\n")
+        lineno, col = (len(lines), len(lines[-1]))
+        self.report_error(
+            "unexpected character %r in AnalyzerOptions.def at line %d column %d"
+            % (s[pos], lineno, col),
+        )
+
+    def report_unused_tweaks(self):
+        if not self.unused_tweaks:
+            return
+        _is = " is" if len(self.unused_tweaks) == 1 else "s are"
+        names = ", ".join(self.unused_tweaks)
+        self.report_error(f"textual tweak{_is} unused in script: {names}")
+
+
+err_handler = ErrorHandler()
+
+
+def tokenize(s):
+    result = []
+    pos = 0
+    while pos < len(s):
+        for regex, kind in TOKENS:
+            if m := regex.match(s, pos):
+                if kind is not None:
+                    result.append(Token(kind, m.group(0)))
+                pos = m.end()
+                break
+        else:
+            err_handler.report_unexpected_char(s, pos)
+            pos += 1
+    return result
+
+
+def join_strings(tokens):
+    result = []
+    for tok in tokens:
+        if tok.kind == TT.string and result and result[-1].kind == TT.string:
+            # If this token is a string, and the previous non-ignored token is
+            # also a string, then merge them into a single token. We need to
+            # discard the closing " of the previous string and the opening " of
+            # this string.
+            prev = result.pop()
+            result.append(Token(TT.string, prev.code[:-1] + tok.code[1:]))
+        else:
+            result.append(tok)
+    return result
+
+
+MacroCall = namedtuple("MacroCall", "name args")
+
+
+class State(Enum):
+    "States of the state machine used for parsing the macro calls."
+    init = auto()
+    after_ident = auto()
+    before_arg = auto()
+    after_arg = auto()
+
+
+def get_calls(tokens, macro_names):
+    state = State.init
+    result = []
+    current = None
+    for tok in tokens:
+        if state == State.init and tok.kind == TT.ident and tok.code in macro_names:
+            current = MacroCall(tok.code, [])
+            state = State.after_ident
+        elif state == State.after_ident and tok == Token(TT.punct, "("):
+            state = State.before_arg
+        elif state == State.before_arg:
+            if current is not None:
+                current.args.append(tok)
+            state = State.after_arg
+        elif state == State.after_arg and tok.kind == TT.punct:
+            if tok.code == ")":
+                result.append(current)
+                current = None
+                state = State.init
+            elif tok.code == ",":
+                state = State.before_arg
+        else:
+            current = None
+            state = State.init
+    return result
+
+
+# The information will be extracted from calls to these two macros:
+# #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL)
+# #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC,
+#                                              SHALLOW_VAL, DEEP_VAL)
+
+MACRO_NAMES_ARGCOUNTS = {
+    "ANALYZER_OPTION": 5,
+    "ANALYZER_OPTION_DEPENDS_ON_USER_MODE": 6,
+}
+
+
+def string_value(tok):
+    if tok.kind != TT.string:
+        raise ValueError(f"expected a string token, got {tok.kind.name}")
+    text = tok.code[1:-1]  # Remove quotes
+    text = re.sub(r"\\(.)", r"\1", text)  # Resolve backslash escapes
+    return text
+
+
+def cmdflag_to_rst_title(cmdflag_tok):
+    text = string_value(cmdflag_tok)
+    underline = "-" * len(text)
+    ref = f".. _analyzer-option-{text}:"
+
+    return f"{ref}\n\n{text}\n{underline}\n\n"
+
+
+def desc_to_rst_paragraphs(tok):
+    base_desc = string_value(tok)
+
+    # Escape a star that would act as inline emphasis within RST.
+    desc = base_desc.replace("ctu-max-nodes-*", r"ctu-max-nodes-\*")
+    if desc != base_desc:
+        err_handler.record_use_of_tweak("ctu-max-nodes-*")
----------------
steakhal wrote:

If this is about escaping, then couldn't we just replace `*` characters with `\*` without knowing the string itself?

https://github.com/llvm/llvm-project/pull/135169
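To make the question above concrete, the blanket-escaping alternative could look roughly like the sketch below. This is not code from the patch: the helper name `escape_rst_stars` is invented for illustration, and it assumes every bare `*` in an option description should be neutralized.

    def escape_rst_stars(text: str) -> str:
        # Escape every "*" so it cannot open inline emphasis in the
        # generated RST, without hard-coding known strings such as
        # "ctu-max-nodes-*".
        return text.replace("*", r"\*")

    # Example: the description fragment that motivated the tweak.
    print(escape_rst_stars("the ctu-max-nodes-* family of options"))
    # Output: the ctu-max-nodes-\* family of options

The trade-off is that such a blanket replacement would also escape a star that is meant to render as emphasis; presumably that is why the patch keys on the one known string and records its use through `ErrorHandler`, so the tweak is reported once it becomes obsolete.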