================
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+# A tool to automatically generate documentation for the config options of the
+# clang static analyzer by reading `AnalyzerOptions.def`.
+
+import argparse
+from collections import namedtuple
+from enum import Enum, auto
+import re
+import sys
+import textwrap
+
+
+# The following code implements a trivial parser for the narrow subset of C++
+# which is used in AnalyzerOptions.def. This supports the following features:
+# - ignores preprocessor directives, even if they are continued with \ at EOL
+# - ignores comments: both /* ... */ and // ...
+# - parses string literals (even if they contain \" escapes)
+# - concatenates adjacent string literals
+# - parses numbers even if they contain ' as a thousands separator
+# - recognizes MACRO(arg1, arg2, ..., argN) calls
+
+
class TT(Enum):
    """Kinds of lexical tokens produced by the tokenizer."""

    # Explicit values match what auto() would assign, starting from 1.
    number = 1
    ident = 2
    string = 3
    punct = 4
+
+
# Lexer table consulted by tokenize().  The patterns are tried in order and
# the first one that matches wins, so the relative order matters.  Entries
# whose kind is None (comments, preprocessor directives, whitespace) are
# consumed without producing a token.
TOKENS = [
    (re.compile(r"-?[0-9']+"), TT.number),  # ' is a C++14 digit separator
    (re.compile(r"\w+"), TT.ident),
    (re.compile(r'"([^\\"]|\\.)*"'), TT.string),  # supports \" escapes
    (re.compile(r"[(),]"), TT.punct),
    (re.compile(r"/\*((?!\*/).)*\*/", re.S), None),  # C-style comment
    (re.compile(r"//.*\n"), None),  # C++ style oneline comment
    (re.compile(r"#.*(\\\n.*)*(?<!\\)\n"), None),  # preprocessor directive
    (re.compile(r"\s+"), None),  # whitespace
]

# A single lexical token: `kind` is a TT member, `code` is the matched text.
Token = namedtuple("Token", "kind code")
+
+
def report_unexpected(s, pos):
    """Print a diagnostic to stderr for an unlexable character at `pos` in `s`.

    The line and column are derived from the prefix of `s` before `pos`; the
    appended placeholder makes the column number 1-based and guarantees that
    the split produces a final (possibly empty) line to measure.
    """
    prefix_lines = (s[:pos] + "X").split("\n")
    lineno = len(prefix_lines)
    col = len(prefix_lines[-1])
    message = "unexpected character %r in AnalyzerOptions.def at line %d column %d" % (
        s[pos],
        lineno,
        col,
    )
    print(message, file=sys.stderr)
+
+
def tokenize(s):
    """Split the source string `s` into a list of Token values.

    Lexemes whose TOKENS entry has kind None (comments, whitespace,
    preprocessor directives) are consumed silently.  A character that no
    pattern matches is reported on stderr and skipped, one character at a
    time, so tokenization always terminates.
    """
    tokens = []
    pos = 0
    length = len(s)
    while pos < length:
        match = None
        for regex, kind in TOKENS:
            match = regex.match(s, pos)
            if match:
                break
        if match:
            if kind is not None:
                tokens.append(Token(kind, match.group(0)))
            pos = match.end()
        else:
            report_unexpected(s, pos)
            pos += 1
    return tokens
+
+
def join_strings(tokens):
    """Return `tokens` with runs of adjacent string literals concatenated.

    Mirrors the C++ rule that juxtaposed string literals form a single
    string: when a string token directly follows another string token, the
    closing quote of the earlier one and the opening quote of the later one
    are dropped and the two are merged into one token.
    """
    merged = []
    for token in tokens:
        follows_string = (
            token.kind == TT.string and merged and merged[-1].kind == TT.string
        )
        if follows_string:
            previous = merged.pop()
            combined = previous.code[:-1] + token.code[1:]
            merged.append(Token(TT.string, combined))
        else:
            merged.append(token)
    return merged
+
+
# One parsed macro invocation: `name` is the macro identifier, `args` is the
# list of argument Tokens in source order (one token per argument).
MacroCall = namedtuple("MacroCall", "name args")
+
+
class State(Enum):
    """States of the state machine used for parsing the macro calls."""

    # Explicit values match what auto() would assign, starting from 1.
    init = 1
    after_ident = 2
    before_arg = 3
    after_arg = 4
+
+
def get_calls(tokens, macro_names):
    """Collect MacroCall records for invocations of the macros in `macro_names`.

    Implements a four-state machine (see State).  Each macro argument is
    expected to be a single token: before_arg consumes exactly one token as
    the argument, and after_arg only accepts ',' or ')'.  Any other token
    sequence falls into the final `else` branch, which abandons the
    partially collected call and resynchronizes from State.init.
    """
    state = State.init
    result = []
    current = None  # the MacroCall being collected, or None between calls
    for tok in tokens:
        if state == State.init and tok.kind == TT.ident and tok.code in macro_names:
            current = MacroCall(tok.code, [])
            state = State.after_ident
        elif state == State.after_ident and tok == Token(TT.punct, "("):
            state = State.before_arg
        elif state == State.before_arg:
            # Defensive: `current` should always be set in this state.
            if current is not None:
                current.args.append(tok)
                state = State.after_arg
        elif state == State.after_arg and tok.kind == TT.punct:
            if tok.code == ")":
                result.append(current)
                current = None
                state = State.init
            elif tok.code == ",":
                state = State.before_arg
        else:
            # Unexpected token for the current state: drop the in-progress
            # call (if any) and start scanning for the next macro name.
            current = None
            state = State.init
    return result
+
+
# The information will be extracted from calls to these two macros:
# #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL)
# #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC,
#                                              SHALLOW_VAL, DEEP_VAL)

# Recognized macro names mapped to their expected argument count; calls with
# a different number of (single-token) arguments are skipped downstream.
MACRO_NAMES_ARGCOUNTS = {
    "ANALYZER_OPTION": 5,
    "ANALYZER_OPTION_DEPENDS_ON_USER_MODE": 6,
}
+
+
def string_value(tok):
    """Return the textual content of a string token.

    Strips the surrounding quotes and collapses each backslash escape to
    the escaped character itself.

    Raises ValueError if `tok` is not a string token.
    """
    if tok.kind != TT.string:
        raise ValueError(f"expected a string token, got {tok.kind.name}")
    body = tok.code[1:-1]  # drop the surrounding quotes
    return re.sub(r"\\(.)", r"\1", body)
+
+
def cmdflag_to_rst_title(cmdflag_tok):
    """Render the command-line flag name of an option as an RST section title.

    Emits a `.. _analyzer-option-<flag>:` anchor (so other documents can
    cross-reference the option) followed by the flag name underlined with
    dashes.
    """
    flag = string_value(cmdflag_tok)
    parts = [
        f".. _analyzer-option-{flag}:",
        "",
        flag,
        "-" * len(flag),
        "",
        "",
    ]
    return "\n".join(parts)
+
+
def desc_to_rst_paragraphs(tok):
    """Convert an option description (a string token) into RST paragraphs.

    Two kinds of tail content get special treatment:
    - a trailing "Value: <list of accepted values>" sentence is split off
      into its own "Accepted values:" paragraph;
    - the example file content shown by ctu-invocation-list is rendered as
      a preformatted RST block appended after the wrapped paragraphs.
    """
    desc = string_value(tok)

    # A bare '*' would start inline emphasis in RST, so escape the one in
    # the ctu-max-nodes-* wildcard.
    desc = desc.replace("ctu-max-nodes-*", r"ctu-max-nodes-\*")

    trailer = ""
    value_match = re.search(r"(^|\s)Value:", desc)
    if value_match:
        paragraphs = [
            desc[: value_match.start()],
            "Accepted values:" + desc[value_match.end() :],
        ]
    else:
        example_match = re.search(r"\s*Example file.content:", desc)
        if example_match:
            paragraphs = [desc[: example_match.start()]]
            trailer = (
                "Example file content::\n\n  " + desc[example_match.end() :] + "\n\n"
            )
        else:
            paragraphs = [desc]

    filled = [textwrap.fill(p, width=80) for p in paragraphs if p.strip()]

    return "\n\n".join(filled + [""]) + trailer
+
+
def default_to_rst(tok):
    """Render a default-value token as RST text.

    Raises ValueError for token kinds that cannot appear as a default value.
    """
    if tok.kind == TT.number:
        # Drop C++14 digit separators: 1'000'000 -> 1000000.
        return tok.code.replace("'", "")
    if tok.kind == TT.ident:
        return tok.code
    if tok.kind == TT.string:
        # Make the empty string visible in the generated docs.
        return "(empty string)" if tok.code == '""' else tok.code
    raise ValueError(f"unexpected token as default value: {tok.kind.name}")
+
+
def defaults_to_rst_paragraph(defaults):
    """Render the default value(s) of an option as one RST paragraph.

    One value comes from ANALYZER_OPTION; a pair of values comes from
    ANALYZER_OPTION_DEPENDS_ON_USER_MODE (shallow mode first, then deep).

    Raises ValueError for any other number of values.
    """
    rendered = [default_to_rst(tok) for tok in defaults]

    if len(rendered) == 1:
        return f"Default value: {rendered[0]}\n\n"
    if len(rendered) == 2:
        shallow, deep = rendered
        return f"Default value: {shallow} (in shallow mode) / {deep} (in deep mode)\n\n"
    raise ValueError("unexpected count of default values: %d" % len(defaults))
+
+
def macro_call_to_rst_paragraphs(macro_call):
    """Produce the full RST documentation block for one parsed macro call.

    Returns the empty string when the call has an unexpected argument count
    (a sign that get_calls() mis-parsed it, e.g. a multi-token argument) or
    when one of the rendering helpers rejects a value — the latter case is
    reported on stderr.
    """
    expected = MACRO_NAMES_ARGCOUNTS[macro_call.name]
    if len(macro_call.args) != expected:
        return ""

    # Argument order: TYPE, NAME, CMDFLAG, DESC, then one or two defaults.
    # The unpacking cannot fail here because the count was just validated.
    _type, _name, cmdflag, desc, *defaults = macro_call.args
    try:
        title = cmdflag_to_rst_title(cmdflag)
        body = desc_to_rst_paragraphs(desc)
        defaults_paragraph = defaults_to_rst_paragraph(defaults)
    except ValueError as ve:
        print(ve.args[0], file=sys.stderr)
        return ""
    return title + body + defaults_paragraph
----------------
NagyDonat wrote:

I still feel that testing the error-reporting logic of the script (with an 
input where the script is expected to fail) would be overkill, because that 
logic is simple and straightforward, and I don't expect that future changes 
would break it accidentally.

On the other hand, I added some checks which would cause a test failure if 
changes in `AnalyzerOptions.def` made one of the content tweaks in the script 
obsolete -- because that part of the script *is* very fragile.

If you think that there are other parts of the code that could easily and 
silently break during normal evolution of the code, tell me and I'll add tests 
for them.

https://github.com/llvm/llvm-project/pull/135169
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to