Bruno Haible wrote: > A large part of them are in comments. I would suggest to exempt comments > (since comments are not code). Like this: > > # A sed expression that removes ANSI C and ISO C99 comments. > # Taken from GNU gettext's 'moopp' preprocessor. > sed_remove_comments=" > /[/][/*]/{ > ta > :a > > s,^\\(\\([^\"'/]\\|\"\\([^\\\"]\\|[\\].\\)*\"\\|'\\([^\\']\\|[\\].\\)*'\\|[/][^\"'/*]\\|[/]\"\\([^\\\"]\\|[\\].\\)*\"\\|[/]'\\([^\\']\\|[\\].\\)*'\\)*\\)//.*,\\1, > te > > s,^\\(\\([^\"'/]\\|\"\\([^\\\"]\\|[\\].\\)*\"\\|'\\([^\\']\\|[\\].\\)*'\\|[/][^\"'/*]\\|[/]\"\\([^\\\"]\\|[\\].\\)*\"\\|[/]'\\([^\\']\\|[\\].\\)*'\\)*\\)/[*]\\([^*]\\|[*][^/*]\\)*[*][*]*/,\\1 > , > ta > > /^\\([^\"'/]\\|\"\\([^\\\"]\\|[\\].\\)*\"\\|'\\([^\\']\\|[\\].\\)*'\\|[/][^\"'/*]\\|[/]\"\\([^\\\"]\\|[\\].\\)*\"\\|[/]'\\([^\\']\\|[\\].\\)*'\\)*[/][*]/{ > > s,^\\(\\([^\"'/]\\|\"\\([^\\\"]\\|[\\].\\)*\"\\|'\\([^\\']\\|[\\].\\)*'\\|[/][^\"'/*]\\|[/]\"\\([^\\\"]\\|[\\].\\)*\"\\|[/]'\\([^\\']\\|[\\].\\)*'\\)*\\)/[*].*,\\1 > , > tu > :u > n > s,^\\([^*]\\|[*][^/*]\\)*[*][*]*/,, > tv > s,^.*\$,, > bu > :v > } > :e > }" > > $ for file in `git ls-files | grep '\.[ch]$'`; do > sed -e "$sed_remove_comments" < $file | grep '[[:alnum:]](' | grep -vE '^ > *(\* |#|({ )?/\*)' | sed -e "s|^|$file:|" > done
I like that. But prefer to avoid the long lines and duplication. It's hard enough to read long regexps, but when the same 100-byte subexpression appears so many times... In the patch below, I've factored that, unquoted and re-quoted for make and added semicolons. > This leaves 916 occurrences, which is more reasonable (attached below). > > The occurrences in strings should also be excluded. > > Out of these, I would be willing to fix those which refer to function names > and C keyword such as 'if' or 'switch'. > > Regarding macro names, such as in > INTUSE(__dcgettext) (domain, msgid, LC_MESSAGES) > CONCAT(TABLE,_get) (struct TABLE *t, uint32_t wc) > I would prefer to keep it as is: The lack of a space indicates that the > macro is not function-like; it is rearranging or concatenating tokens. Sounds reasonable. Here's a change to incorporate your sed command into the syntax-check rule in coreutils: >From 3512e80e8801df079568cda2714f2ff53d617812 Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@redhat.com> Date: Sat, 20 Mar 2010 21:05:24 +0100 Subject: [PATCH] cfg.mk: remove comments with sed rather than cpp -fpreprocessed * cfg.mk (_sed_remove_comments): Define. (sc_space_before_open_paren): Adapt. Based on suggestion and code from Bruno Haible. --- cfg.mk | 61 +++++++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 45 insertions(+), 16 deletions(-) diff --git a/cfg.mk b/cfg.mk index 0ce1c61..06080f7 100644 --- a/cfg.mk +++ b/cfg.mk @@ -261,27 +261,56 @@ sc_prohibit_sleep: msg='prefer xnanosleep over other sleep interfaces' \ $(_prohibit_regexp) +########################################################### +_p0 = \([^"'/]\|"\([^\"]\|[\].\)*"\|'\([^\']\|[\].\)*' +_pre = $(_p0)\|[/][^"'/*]\|[/]"\([^\"]\|[\].\)*"\|[/]'\([^\']\|[\].\)*'\)* +_pre_anchored = ^\($(_pre)\) +_comment_and_close = [^*]\|[*][^/*]\)*[*][*]*/ +# help font-lock mode: ' + +# A sed expression that removes ANSI C and ISO C99 comments. +# Derived from the one in GNU gettext's 'moopp' preprocessor. +_sed_remove_comments = \ +/[/][/*]/{ \ + ta; \ + :a; \ + s,$(_pre_anchored)//.*,\1,; \ + te; \ + s,$(_pre_anchored)/[*]\($(_comment_and_close),\1 ,; \ + ta; \ + /^$(_pre)[/][*]/{ \ + s,$(_pre_anchored)/[*].*,\1 ,; \ + tu; \ + :u; \ + n; \ + s,^\($(_comment_and_close),,; \ + tv; \ + s,^.*$$,,; \ + bu; \ + :v; \ + }; \ + :e; \ +} +# Quote all single quotes. +_sed_rm_comments_q = $(subst ','\'',$(_sed_remove_comments)) +# help font-lock mode: ' + _space_before_paren_exempt =? \\n\\$$ _space_before_paren_exempt = \ - (\\n\\$$|%s\(to %s|delimit-method|(date|group|character)\(s\)) + (^ *\#|\\n\\$$|%s\(to %s|delimit-method|(date|group|character)\(s\)) # Ensure that there is a space before each open parenthesis in C code. sc_space_before_open_paren: @if $(VC_LIST_EXCEPT) | grep -l '\.[ch]$$' > /dev/null; then \ - if (cpp -fpreprocessed < /dev/null > /dev/null 2>&1); then \ - fail=0; \ - for c in $$($(VC_LIST_EXCEPT) | grep '\.[ch]$$'); do \ - cpp -fpreprocessed $$c 2>/dev/null \ - | grep -ni '[[:alnum:]](' \ - | grep -vE '$(_space_before_paren_exempt)' \ - | grep . && { fail=1; echo "*** $$c"; }; \ - done; \ - test $$fail = 1 && \ - { echo '$(ME): the above files lack a space-before-open-paren' \ - 1>&2; exit 1; } || :; \ - else \ - echo '$(ME): skipping test $@: cpp -fpreprocessed does not work' \ - 1>&2; \ - fi; \ + fail=0; \ + for c in $$($(VC_LIST_EXCEPT) | grep '\.[ch]$$'); do \ + sed '$(_sed_rm_comments_q)' $$c 2>/dev/null \ + | grep -i '[[:alnum:]](' \ + | grep -vE '$(_space_before_paren_exempt)' \ + | grep . && { fail=1; echo "*** $$c"; }; \ + done; \ + test $$fail = 1 && \ + { echo '$(ME): the above files lack a space-before-open-paren' \ + 1>&2; exit 1; } || :; \ else :; \ fi -- 1.7.0.2.455.g91132