Package: libmagic-mgc
Version: 1:5.30-1
Tags: patch
File: /usr/share/file/magic.mgc
Severity: wishlist

$ file /usr/include/c++/6/*
/usr/include/c++/6/algorithm:              C source, ASCII text
/usr/include/c++/6/array:                  C source, ASCII text
/usr/include/c++/6/atomic:                 C source, ASCII text
/usr/include/c++/6/backward:               directory
/usr/include/c++/6/bits:                   directory
/usr/include/c++/6/bitset:                 C source, ASCII text
/usr/include/c++/6/cassert:                C source, ASCII text
/usr/include/c++/6/ccomplex:               C source, ASCII text
/usr/include/c++/6/cctype:                 C source, ASCII text
/usr/include/c++/6/cerrno:                 C source, ASCII text
/usr/include/c++/6/cfenv:                  C source, ASCII text
/usr/include/c++/6/cfloat:                 C source, ASCII text
/usr/include/c++/6/chrono:                 C source, ASCII text
/usr/include/c++/6/cinttypes:              C source, ASCII text
/usr/include/c++/6/ciso646:                C source, ASCII text
/usr/include/c++/6/climits:                C source, ASCII text
/usr/include/c++/6/clocale:                C source, ASCII text
/usr/include/c++/6/cmath:                  C source, ASCII text
/usr/include/c++/6/codecvt:                C source, ASCII text
/usr/include/c++/6/complex:                C source, ASCII text
/usr/include/c++/6/complex.h:              C source, ASCII text
/usr/include/c++/6/condition_variable:     C source, ASCII text
/usr/include/c++/6/csetjmp:                C source, ASCII text
/usr/include/c++/6/csignal:                C source, ASCII text
/usr/include/c++/6/cstdalign:              ASCII text
/usr/include/c++/6/cstdarg:                C source, ASCII text
/usr/include/c++/6/cstdbool:               ASCII text
/usr/include/c++/6/cstddef:                C source, ASCII text
/usr/include/c++/6/cstdint:                C source, ASCII text
/usr/include/c++/6/cstdio:                 C source, ASCII text
/usr/include/c++/6/cstdlib:                C source, ASCII text
/usr/include/c++/6/cstring:                C source, ASCII text
/usr/include/c++/6/ctgmath:                C source, ASCII text
/usr/include/c++/6/ctime:                  C source, ASCII text
/usr/include/c++/6/cuchar:                 C source, ASCII text
/usr/include/c++/6/cwchar:                 C source, ASCII text
/usr/include/c++/6/cwctype:                C source, ASCII text
/usr/include/c++/6/cxxabi.h:               C source, ASCII text
/usr/include/c++/6/debug:                  directory
/usr/include/c++/6/decimal:                directory
/usr/include/c++/6/deque:                  C source, ASCII text
/usr/include/c++/6/exception:              C source, ASCII text
/usr/include/c++/6/experimental:           directory
/usr/include/c++/6/ext:                    directory
/usr/include/c++/6/fenv.h:                 C source, ASCII text
/usr/include/c++/6/forward_list:           C source, ASCII text
/usr/include/c++/6/fstream:                C source, ASCII text
/usr/include/c++/6/functional:             C source, ASCII text
/usr/include/c++/6/future:                 C source, UTF-8 Unicode text
/usr/include/c++/6/initializer_list:       C source, ASCII text
/usr/include/c++/6/iomanip:                C source, ASCII text
/usr/include/c++/6/ios:                    C source, ASCII text
/usr/include/c++/6/iosfwd:                 C source, ASCII text
/usr/include/c++/6/iostream:               C source, ASCII text
/usr/include/c++/6/istream:                C source, ASCII text
/usr/include/c++/6/iterator:               C source, ASCII text
/usr/include/c++/6/limits:                 C source, ASCII text
/usr/include/c++/6/list:                   C source, ASCII text
/usr/include/c++/6/locale:                 C source, ASCII text
/usr/include/c++/6/map:                    C source, ASCII text
/usr/include/c++/6/math.h:                 ASCII text
/usr/include/c++/6/memory:                 C++ source, ASCII text
/usr/include/c++/6/mutex:                  C source, ASCII text
/usr/include/c++/6/new:                    C source, ASCII text
/usr/include/c++/6/numeric:                C source, ASCII text
/usr/include/c++/6/ostream:                C source, ASCII text
/usr/include/c++/6/parallel:               directory
/usr/include/c++/6/profile:                directory
/usr/include/c++/6/queue:                  C source, ASCII text
/usr/include/c++/6/random:                 C source, ASCII text
/usr/include/c++/6/ratio:                  C source, ASCII text
/usr/include/c++/6/regex:                  C source, ASCII text
/usr/include/c++/6/scoped_allocator:       C source, ASCII text
/usr/include/c++/6/set:                    C source, ASCII text
/usr/include/c++/6/shared_mutex:           C source, ASCII text
/usr/include/c++/6/sstream:                C source, ASCII text
/usr/include/c++/6/stack:                  C source, ASCII text
/usr/include/c++/6/stdexcept:              C source, ASCII text
/usr/include/c++/6/stdlib.h:               ASCII text
/usr/include/c++/6/streambuf:              C source, ASCII text
/usr/include/c++/6/string:                 C source, ASCII text
/usr/include/c++/6/sun:                    directory
/usr/include/c++/6/system_error:           C source, ASCII text
/usr/include/c++/6/tgmath.h:               C source, ASCII text
/usr/include/c++/6/thread:                 C source, ASCII text
/usr/include/c++/6/tr1:                    directory
/usr/include/c++/6/tr2:                    directory
/usr/include/c++/6/tuple:                  C source, ASCII text
/usr/include/c++/6/type_traits:            C source, ASCII text
/usr/include/c++/6/typeindex:              C source, ASCII text
/usr/include/c++/6/typeinfo:               C source, ASCII text
/usr/include/c++/6/unordered_map:          C source, ASCII text
/usr/include/c++/6/unordered_set:          C source, ASCII text
/usr/include/c++/6/utility:                C source, ASCII text
/usr/include/c++/6/valarray:               C source, ASCII text
/usr/include/c++/6/vector:                 C source, ASCII text
$

file(1) identifies only one of those C++ headers (memory) as C++. The
issue seems to be that many of these headers put their declarations
inside a namespace and indent them, whereas the patterns in file's magic
database generally search for unindented keywords. Another issue seems
to be the common pattern of adding a line break directly after a
template specification.

After applying the attached patch, detection is slightly improved:
$ file -m ./magic.mgc /usr/include/c++/6/*
/usr/include/c++/6/algorithm:          C source, ASCII text
/usr/include/c++/6/array:              C source, ASCII text
/usr/include/c++/6/atomic:             C++ source, ASCII text
/usr/include/c++/6/backward:           directory
/usr/include/c++/6/bits:               directory
/usr/include/c++/6/bitset:             C source, ASCII text
/usr/include/c++/6/cassert:            C source, ASCII text
/usr/include/c++/6/ccomplex:           C source, ASCII text
/usr/include/c++/6/cctype:             C source, ASCII text
/usr/include/c++/6/cerrno:             C source, ASCII text
/usr/include/c++/6/cfenv:              C source, ASCII text
/usr/include/c++/6/cfloat:             C source, ASCII text
/usr/include/c++/6/chrono:             C++ source, ASCII text
/usr/include/c++/6/cinttypes:          C source, ASCII text
/usr/include/c++/6/ciso646:            C source, ASCII text
/usr/include/c++/6/climits:            C source, ASCII text
/usr/include/c++/6/clocale:            C source, ASCII text
/usr/include/c++/6/cmath:              C source, ASCII text
/usr/include/c++/6/codecvt:            C++ source, ASCII text
/usr/include/c++/6/complex:            C++ source, ASCII text
/usr/include/c++/6/complex.h:          C source, ASCII text
/usr/include/c++/6/condition_variable: C++ source, ASCII text
/usr/include/c++/6/csetjmp:            C source, ASCII text
/usr/include/c++/6/csignal:            C source, ASCII text
/usr/include/c++/6/cstdalign:          ASCII text
/usr/include/c++/6/cstdarg:            C source, ASCII text
/usr/include/c++/6/cstdbool:           ASCII text
/usr/include/c++/6/cstddef:            C source, ASCII text
/usr/include/c++/6/cstdint:            C source, ASCII text
/usr/include/c++/6/cstdio:             C source, ASCII text
/usr/include/c++/6/cstdlib:            C source, ASCII text
/usr/include/c++/6/cstring:            C source, ASCII text
/usr/include/c++/6/ctgmath:            C source, ASCII text
/usr/include/c++/6/ctime:              C source, ASCII text
/usr/include/c++/6/cuchar:             C source, ASCII text
/usr/include/c++/6/cwchar:             C source, ASCII text
/usr/include/c++/6/cwctype:            C source, ASCII text
/usr/include/c++/6/cxxabi.h:           C++ source, ASCII text
/usr/include/c++/6/debug:              directory
/usr/include/c++/6/decimal:            directory
/usr/include/c++/6/deque:              C source, ASCII text
/usr/include/c++/6/exception:          C++ source, ASCII text
/usr/include/c++/6/experimental:       directory
/usr/include/c++/6/ext:                directory
/usr/include/c++/6/fenv.h:             C source, ASCII text
/usr/include/c++/6/forward_list:       C source, ASCII text
/usr/include/c++/6/fstream:            C++ source, ASCII text
/usr/include/c++/6/functional:         C source, ASCII text
/usr/include/c++/6/future:             C++ source, UTF-8 Unicode text
/usr/include/c++/6/initializer_list:   C++ source, ASCII text
/usr/include/c++/6/iomanip:            C source, ASCII text
/usr/include/c++/6/ios:                C source, ASCII text
/usr/include/c++/6/iosfwd:             C source, ASCII text
/usr/include/c++/6/iostream:           C source, ASCII text
/usr/include/c++/6/istream:            C++ source, ASCII text
/usr/include/c++/6/iterator:           C source, ASCII text
/usr/include/c++/6/limits:             C source, ASCII text
/usr/include/c++/6/list:               C source, ASCII text
/usr/include/c++/6/locale:             C source, ASCII text
/usr/include/c++/6/map:                C source, ASCII text
/usr/include/c++/6/math.h:             ASCII text
/usr/include/c++/6/memory:             C++ source, ASCII text
/usr/include/c++/6/mutex:              C++ source, ASCII text
/usr/include/c++/6/new:                C++ source, ASCII text
/usr/include/c++/6/numeric:            C source, ASCII text
/usr/include/c++/6/ostream:            C++ source, ASCII text
/usr/include/c++/6/parallel:           directory
/usr/include/c++/6/profile:            directory
/usr/include/c++/6/queue:              C source, ASCII text
/usr/include/c++/6/random:             C source, ASCII text
/usr/include/c++/6/ratio:              C++ source, ASCII text
/usr/include/c++/6/regex:              C source, ASCII text
/usr/include/c++/6/scoped_allocator:   C++ source, ASCII text
/usr/include/c++/6/set:                C source, ASCII text
/usr/include/c++/6/shared_mutex:       C++ source, ASCII text
/usr/include/c++/6/sstream:            C++ source, ASCII text
/usr/include/c++/6/stack:              C source, ASCII text
/usr/include/c++/6/stdexcept:          C++ source, ASCII text
/usr/include/c++/6/stdlib.h:           ASCII text
/usr/include/c++/6/streambuf:          C++ source, ASCII text
/usr/include/c++/6/string:             C source, ASCII text
/usr/include/c++/6/sun:                directory
/usr/include/c++/6/system_error:       C++ source, ASCII text
/usr/include/c++/6/tgmath.h:           C source, ASCII text
/usr/include/c++/6/thread:             C++ source, ASCII text
/usr/include/c++/6/tr1:                directory
/usr/include/c++/6/tr2:                directory
/usr/include/c++/6/tuple:              C source, ASCII text
/usr/include/c++/6/type_traits:        C source, ASCII text
/usr/include/c++/6/typeindex:          C++ source, ASCII text
/usr/include/c++/6/typeinfo:           C++ source, ASCII text
/usr/include/c++/6/unordered_map:      C source, ASCII text
/usr/include/c++/6/unordered_set:      C source, ASCII text
/usr/include/c++/6/utility:            C++ source, ASCII text
/usr/include/c++/6/valarray:           C++ source, ASCII text
/usr/include/c++/6/vector:             C source, ASCII text
$

Most of the headers that are still not detected as C++ either only
#include other headers or have a very long comment blurb.

I'm unsure how much the attached patch costs in terms of misdetection
of other files and in terms of performance. Allowing end-of-line instead
of whitespace after a template specification should not hurt much, but
allowing whitespace at the start of lines, before the keywords, might
cause regressions.

In any case, I ask for improving C++ vs. C detection and propose the
attached patch as a basis.

A lot of the regex patterns end with a plus sign, i.e. a trailing "+"
quantifier as in [\ \t\n]+. Removing such a trailing plus sign changes
the matched region, but it never changes whether the regex matches.
Thus these plus signs can safely be removed, which might improve
performance.

Helmut
Index: file-5.30/magic/Magdir/c-lang
===================================================================
--- file-5.30.orig/magic/Magdir/c-lang
+++ file-5.30/magic/Magdir/c-lang
@@ -14,37 +14,37 @@
 0      regex   \^#include      C source text
 !:strength +25
 !:mime text/x-c
-0      regex   \^char[\ \t\n]+ C source text
+0      regex   \^[\ \t]*char[\ \t\n]+  C source text
 !:mime text/x-c
-0      regex   \^double[\ \t\n]+               C source text
+0      regex   \^[\ \t]*double[\ \t\n]+                C source text
 !:mime text/x-c
-0      regex   \^extern[\ \t\n]+               C source text
+0      regex   \^[\ \t]*extern[\ \t\n]+                C source text
 !:mime text/x-c
-0      regex   \^float[\ \t\n]+                C source text
+0      regex   \^[\ \t]*float[\ \t\n]+         C source text
 !:mime text/x-c
-0      regex   \^struct[\ \t\n]+               C source text
+0      regex   \^[\ \t]*struct[\ \t\n]+                C source text
 !:mime text/x-c
-0      regex   \^union[\ \t\n]+                C source text
+0      regex   \^[\ \t]*union[\ \t\n]+         C source text
 !:mime text/x-c
 0      search/8192     main(           C source text
 !:mime text/x-c
 
 # C++
 # The strength of these rules is increased so they beat the C rules above
-0      regex   \^template[\ \t]+<.*>[\ \t\n]+  C++ source text
+0      regex   \^[\ \t]*template[\ \t]+<.*>([\ \t\n]|$)        C++ source text
 !:strength + 30
 !:mime text/x-c++
-0      regex   \^virtual[\ \t\n]+              C++ source text
+0      regex   \^[\ \t]*virtual[\ \t\n]+               C++ source text
 !:strength + 30
 !:mime text/x-c++
-0      regex   \^class[\ \t\n]+                C++ source text
+0      regex   \^[\ \t]*class[\ \t\n]+         C++ source text
 # But class is reduced to avoid beating php (Jens Schleusener)
 !:strength + 13
 !:mime text/x-c++
-0      regex   \^public:               C++ source text
+0      regex   \^[\ \t]*public:                C++ source text
 !:strength + 30
 !:mime text/x-c++
-0      regex   \^private:              C++ source text
+0      regex   \^[\ \t]*private:               C++ source text
 !:strength + 30
 !:mime text/x-c++
 

Reply via email to