Hi Joe, This is good! I had something similar cooking - specifically for SCM validation.
My SCM attempt caught a few more issues: - check git branch if specified - check validity of "T:" entry, otherwise warn of malformed entry. Example malformed (current next has two instances): 9740 T: git://git.infradead.org/nvme.git Should be: 9740 T: git git://git.infradead.org/nvme.git Also - I believe you intended to warn on all bad SCM entries, not just newly discovered ones? Your change correctly detects a previously seen bad entry ($isbad), however the print is enclosed in an else, preventing output. I was going to inline these, but in my haste to understand and incorporate changes I sanitized whitespace (BTW - I see both tabs and spaces, which is preferred in this file?) The below git branch special-casing is for these: 567:T: git git://people.freedesktop.org/~airlied/linux (part of drm maint) 3671:T: git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates) See bottom for my suggestions. --Tom On Mon, Nov 06, 2017 at 09:27:25AM -0800, Joe Perches wrote: > Check for duplicate section headers and link reachability. 
> > Miscellanea: > > o Add --self-test=<foo> options (sections, patterns and scm for now) > where the default without options is all tests > o Rename check_maintainers_patterns to self_test > o Rename self_test_pattern_info to self_test_info > > Signed-off-by: Joe Perches <j...@perches.com> > cc: Tom Saeger <tom.sae...@oracle.com> > --- > scripts/get_maintainer.pl | 114 > +++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 97 insertions(+), 17 deletions(-) > > diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl > index c68a5d1ba709..748bff0790a8 100755 > --- a/scripts/get_maintainer.pl > +++ b/scripts/get_maintainer.pl > @@ -57,7 +57,7 @@ my $sections = 0; > my $file_emails = 0; > my $from_filename = 0; > my $pattern_depth = 0; > -my $self_test = 0; > +my $self_test = undef; > my $version = 0; > my $help = 0; > my $find_maintainer_files = 0; > @@ -221,7 +221,7 @@ if (-f $ignore_file) { > > if ($#ARGV > 0) { > foreach (@ARGV) { > - if ($_ eq "-self-test" || $_ eq "--self-test") { > + if ($_ =~ /^-{1,2}self-test(?:=|$)/) { > die "$P: using --self-test does not allow any other option or > argument\n"; > } > } > @@ -263,7 +263,7 @@ if (!GetOptions( > 'fe|file-emails!' 
=> \$file_emails, > 'f|file' => \$from_filename, > 'find-maintainer-files' => \$find_maintainer_files, > - 'self-test' => \$self_test, > + 'self-test:s' => \$self_test, > 'v|version' => \$version, > 'h|help|usage' => \$help, > )) { > @@ -280,9 +280,9 @@ if ($version != 0) { > exit 0; > } > > -if ($self_test) { > +if (defined $self_test) { > read_all_maintainer_files(); > - check_maintainers_patterns(); > + self_test(); > exit 0; > } > > @@ -329,7 +329,7 @@ if (!top_of_kernel_tree($lk_path)) { > my @typevalue = (); > my %keyword_hash; > my @mfiles = (); > -my @self_test_pattern_info = (); > +my @self_test_info = (); > > sub read_maintainer_file { > my ($file) = @_; > @@ -339,6 +339,7 @@ sub read_maintainer_file { > my $i = 1; > while (<$maint>) { > my $line = $_; > + chomp $line; > > if ($line =~ m/^([A-Z]):\s*(.*)/) { > my $type = $1; > @@ -353,17 +354,16 @@ sub read_maintainer_file { > if ((-d $value)) { > $value =~ s@([^/])$@$1/@; > } > - if ($self_test) { > - push(@self_test_pattern_info, {file=>$file, > line=>$line, linenr=>$i, pat=>$value}); > - } > } elsif ($type eq "K") { > $keyword_hash{@typevalue} = $value; > } > push(@typevalue, "$type:$value"); > } elsif (!(/^\s*$/ || /^\s*\#/)) { > - $line =~ s/\n$//g; > push(@typevalue, $line); > } > + if (defined $self_test) { > + push(@self_test_info, {file=>$file, linenr=>$i, line=>$line}); > + } > $i++; > } > close($maint); > @@ -614,17 +614,97 @@ if ($web) { > > exit($exit); > > -sub check_maintainers_patterns { > +sub self_test { > my @lsfiles = (); > + my @good_links = (); > + my @bad_links = (); > + my @section_headers = (); > > @lsfiles = vcs_list_files($lk_path); > > - for my $x (@self_test_pattern_info) { > - if (!grep(m@^$x->{pat}@, @lsfiles)) { > - my $line = $x->{line}; > - chomp($line); > - print("$x->{file}:$x->{linenr}: warning: no matches $line\n"); > - } > + for my $x (@self_test_info) { > + > + ## Section header duplication > + if (($self_test eq "" || $self_test =~ /\bsections\b/) && > + $x->{line} 
=~ /^\S[^:]/) { > + if (grep(m@^\Q$x->{line}\E@, @section_headers)) { > + print("$x->{file}:$x->{linenr}: warning: duplicate section > header\t$x->{line}\n"); > + } else { > + push(@section_headers, $x->{line}); > + } > + } > + next if ($x->{line} !~ /^([A-Z]):\s*(.*)/); > + > + my $type = $1; > + my $value = $2; > + > + ## Filename pattern matching > + if (($type eq "F" || $type eq "X") && > + ($self_test eq "" || $self_test =~ /\bpatterns\b/)) { > + $value =~ s@\.@\\\.@g; ##Convert . to \. > + $value =~ s/\*/\.\*/g; ##Convert * to .* > + $value =~ s/\?/\./g; ##Convert ? to . > + ##if pattern is a directory and it lacks a trailing slash, add one > + if ((-d $value)) { > + $value =~ s@([^/])$@$1/@; > + } > + if (!grep(m@^$value@, @lsfiles)) { > + print("$x->{file}:$x->{linenr}: warning: no file > matches\t$x->{line}\n"); > + } > + > + ## Link reachability > + } elsif (($type eq "W" || > + $type eq "B" && $value =~ /^https?:/) && > + ($self_test eq "" || $self_test =~ /\blinks\b/)) { > + next if (grep(m@^\Q$value\E$@, @good_links)); > + my $isbad = 0; > + if (grep(m@^\Q$value\E$@, @bad_links)) { > + $isbad = 1; > + } else { > + my $output = `wget --spider -q --no-check-certificate --timeout > 10 --tries 1 $value`; > + if ($? == 0) { > + push(@good_links, $value); > + } else { > + push(@bad_links, $value); > + $isbad = 1; > + } > + } > + if ($isbad) { > + print("$x->{file}:$x->{linenr}: warning: possible bad > link\t$x->{line}\n"); > + } > + > + ## SCM reachability > + } elsif (($type eq "T" && $value =~ /^(?:git|quilt|hg)\s+\S/) && > + ($self_test eq "" || $self_test =~ /\bscm\b/)) { > + next if (grep(m@^\Q$value\E$@, @good_links)); > + my $isbad = 0; > + if (grep(m@^\Q$value\E$@, @bad_links)) { > + $isbad = 1; > + } else { > + if ($value =~ /^git\s+(\S+)/) { > + my $url = $1; > + my $output = `git ls-remote --exit-code -h "$url" > > /dev/null 2>&1`; > + if ($? 
== 0) { > + push(@good_links, $value); > + } else { > + push(@bad_links, $value); > + $isbad = 1; > + } > + } elsif ($value =~ /^(?:quilt|hg)\s+(https?:\S+)/) { > + my $url = $1; > + my $output = `wget --spider -q --no-check-certificate > --timeout 10 --tries 1 $url`; > + if ($? == 0) { > + push(@good_links, $value); > + } else { > + push(@bad_links, $value); > + $isbad = 1; > + } > + } > + if ($isbad) { > + print("$x->{file}:$x->{linenr}: warning: possible bad > link\t$x->{line}\n"); > + } > + } > } > } > > -- > 2.15.0 > Changed SCM portion to this, which picks up a few more warnings... Checks git branch on remote if specified. Perhaps a $ismalformed category or some other way to deal with malformed entries? Or just move up to first check of SCM? ## SCM reachability } elsif (($type eq "T") && ($self_test eq "" || $self_test =~ /\bscm\b/)) { next if (grep(m@^\Q$value\E$@, @good_links)); my $isbad = 0; if (grep(m@^\Q$value\E$@, @bad_links)) { $isbad = 1; } else { if ($value !~ /^(?:git|quilt|hg)\s+\S/) { print("$x->{file}:$x->{linenr}: warning: malformed entry\t$x->{line}\n"); } elsif ($value =~ /^git\s+(\S+)(\s+([^\(]+\S+))?/) { my $url = $1; my $branch = ""; $branch = $3 if $3; my $output = `git ls-remote --exit-code -h "$url" $branch> /dev/null 2>&1`; if ($? == 0) { push(@good_links, $value); } else { push(@bad_links, $value); $isbad = 1; } } elsif ($value =~ /^(?:quilt|hg)\s+(https?:\S+)/) { my $url = $1; my $output = `wget --spider -q --no-check-certificate --timeout 10 --tries 1 $url`; if ($? == 0) { push(@good_links, $value); } else { push(@bad_links, $value); $isbad = 1; } } } if ($isbad) { print("$x->{file}:$x->{linenr}: warning: possible bad link\t$x->{line}\n"); } } } }