Here's my second try at fixing this.  There was nothing really wrong with
the first try; this time I just implemented some of the performance tuning
suggested in #314548.

Also, I added the option:
     -i, --ignore-perms     Don't check that file owner and permissions match

diff -ru perforate-1.1/debian/finddup.1 perforate-1.1.changed/debian/finddup.1
--- perforate-1.1/debian/finddup.1      2005-11-09 05:40:42.771561886 +0200
+++ perforate-1.1.changed/debian/finddup.1      2005-11-09 05:39:25.414445085 +0200
@@ -7,8 +7,8 @@
 .B finddup
 searches (starting point is the current directory, doesn't cross device
 boundaries) files for duplicates.  Files are considered as duplicate, if their
-.B md5sum
-match.  Links are not counted as duplicate.
+.BR md5sum ,\  uid ,\  gid
+and permissions match.  Links are not counted as duplicate.
 .PP
 The output is a list of duplicate files.  Each line consists of the size (in
 blocks) of the file(s) and the names it appears as.
Only in perforate-1.1.changed/debian: finddup.1~
diff -ru perforate-1.1/finddup perforate-1.1.changed/finddup
--- perforate-1.1/finddup       2005-11-09 05:40:42.769562296 +0200
+++ perforate-1.1.changed/finddup       2005-11-09 07:26:40.510077591 +0200
@@ -20,11 +20,12 @@
 use vars qw($RCS_VERSION $VERSION @dir $opt %filelist %md5list);
 
 sub wanted;
+sub insert_md5;
 
 $RCS_VERSION = '$Id: finddup,v 2.3 2005/02/06 18:57:42 klaus Exp $';
 ($VERSION = '$Revision: 2.3 $') =~ s/^\D*([\d.]*)\D*$/$1/;
 
-GetOptions($opt = {}, qw(help|h man version noaction|n verbose|v quiet|q 
link|l oldresult|o dir=s@)) || pod2usage 2;
+GetOptions($opt = {}, qw(help|h man version noaction|n ignore-perms|i 
verbose|v quiet|q link|l oldresult|o dir=s@)) || pod2usage 2;
 pod2usage(1) if $opt->{help};
 pod2usage(-exitstatus => 0, -verbose => 2) if $opt->{man};
 if ($opt->{version}) { print "Version: $VERSION\n"; exit 0; }
@@ -56,21 +57,17 @@
    # Traverse desired filesystems
    File::Find::find({wanted => \&wanted}, @dir);
 
-   # Now calculate all md5sums. Afterwards %filelist can be freed.
-   foreach (sort {$a->[1]->[0] cmp $b->[1]->[0]} values(%filelist))
+   my ($prev, $prev2) = ([-1, -1, -1, -1], [-2, -2, -2, -2]);
+
+   # Checksum only files whose size equals that of a neighbour in size order;
+   # the two sentinels carry impossible sizes. Afterwards %filelist can be freed.
+   foreach (sort {$a->[0] <=> $b->[0]} values(%filelist))
    {
-      if (open(IN, "<", $_->[1]->[0]))
-      {
-        my $md5 = Digest::MD5->new->addfile(*IN)->hexdigest;
-        close IN;
-        $md5list{$md5} = [] unless exists $md5list{$md5};
-        push @{$md5list{$md5}}, $_;
-      }
-      else
-      {
-        warn "Cannot open File '" . $_->[1]->[0] . "'";
-      }
+      insert_md5($prev) if $_->[0] == $prev->[0] || $prev->[0] == $prev2->[0];
+      $prev2 = $prev; # shift the window after the test; before it, prev == prev2 always
+      $prev = $_;
    } # foreach (sort {$a->[1]->[0] cm...
+   insert_md5($prev) if defined $prev2 && $prev->[0] == $prev2->[0];
    %filelist = ();
 } # if ($opt->{oldresult}) { ... }...
 
@@ -83,11 +80,11 @@
    {
       if ($opt->{link})
       {
-        my $reffile = shift @{$md5list{$_}->[0]->[1]}; # Remove the first file 
to not unlink them
+        my $reffile = shift @{$md5list{$_}->[0]->[4]}; # Remove the first file 
to not unlink them
         print "Länge: $size Files:\t$reffile\n" if $opt->{verbose};
         foreach (@{$md5list{$_}})
         {
-           foreach (@{$_->[1]})
+           foreach (@{$_->[4]})
            {
               print "\t\t\t$_\n" if $opt->{verbose};
               unless ($opt->{noaction})
@@ -104,7 +101,7 @@
         print "$size" unless $opt->{quiet};
         foreach (@{$md5list{$_}})
         {
-           foreach (@{$_->[1]})
+           foreach (@{$_->[4]})
            {
               print " '$_'" unless $opt->{quiet};
            }
@@ -123,8 +120,25 @@
 
    if ((($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size) = lstat($_)) && 
!($File::Find::prune |= ($dev != $File::Find::topdev)) && -f _)
    {
-      $filelist{$ino} = [$size, []] unless exists $filelist{$ino};
-      push @{$filelist{$ino}->[1]}, $name;
+      $filelist{$ino} = [$size, $mode, $uid, $gid, []] unless exists 
$filelist{$ino};
+      push @{$filelist{$ino}->[4]}, $name;
+   }
+}
+
+sub insert_md5 # checksum one %filelist entry and group it in %md5list
+{
+   my $file = shift; # %filelist entry: [size, mode, uid, gid, [names]]
+   if (open(IN, "<", $file->[4]->[0])) # read via the first recorded name
+   {
+      my $md5 = Digest::MD5->new->addfile(*IN)->hexdigest;
+      $md5 .= "\t".$file->[1]."\t".$file->[2]."\t".$file->[3] unless 
$opt->{'ignore-perms'}; # key also carries mode, uid, gid unless -i is given
+      close IN;
+      $md5list{$md5} = [] unless exists $md5list{$md5};
+      push @{$md5list{$md5}}, $file; # entries with equal keys share one list
+   }
+   else
+   {
+      warn "Cannot open File '" . $file->[4]->[0] . "'";
    }
 }
 
@@ -146,6 +160,7 @@
  -q, --quiet            be quiet
  -l, --link            link the identical files together
  -o, --oldresult        Use the old output of this script
+ -i, --ignore-perms     Don't check that file owner and permissions match
  -d, --dir              Define the dir to check (you may specify more than one)
 
 =head1 DESCRIPTION
Only in perforate-1.1.changed/: finddup~

Reply via email to