Here's my second attempt at fixing this. There was nothing really wrong with the first attempt; this time I have additionally implemented some of the performance tuning suggested in #314548.
I also added a new option, "-i, --ignore-perms": don't check that file owner and permissions match.
diff -ru perforate-1.1/debian/finddup.1 perforate-1.1.changed/debian/finddup.1 --- perforate-1.1/debian/finddup.1 2005-11-09 05:40:42.771561886 +0200 +++ perforate-1.1.changed/debian/finddup.1 2005-11-09 05:39:25.414445085 +0200 @@ -7,8 +7,8 @@ .B finddup searches (starting point is the current directory, doesn't cross device boundaries) files for duplicates. Files are considered as duplicate, if their -.B md5sum -match. Links are not counted as duplicate. +.BR md5sum ,\ uid ,\ gid +and permissions match. Links are not counted as duplicate. .PP The output is a list of duplicate files. Each line consists of the size (in blocks) of the file(s) and the names it appears as. Only in perforate-1.1.changed/debian: finddup.1~ diff -ru perforate-1.1/finddup perforate-1.1.changed/finddup --- perforate-1.1/finddup 2005-11-09 05:40:42.769562296 +0200 +++ perforate-1.1.changed/finddup 2005-11-09 07:26:40.510077591 +0200 @@ -20,11 +20,12 @@ use vars qw($RCS_VERSION $VERSION @dir $opt %filelist %md5list); sub wanted; +sub insert_md5; $RCS_VERSION = '$Id: finddup,v 2.3 2005/02/06 18:57:42 klaus Exp $'; ($VERSION = '$Revision: 2.3 $') =~ s/^\D*([\d.]*)\D*$/$1/; -GetOptions($opt = {}, qw(help|h man version noaction|n verbose|v quiet|q link|l oldresult|o dir=s@)) || pod2usage 2; +GetOptions($opt = {}, qw(help|h man version noaction|n ignore-perms|i verbose|v quiet|q link|l oldresult|o dir=s@)) || pod2usage 2; pod2usage(1) if $opt->{help}; pod2usage(-exitstatus => 0, -verbose => 2) if $opt->{man}; if ($opt->{version}) { print "Version: $VERSION\n"; exit 0; } @@ -56,21 +57,17 @@ # Traverse desired filesystems File::Find::find({wanted => \&wanted}, @dir); - # Now calculate all md5sums. Afterwards %filelist can be freed. - foreach (sort {$a->[1]->[0] cmp $b->[1]->[0]} values(%filelist)) + my ($prev, $prev2) = ([-1, -1, -1, -1]); + + # Now calculate md5sums for each file that has another file of the same + # size. Afterwards %filelist can be freed. 
+ foreach (sort {$a->[0] cmp $b->[0]} values(%filelist)) { - if (open(IN, "<", $_->[1]->[0])) - { - my $md5 = Digest::MD5->new->addfile(*IN)->hexdigest; - close IN; - $md5list{$md5} = [] unless exists $md5list{$md5}; - push @{$md5list{$md5}}, $_; - } - else - { - warn "Cannot open File '" . $_->[1]->[0] . "'"; - } + $prev2 = $prev; + insert_md5($prev) if $_->[0] == $prev->[0] || $prev->[0] == $prev2->[0]; + $prev = $_; } # foreach (sort {$a->[1]->[0] cm... + insert_md5($prev) if defined $prev2 && $prev->[0] == $prev2->[0]; %filelist = (); } # if ($opt->{oldresult}) { ... }... @@ -83,11 +80,11 @@ { if ($opt->{link}) { - my $reffile = shift @{$md5list{$_}->[0]->[1]}; # Remove the first file to not unlink them + my $reffile = shift @{$md5list{$_}->[0]->[4]}; # Remove the first file to not unlink them print "Länge: $size Files:\t$reffile\n" if $opt->{verbose}; foreach (@{$md5list{$_}}) { - foreach (@{$_->[1]}) + foreach (@{$_->[4]}) { print "\t\t\t$_\n" if $opt->{verbose}; unless ($opt->{noaction}) @@ -104,7 +101,7 @@ print "$size" unless $opt->{quiet}; foreach (@{$md5list{$_}}) { - foreach (@{$_->[1]}) + foreach (@{$_->[4]}) { print " '$_'" unless $opt->{quiet}; } @@ -123,8 +120,25 @@ if ((($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size) = lstat($_)) && !($File::Find::prune |= ($dev != $File::Find::topdev)) && -f _) { - $filelist{$ino} = [$size, []] unless exists $filelist{$ino}; - push @{$filelist{$ino}->[1]}, $name; + $filelist{$ino} = [$size, $mode, $uid, $gid, []] unless exists $filelist{$ino}; + push @{$filelist{$ino}->[4]}, $name; + } +} + +sub insert_md5 +{ + my $file = shift; + if (open(IN, "<", $file->[4]->[0])) + { + my $md5 = Digest::MD5->new->addfile(*IN)->hexdigest; + $md5 .= "\t".$file->[1]."\t".$file->[2]."\t".$file->[3] unless $opt->{'ignore-perms'}; + close IN; + $md5list{$md5} = [] unless exists $md5list{$md5}; + push @{$md5list{$md5}}, $file; + } + else + { + warn "Cannot open File '" . $file->[4]->[0] . 
"'"; } } @@ -146,6 +160,7 @@ -q, --quiet be quiet -l, --link link the identical files together -o, --oldresult Use the old output of this script + -i, --ignore-perms Don't check that file owner and permissions match -d, --dir Define the dir to check (you may specify more than one) =head1 DESCRIPTION Only in perforate-1.1.changed/: finddup~