发表于 : 2007-11-09 16:47
这害人的妹托,一直不说明是什么文件。
呵呵,什么文件都有eexpress 写了:这害人的妹托,一直不说明是什么文件。
代码: 全选
#!/usr/bin/perl
#===============================================================================
#
# USAGE: ./dup_dete.pl [dir1] [dir2]
#
# DESCRIPTION: Duplicate file detector
#
# AUTHOR: DawnFantasy (DF), <goldenshore999 ATAT gmail DOTDOT com>
# VERSION: 1.0
# REVISION: 1
#===============================================================================
use strict;
use warnings;
use utf8;
binmode STDOUT, 'utf8';
use File::Find;
use Digest::SHA;
#use File::stat qw(:FIELDS);
use IO qw/ Dir File /;
use Data::Dumper;
# Print a short usage summary (script name plus the directory arguments)
# to the currently selected output handle.
# Returns whatever print() returns.
sub usage {
    return print "Usage:\n", "$0 [dir [dir]]\n";
}
usage(), exit -1 if not @ARGV;

# %hashes : file size => array ref of paths having that size (filled by get_info)
# @result : one array ref of paths per group of identical files
my %hashes;
my @result;
{
    # File names come from the OS as raw bytes, but STDOUT carries a utf8
    # layer, so printing them directly double-encodes any non-ASCII name.
    # Decode names that are valid UTF-8 for display; anything else is
    # printed exactly as before.
    require Encode;
    my $display = sub {
        my ($bytes) = @_;
        my $text = eval {
            Encode::decode( 'UTF-8', $bytes,
                Encode::FB_CROAK() | Encode::LEAVE_SRC() );
        };
        return defined $text ? $text : $bytes;
    };

    ## Phase 1: walk every directory; get_info() records each candidate
    ## file in %hashes under its size.
    foreach my $dir (@ARGV) {
        # The original script disabled warnings for this whole block; that is
        # kept for the traversal only -- presumably to quiet File::Find's
        # diagnostics about unreadable directories (TODO confirm).
        no warnings;
        print "Loading file info in DIR ", $display->($dir), "\n";
        find( { wanted => \&get_info, no_chdir => 1 }, ($dir) );
    }

    ## Phase 2: only sizes shared by more than one path can hold duplicates.
    ## Visit them in ascending numeric order.
    my @sizes =
        sort { $a <=> $b } grep { @{ $hashes{$_} } > 1 } keys %hashes;

    foreach my $size (@sizes) {

        # The same path shows up several times when directories overlap or
        # are given twice on the command line; hash each path only once.
        my %seen;
        my @files = grep { !$seen{$_}++ } @{ $hashes{$size} };
        next if @files < 2;

        # Group the paths of this size by content digest.
        my %files_by_digest;
        foreach my $file (@files) {
            my $digest = eval { get_digest($file) };
            if ( !defined $digest ) {

                # A file may vanish or become unreadable between the scan
                # and the hashing; skip it instead of aborting the run.
                warn "Skipping ", $display->($file), ": $@";
                next;
            }
            push @{ $files_by_digest{$digest} }, $file;
        }

        ## Report every digest shared by at least two files.
        foreach my $digest ( sort keys %files_by_digest ) {
            my @same = sort @{ $files_by_digest{$digest} };
            next if @same < 2;
            push @result, [@same];
            my @shown = map { $display->($_) } @same;
            print "SAME: @shown\n";
        }
    }
}
# Compute the SHA-1 digest of a file's contents.
#
#   $fn     - path of the file to hash
#   returns - the digest as a hexadecimal string
#
# Digest::SHA's addfile() raises an exception when the file cannot be
# opened or read, so this sub dies in that case.
sub get_digest {
    my $fn = shift;

    my $sha = Digest::SHA->new(1);    # algorithm 1 selects SHA-1

    # "b" asks Digest::SHA to read the file in binary mode, so the bytes on
    # disk are hashed as-is on platforms that distinguish text from binary.
    $sha->addfile( $fn, 'b' );

    return $sha->hexdigest;
}
# File::Find "wanted" callback: record the current file in %hashes, keyed by
# its size in bytes, so that files of equal size can be compared later.
#
# Reads  $File::Find::name (the path of the entry being visited).
# Writes %hashes: size => array ref of paths with that size.
#
# Skipped entries: symlinks, anything that is not a plain readable file,
# files whose size cannot be determined, files of 0 or 1 byte, and files
# larger than 16MB.
# Always returns 1.
sub get_info {
    my $fn       = $File::Find::name;
    my $max_size = 16 * 1024 * 1024;    ## 16MB

    ## file / readable / not symlink
    return 1 if -l $fn;
    return 1 if not -f $fn;
    return 1 if not -r $fn;

    my $size = ( stat($fn) )[7];

    # stat() can fail if the file disappears mid-scan; skip it explicitly
    # instead of relying on undef comparing equal to 0.
    return 1 if not defined $size;
    return 1 if $size < 2;
    return 1 if $size > $max_size;

    # Append in place; the bucket is created on first use.
    push @{ $hashes{$size} }, $fn;
    return 1;
}
代码: 全选
$ ./dup_find.pl /home/fideas/temp /home/fideas/temp
Loading file info in DIR /home/fideas/temp
Loading file info in DIR /home/fideas/temp
SAME: /home/fideas/temp/.emacs.d/auto-save-list/.saves-5133-localhost~ /home/fideas/temp/.emacs.d/auto-save-list/.saves-5306-localhost~
SAME: /home/fideas/temp/.emacs.d/auto-save-list/.saves-13190-localhost~ /home/fideas/temp/.emacs.d/auto-save-list/.saves-6879-localhost~
SAME: /home/fideas/temp/software/statabooks/statamanual/中文版stata入门讲义.pdf /home/fideas/temp/software/statabooks/中文版stata入门讲义.pdf
SAME: /home/fideas/temp/software/Introduction to Modern Econometrics Using Stata.djvu /home/fideas/temp/software/statabooks/StatisticWithStata/Introduction to Modern Econometrics Using Stata.djvu