有没有在一个目录十几万个文件里面搜索所有相同的文件方法 (已找到很多解决方案)
- xiehuoli
- 帖子: 5941
- 注册时间: 2006-06-10 8:43
- 来自: 中国 CS
-
- 帖子: 16
- 注册时间: 2008-02-17 14:27
-
- 帖子: 178
- 注册时间: 2007-10-20 20:15
难道是豆腐MM那天叫我试的BT脚本??
$ cat dup_find.pl
代码: 全选
#!/usr/bin/perl
#===============================================================================
#
# USAGE: ./dup_find.pl dir1 [dir2 ...]
#
# DESCRIPTION: Duplicate file detector
#
# AUTHOR: DawnFantasy (DF), <goldenshore999 ATAT gmail DOTDOT com>
# VERSION: 1.0
# REVISION: 1
#===============================================================================
use strict;
use warnings;
use utf8;
binmode STDOUT, 'utf8';
use File::Find;
use Digest::SHA;
#use File::stat qw(:FIELDS);
use IO qw/ Dir File /;
use Data::Dumper;
# Print a short usage message to STDOUT.
#
# Takes no arguments and returns nothing.  The synopsis reflects what the
# script actually accepts: at least one directory is required (the script
# exits when @ARGV is empty) and every further argument is scanned as well.
sub usage {
    print "Usage:\n$0 dir [dir ...]\n";
    return;
}
# ----------------------------------------------------------------------------
# Main program.
#
#   1. Walk every directory named on the command line; get_info() files each
#      accepted path under its size in %hashes.
#   2. For every size shared by at least two distinct paths, hash the files
#      with get_digest() and group them by digest.
#   3. Print one "SAME: ..." line per group of two or more identical files
#      and remember the group in @result.
# ----------------------------------------------------------------------------
if ( not @ARGV ) {
    usage();
    exit -1;
}

my %hashes;    # file size => [ paths having that size ], filled by get_info()
my @result;    # one array ref of paths per group of identical files

# Paths are handled as raw bytes (nothing in this file decodes them), but
# STDOUT carries the utf8 layer set by the binmode call near the top of the
# file.  Printing the bytes directly would encode them a second time and turn
# non-ASCII names into mojibake, so a decoded copy is used for display only.
my $for_display = sub {
    my ($name) = @_;
    utf8::decode($name);    # leaves $name unchanged if it is not valid UTF-8
    return $name;
};

{
    # Silence only File::Find's own diagnostics during the directory walk
    # instead of switching every warning off.
    no warnings 'File::Find';
    foreach my $dir (@ARGV) {
        my $shown = $for_display->($dir);
        print "Loading file info in DIR $shown\n";
        find( { wanted => \&get_info, no_chdir => 1 }, ($dir) );
    }
}

## Look at every size bucket once, smallest files first.
foreach my $size ( sort { $a <=> $b } keys %hashes ) {

    # The same path is filed more than once when a directory is scanned
    # twice (e.g. named twice on the command line); hash each path only once.
    my %seen;
    my @candidates = grep { !$seen{$_}++ } @{ $hashes{$size} };

    ## A size that belongs to a single file cannot have duplicates.
    next if @candidates < 2;

    ## Group the candidates by content digest, keeping paths in sorted order.
    my %files_by_digest;
    foreach my $file ( sort @candidates ) {
        push @{ $files_by_digest{ get_digest($file) } }, $file;
    }

    ## Report every digest shared by two or more files.
    foreach my $digest ( sort keys %files_by_digest ) {
        my @same = @{ $files_by_digest{$digest} };
        next if @same < 2;
        push @result, [@same];
        my @shown = map { $for_display->($_) } @same;
        print "SAME: @shown\n";
    }
}
# Compute the content digest of one file.
#
#   $path   - name of the file to read
#   returns - the digest as a hexadecimal string, produced by a
#             Digest::SHA object created with algorithm argument 1 (SHA-1)
sub get_digest {
    my ($path) = @_;
    return Digest::SHA->new(1)->addfile($path)->hexdigest;
}
# File::Find "wanted" callback: file the current path under its size.
#
# Reads the path from $File::Find::name and appends it to the list kept in
# $hashes{<size in bytes>}.  A path is skipped when it is not a plain file,
# is a symbolic link, is not readable, or when its size is 0 bytes, 1 byte,
# or more than 16 MB.  Always returns 1.
sub get_info {
    my $fn = $File::Find::name;

    # A single lstat() feeds every test below through the cached "_" handle,
    # so each path costs one system call instead of four.
    my $size = ( lstat $fn )[7];
    return 1 if not defined $size;    # lstat failed, nothing to record

    ## plain file only; lstat() reports a symlink as a link, never as a file
    return 1 if not -f _;
    ## readable
    return 1 if not -r _;

    ## ignore empty and one-byte files
    return 1 if $size < 2;
    return 1 if $size > 16 * 1024 * 1024;    ## 16MB

    # Append in place; copying the whole bucket for every new file made
    # large buckets quadratically expensive.
    push @{ $hashes{$size} }, $fn;
    return 1;
}
代码: 全选
$ ./dup_find.pl /home/fideas/temp /home/fideas/temp
Loading file info in DIR /home/fideas/temp
Loading file info in DIR /home/fideas/temp
SAME: /home/fideas/temp/.emacs.d/auto-save-list/.saves-5133-localhost~ /home/fideas/temp/.emacs.d/auto-save-list/.saves-5306-localhost~
SAME: /home/fideas/temp/.emacs.d/auto-save-list/.saves-13190-localhost~ /home/fideas/temp/.emacs.d/auto-save-list/.saves-6879-localhost~
SAME: /home/fideas/temp/software/statabooks/statamanual/中文版stata入门讲义.pdf /home/fideas/temp/software/statabooks/中文版stata入门讲义.pdf
SAME: /home/fideas/temp/software/Introduction to Modern Econometrics Using Stata.djvu /home/fideas/temp/software/statabooks/StatisticWithStata/Introduction to Modern Econometrics Using Stata.djvu
Portage 2.1.4.4 (default-linux/x86/2007.0/desktop, gcc-4.2.3, glibc-2.7-r1,
System uname: 2.6.24-gentoo-r3 i686 Intel(R) Celeron(R) M processor 1.30GHz
System uname: 2.6.24-gentoo-r3 i686 Intel(R) Celeron(R) M processor 1.30GHz
- lhw828
- 帖子: 2797
- 注册时间: 2007-03-15 16:58
- 来自: 湖北武汉
- 联系:
看不懂………………
.
Linux下安装QQ的各种办法——2017年3月7日更新——QQ8.8
Linux/Ubuntu学习笔记——用前人的经验,让你快速进入Linux的怀抱
科学上网的姿势,无痛穿越长城
Ubuntu交流QQ群:16308991(500人群)和10993386(500人群)疯狂招人!大家速来!
.
- 吴广德
- 帖子: 347
- 注册时间: 2006-10-26 9:24
- 系统: Ubuntu 18.04 LTS
- 来自: 中国-广西-南宁
- lwent90
- 帖子: 34
- 注册时间: 2011-07-30 21:57
Re: 有没有在一个目录十几万个文件里面搜索所有相同的文件方法 (已找到很多解决方案)
用了版主大大的代码 非常好用