查找重复文件
发布时间:2020-12-15 21:09:50 所属栏目:大数据 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 #!/usr/bin/perl## fdupe tool - finding duplicate files## $Id: fdupe,v 1.7 2011/10/14 20:11:21 root Exp root $## Source code Copyright (c) 19
# The following code was collected from the Internet by the PHP Webmaster
# Network (52php.cn) and is shared for reference only.
#!/usr/bin/perl
#
# fdupe tool - finding duplicate files
#
# $Id: fdupe,v 1.7 2011/10/14 20:11:21 root Exp root $
#
# Source code Copyright (c) 1998,2011 Bernhard Schneider.
# May be used only for non-commercial purposes with
# appropriate acknowledgement of copyright.
#
# FILE :        fdupe
# DESCRIPTION : script finds duplicate files.
# AUTHOR:       Bernhard Schneider (e-mail address redacted by the hosting site)
#               hints, corrections & ideas are welcome
#
# usage: fdupe.pl <path> <path> ...
#        find / -xdev | fdupe.pl
#
#        Path arguments take priority; file names are read from stdin only
#        when no path is given and stdin is not a terminal.  With neither,
#        the current directory is scanned.
#
# how to select and remove duplicates:
#   redirect output to >file, edit the file and mark lines you
#   wish to move/delete with a preceding dash (-)
#   Use following script to delete marked files:
#
#       #!/usr/bin/perl -n
#       chomp; unlink if s/^-//;
#
# history:
#   12.05.99 - goto statement replaced with next
#   14.05.99 - minor changes
#   18.05.99 - removed confusing 'for $y'
#              included hash-search
#   20.05.99 - minor changes
#   02.03.00 - some functions rewritten, optimized for speed
#   10.01.01 - hint-fix by Ozzie |ozric at kyuzz.org|
#   05.03.02 - fixed hangups by reading block/char-Devices
#   08.09.11 - skips checking of hard links
#   14.10.11 - accept file names from stdin
#

use strict;
use warnings;

$| = 1;    # unbuffered output so progress messages show up immediately

# size in bytes => array ref of references to file names having that size
my %farray = ();

# State shared between getchunk() and mycmp(): the file most recently opened
# by getchunk() and its identity as [device, inode].  Both are undef when the
# last getchunk() call could not open its file.
my $current_fh;
my $current_id;

# ------------------------------
# Record one path in %farray if it is a regular file.
# Symbolic links are never followed; directories, sockets, pipes and
# devices are ignored.
sub _remember_file {
    my ($path) = @_;

    return if -l $path;
    return unless -f $path;

    # -s yields '' (not 0) for an empty file; normalise to a numeric key.
    my $size = ( -s _ ) || 0;

    # $path is a fresh lexical on every call, so each reference is distinct.
    push @{ $farray{$size} }, \$path;
    return;
}

# ------------------------------
# traverse directories
#
# Recursively walks $dir and records every regular file below it.
# Dies if a directory cannot be opened.
sub scan {
    my ($dir) = @_;

    opendir( my $dh, $dir ) or die "($dir) $!\n";

    # Read everything up front so the handle is closed before recursing.
    my @entries = grep { !/^\.\.?\z/ } readdir $dh;
    closedir $dh;

    for my $entry (@entries) {
        my $path = "$dir/$entry";
        next if -l $path;    # do not follow symlinks (avoids loops)
        if ( -d $path ) {
            scan($path);
        }
        else {
            _remember_file($path);
        }
    }
    return;
}

# ------------------------------
# get chunk of bytes from a file
#
# Opens the file referenced by $pfname as the "current" file (later used by
# mycmp) and returns its first 32 bytes, or the whole content if the file is
# shorter.  Returns undef if the file cannot be opened or read as expected.
sub getchunk {
    my ( $fsize, $pfname ) = @_;
    my $chunksize = 32;

    # Drop any previous state first so a failure never leaves stale data.
    $current_fh = undef;
    $current_id = undef;

    # Three-argument open: the file name can never be taken as a mode/pipe.
    open( my $fh, '<', ${$pfname} ) or return;
    binmode $fh;

    $current_fh = $fh;
    $current_id = [ ( stat $fh )[ 0, 1 ] ];

    my $nread = read( $fh, my $buff, $chunksize );
    return unless defined $nread;
    return $buff if $nread == $chunksize || $nread == $fsize;
    return;
}

# ------------------------------
# compare two files
#
# Compares the current file (opened by the last getchunk call) with the file
# referenced by $fptr.  Returns 0 if they are identical (or hard links to the
# same inode), -1 if they differ or cannot be read.
sub mycmp {
    my ($fptr) = @_;
    my $buffsize = 16 * 1024;

    return -1 unless defined $current_fh;

    open( my $other_fh, '<', ${$fptr} ) or return -1;
    binmode $other_fh;

    # Hard links to the same file are identical by definition; skip the read.
    # The device must match too, since inode numbers are only unique per device.
    my ( $dev, $ino, $nlink ) = ( stat $other_fh )[ 0, 1, 3 ];
    if (   defined $nlink
        && $nlink > 1
        && defined $current_id->[1]
        && $dev == $current_id->[0]
        && $ino == $current_id->[1] )
    {
        return 0;
    }

    # getchunk() already consumed the first bytes; start over.
    seek( $current_fh, 0, 0 ) or return -1;

    while (1) {
        my $nread1 = read( $current_fh, my $buffa, $buffsize );
        my $nread2 = read( $other_fh,   my $buffb, $buffsize );

        # A read error must never be mistaken for "both at end of file".
        return -1 unless defined $nread1 && defined $nread2;
        return -1 if $nread1 != $nread2 || $buffa ne $buffb;
        return 0  if $nread1 == 0;
    }
}

# ------------------------------

print "collecting files and sizes ...\n";

if (@ARGV) {
    scan($_) for @ARGV;
}
elsif ( !-t STDIN ) {
    while ( my $line = <STDIN> ) {
        $line =~ s/[\r\n]+\z//;
        next if $line eq '';
        _remember_file($line);
    }
}
else {
    scan('.');    # use working directory if nothing else was given
}

print "now comparing ...\n";

for my $fsize ( sort { $b <=> $a } keys %farray ) {
    my @candidates = @{ $farray{$fsize} };

    # skip files with unique file size
    next if @candidates < 2;

    my @groups;        # each element: array ref of name refs with equal content
    my %groups_for;    # leading chunk => indexes into @groups

  FILE:
    for my $fptr (@candidates) {
        my $chunk = getchunk( $fsize, $fptr );

        # Unreadable (or concurrently modified) files cannot be compared.
        next FILE unless defined $chunk;

        # Only groups whose first bytes match need a full comparison.
        for my $idx ( @{ $groups_for{$chunk} || [] } ) {
            if ( mycmp( $groups[$idx][0] ) == 0 ) {

                # found duplicate, collecting
                push @{ $groups[$idx] }, $fptr;
                next FILE;
            }
        }

        # nothing found, start a new group
        push @groups, [$fptr];
        push @{ $groups_for{$chunk} }, $#groups;
    }

    # show found dupes for actual size
    for my $group ( grep { @{$_} > 1 } @groups ) {
        print "\n size: $fsize\n\n";
        for my $name_ref ( @{$group} ) {
            print ${$name_ref}, "\n";
        }
    }
}

close $current_fh if defined $current_fh;

# The content above was collected and compiled by the PHP Webmaster Network
# (52php.cn) for reference and study. If it was helpful, feel free to
# bookmark, like, recommend and share it. (Editor: Li Datong)
# [Disclaimer] All content on this site comes from the Internet; the opinions
# expressed are those of the authors alone and do not represent the position
# of this site. If your rights have been unintentionally infringed, please
# contact the webmaster promptly to have the content removed.