查找重复文件

发布时间：2020-12-15 21:09:50　所属栏目：大数据　来源：网络整理

导读：今天PHP站长网 52php.cn把收集自互联网的代码分享给大家，仅供参考。 #!/usr/bin/perl## fdupe tool - finding duplicate files## $Id: fdupe,v 1.7 2011/10/14 20:11:21 root Exp root $## Source code Copyright (c) 19

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家，仅供参考

#!/usr/bin/perl
#
# fdupe tool - finding duplicate files
#
# $Id: fdupe,v 1.7 2011/10/14 20:11:21 root Exp root $
#
# Source code Copyright (c) 1998,2011 Bernhard Schneider.
# May be used only for non-commercial purposes with
# appropriate acknowledgement of copyright.
#
# FILE :        fdupe
# DESCRIPTION : script finds duplicate Files.
# AUTHOR:       Bernhard Schneider <[email?protected]>
# hints, corrections & ideas are welcome
#
# usage: fdupe.pl <path> <path> ...
#        find / -xdev | fdupe.pl
#
# how to select and remove duplicates:
#   redirect the output to a file (>file), edit the file and mark the lines
#   you wish to move/delete with a preceding dash (-)
#   Use following script to delete marked files:
#   #!/usr/bin/perl -n
#   chomp; unlink if s/^-//;
#
# history:
# 12.05.99 - goto statement replaced with next
# 14.05.99 - minor changes
# 18.05.99 - removed confusing 'for $y'
#            included hash-search 
# 20.05.99 - minor changes
# 02.03.00 - some functions rewritten,optimized for speed
# 10.01.01 - hint-fix by Ozzie |ozric at kyuzz.org|
# 05.03.02 - fixed hangups by reading block/char-Devices
# 08.09.11 - skips checking of hard links
# 14.10.11 - accept file names from stdin
#
#use strict; # uncomment for debugging

# Turn on autoflush for the currently selected output handle so the
# progress messages are written as soon as they are printed.
$| = 1;

# F1 and F2 are the bareword file handles shared by getchunk() and mycmp().
local *F1;
local *F2;

# Files grouped by size: size in bytes => array of entries.  The comparison
# code dereferences each entry to obtain the file name.
my %farray;

# [link count, inode] of the file most recently opened on F1 by getchunk().
my $statF1;

# ------------------------------
# traverse directories
# scan DIR
#
# Recursively walks the directory tree rooted at DIR and files every plain
# file it finds into the file-level %farray, keyed by file size in bytes
# (0 for empty files).  Each entry is a REFERENCE to the path string, because
# getchunk(), mycmp() and the report loop all dereference their file
# argument ($$pfname, $$fptr, $$_).
#
# Symbolic links (to files or to directories), sockets, pipes and character /
# block devices are skipped.  Dies if DIR cannot be opened.
# The return value carries no information.
sub scan ($) {
    my ($dir) = @_;

    # A lexical handle keeps every recursion level's directory stream to
    # itself instead of sharing (and re-opening) one bareword DIR handle.
    opendir(my $dh, $dir) or die "($dir) $!\n";
    my @entries = readdir($dh);
    closedir($dh);

    for my $entry (@entries) {
        # Skip only the "." and ".." links - not every name that happens
        # to be one or two characters long.
        next if $entry eq '.' or $entry eq '..';

        my $path = "$dir/$entry";

        next if -l $path;                                 # never follow symlinks
        next if !-e _ or -S _ or -p _ or -c _ or -b _;    # vanished or special file

        if (-d _) {
            scan($path);
        }
        else {
            # $path is a fresh lexical on every iteration, so each stored
            # reference points to its own string.
            push @{ $farray{ (-s _) || 0 } }, \$path;
        }
    }

    return;
}

# ------------------------------
# get chunk of bytes from a file
# getchunk SIZE, NAMEREF
#
# Opens the file whose name NAMEREF points to on the shared handle F1,
# stores its [link count, inode] in the file-level $statF1 and returns the
# leading chunk (at most 32 bytes) of the file.  F1 is left open on purpose:
# mycmp() rewinds it and reads it for the full comparison.
#
# Returns the chunk as a string ("" for an empty file), or undef when the
# file cannot be opened or read, or when fewer bytes arrive than a full chunk
# although SIZE says the file is larger.
sub getchunk ($$) {
    my ($fsize, $pfname) = @_;
    my $chunksize = 32;

    # Three-argument open: the name is taken literally, so leading/trailing
    # blanks or characters such as '>' and '|' can never change the open mode.
    open(F1, '<', $$pfname) or return undef;

    $statF1 = [ (stat F1)[3, 1] ];    # nlink, inode
    binmode F1;

    my $buff  = '';
    my $nread = read(F1, $buff, $chunksize);
    return undef unless defined $nread;    # read error

    return ($nread == $chunksize || $nread == $fsize) ? $buff : undef;
}

# ------------------------------
# compare two files
# mycmp NAMEREF
#
# Compares the file currently open on the shared handle F1 (opened by the
# latest getchunk() call) byte for byte with the file whose name NAMEREF
# points to.
#
# Returns  0 when the contents are identical, or when the second file has a
#            link count > 1 and the same inode number as F1 (hard links; no
#            data is read in that case);
#         -1 when the contents differ or either file cannot be opened,
#            rewound or read.
#
# NOTE(review): the hard-link shortcut compares inode numbers only, because
# $statF1 carries no device number - confirm that inputs never span several
# file systems, or extend $statF1 in getchunk() as well.
sub mycmp ($) {
    my ($fptr) = @_;
    my $buffsize = 16 * 1024;

    # Three-argument open: the file name is never interpreted as a mode.
    open(F2, '<', $$fptr) or return -1;

    my $statF2 = [ (stat F2)[3, 1] ];    # nlink, inode

    if ($statF2->[0] > 1 && $statF1->[1] == $statF2->[1]) {
        close F2;
        return 0;
    }

    binmode F2;

    # getchunk() already consumed the first bytes of F1; rewind to the start.
    # seek() needs all three arguments (handle, position, whence).
    unless (seek(F1, 0, 0)) {
        close F2;
        return -1;
    }

    my $identical = 1;
    while (1) {
        my ($buffa, $buffb);
        my $nread1 = read(F1, $buffa, $buffsize);
        my $nread2 = read(F2, $buffb, $buffsize);

        # A failed read (undef) must count as "different", never as "equal".
        if (   !defined $nread1
            || !defined $nread2
            || $nread1 != $nread2
            || $buffa ne $buffb)
        {
            $identical = 0;
            last;
        }

        last unless $nread1;    # both files hit EOF together
    }

    close F2;
    return $identical ? 0 : -1;
}

# ------------------------------

# Phase 1: collect candidate files, grouped by size, in %farray.  Every entry
# is a reference to a file name, which is what getchunk(), mycmp() and the
# report loop below dereference.
print "collecting files and sizes ...\n";

if (-t STDIN) {
    # Interactive use: scan the directories named on the command line, or
    # the working directory when none was given.
    @ARGV = ('.') unless @ARGV;
    for my $root (@ARGV) {
        scan($root);
    }
}
else {
    # File names arrive one per line on STDIN (e.g. from "find / -xdev").
    while (my $line = <STDIN>) {
        $line =~ s/[\r\n]+\z//;    # strip the line ending only
        next if $line eq '';

        # Same policy as scan(): no symlinks and no special files.  find
        # also prints directories; they carry no content to compare.
        next if -l $line;
        next unless -f _;

        my $path = $line;          # fresh scalar per iteration for the reference
        push @{ $farray{ (-s _) || 0 } }, \$path;
    }
}

# Phase 2: only files of equal size can be duplicates.  Within one size the
# files are first bucketed by their leading chunk and then compared in full
# against the first member of every group that shares that chunk.
print "now comparing ...\n";

for my $fsize (sort { $b <=> $a } keys %farray) {    # largest files first

    my @same_size = @{ $farray{$fsize} };

    # skip files with unique file size
    next if @same_size < 2;

    my %dupes;       # group number  => [ name refs with identical content ]
    my %index;       # leading chunk => [ group numbers starting with it ]
    my $pnum = 0;    # next free group number

    CANDIDATE:
    for my $fptr (@same_size) {
        # getchunk() also leaves the candidate open on F1 for mycmp().
        my $chunk = getchunk($fsize, $fptr);

        # An unreadable file can be neither compared nor reported.
        next CANDIDATE unless defined $chunk;

        for my $group (@{ $index{$chunk} || [] }) {
            if (mycmp($dupes{$group}[0]) == 0) {
                # found duplicate, collecting
                push @{ $dupes{$group} }, $fptr;
                next CANDIDATE;
            }
        }

        # nothing found: the file starts a new group
        push @{ $dupes{$pnum} }, $fptr;
        push @{ $index{$chunk} }, $pnum++;
    }

    # show found dupes for the current size, in order of discovery
    for my $group (sort { $a <=> $b } keys %dupes) {
        my @names = @{ $dupes{$group} };
        next if @names < 2;

        print "\n size: $fsize\n\n";
        print $$_, "\n" for @names;
    }
}

close F1;
close F2;

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助，欢迎收藏、点赞、推荐、分享。

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！