Perl爬虫代码

发布时间：2020-12-16 00:02:18 所属栏目：大数据来源：网络整理
导读：目前在做Perl页面爬虫的模块，发现一些代码，做个详细的分析，把好的引用一下给自己用用。 1 # !/usr/bin/perl -w 2 3 use strict; 4 5 use HTTP:: Request; 6 use HTTP:: Status; 7 use HTML:: LinkExtor; 8 use URI:: URL; 9 use LWP:: UserAgent; 10 # us
目前在做Perl页面爬虫的模块，发现一些代码，做个详细的分析，把好的引用一下给自己用用。
 1 #!/usr/bin/perl -w
 2 
 3 use strict;  4  
 5 use HTTP::Request;  6 use HTTP::Status;  7 use HTML::LinkExtor;  8 use URI::URL;  9 use LWP::UserAgent;  10 #use Digest::MD5 qw(md5_hex); 
 11  
 12  
 13 use Compress::Zlib;  14  
 15 ####################################################################  16 # Parameters Setting 
 17 our $StartUrl = "http://xxx";  18 our $bRestrict = 1;  19 our @restrictSite = ('cxxx','context:');  20 our $bContinueBefore = 1;  21  
 22  
 23 #################################################################### 
 24  
 25  
 26 print __FILE__,"n";  27  
 28 our %img_seen = ();  29 our %url_seen = ();  30 our @url_queue = ();  31 our %url_processed = ();  32  
 33 our %RobotDisallow = ();  34 our %RobotAllow = ();  35 our %site_seen = ();  36  
 37  
 38 if($bContinueBefore){  39     &LoadBefore();  40 }else{  41     $url_seen{$StartUrl} = 1;  42     push @url_queue,$StartUrl;  43 }  44  
 45 our $pageNum = 0;  46 our $BucketNum = 0;  47  
 48 &OpenOutFile();  49  
 50 open(URLHASH,">>urlhash.txt") or die;  51 open(URLPROCESSED,">>urlprocessed.txt") or die;  52 open(URLREDIRECT,">>urlredirect.txt") or die;  53 open(PAGELIST,">>pagelist.txt") or die;  54 open(IMGLIST,">>imglist.txt") or die;  55  
 56  
 57 $| = 1,select $_ for select URLHASH;  58 $| = 1,select $_ for select URLPROCESSED;  59 $| = 1,select $_ for select URLREDIRECT;  60 $| = 1,select $_ for select PAGELIST;  61 $| = 1,select $_ for select IMGLIST;  62  
 63 our $urlhash_log = *URLHASH;  64 our $urlprocessed_log = *URLPROCESSED;  65 our $urlredirect_log = *URLREDIRECT;  66 our $pagelist_log = *PAGELIST;  67 our $imglist_log = *IMGLIST;  68  
 69  
 70 our $UA =  new LWP::UserAgent(keep_alive =>  1, 71                               timeout    =>  60, 72  );  73 $UA->agent('Mozilla/5.0');  74 $UA->proxy(['ftp','http','wais','gopher'],'http://jpproxy:80/');  75  
 76 our $linkExtor = new HTML::LinkExtor(&;linkCallback);  77 our @tmpLinks = ();  78 our @tmpImgs = ();  79  
 80 my $url;  81 while ( $url = &next_url() )  82 {  83     print $urlprocessed_log $url,"n";  84      
 85     #sleep(1000); 
 86      
 87     my $response = &get_url( $url );  88      
 89     if(!defined $response){  90         next;  91  }  92      
 93     my $base = $response->base;  94     $base = $base->as_string;  95     #$base =~ tr/A-Z/a-z/; 
 96      
 97     if ( $base ne $url )  98  {  99         if(!&ValidUrl($base)){ 100             next; 101  } 102          
103         print $urlredirect_log $url,"t",$base,"n"; 104          
105         $url_seen{$base} ++; 106         print $urlhash_log $base,"n"; 107          
108         if(exists($url_processed{$base})){ 109                next; 110  } 111  } 112      
113     my $contents = $response->content; 114  
115     #my $digest = md5_hex($base); 
116      
117     &SavePage($base,$contents); 118     print $pagelist_log $base,"n"; 119     $url_processed{$base} ++; 120          
121      
122     @tmpLinks = (); 123     @tmpImgs = (); 124     $linkExtor->parse($contents); 125      
126     foreach (@tmpLinks){ 127         $_ = URI::URL->new($_,$base)->abs->as_string; 128         #$_ =~ tr/A-Z/a-z/; 
129  } 130      
131     foreach (@tmpImgs){ 132         $_ = URI::URL->new($_,$base)->abs->as_string; 133         #$_ =~ tr/A-Z/a-z/; 
134  } 135      
136     #@tmpLinks = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpLinks; 137  #@tmpImgs = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpImgs; 
138      
139     &RecordLinks(); 140     &RecordImgs(); 141      
142 } 143  
144  
145  
146 sub next_url 147 { 148  
149     # We return 'undef' to signify no URLs on the list 
150     if (@url_queue == 0 ) 151  { 152         return undef; 153  } 154      
155     return shift @url_queue; 156 } 157  
158 sub get_url 159 { 160     my $url   = shift; 161  
162     my $request = new HTTP::Request( 'HEAD',$url ); 163     return undef unless $request; 164  
165     my $response = $UA->request( $request ); 166     return undef unless defined $response; 167     return undef unless $response->is_success; 168  
169     my $content_type = $response->content_type(); 170     return undef unless defined $content_type; 171  
172     return undef if 'text/html' ne $content_type; 173      
174     $request = new HTTP::Request( 'GET',$url ); 175     return undef unless $request; 176  
177     $response = $UA->request( $request ); 178     return undef unless defined $response; 179     return undef unless $response->is_success; 180       
181     return $response; 182 } 183  
184 sub linkCallback 185 { 186     my($tag,%attr) = @_; 187     if($tag eq 'a' || $tag eq 'frame' || $tag eq 'area'){ 188         push(@tmpLinks,values %attr); 189         return; 190  } 191     if($tag eq 'img'){ 192         push(@tmpImgs,values %attr); 193         return; 194  } 195     return; 196 } 197  
198 sub RecordLinks 199 { 200     foreach (@tmpLinks){ 201         if(//.+.(w{1,4})$/){ 202             if($1 =~ /(html|htm|asp|php|jsp)/i){ 203  
204             }elsif($1 =~ /(jpg|jpeg|bmp|png|gif)/i){ 205                 if(/^http/i){ 206                      
207                     if(exists($img_seen{$_})){ 208                         next; 209  } 210                      
211                     $img_seen{$_} = 1; 212                     print $imglist_log $_,"n"; 213                      
214  } 215                 next; 216  
217             }else{ 218                 next; 219  } 220  } 221              
222         #if(/.(gif|jpg|jpeg|png|xbm|au|wav|mpg|pdf|ps|mp3|mp2|rm|zip|rar|gz|zip)$/i){ 223  # next; 224  #} 
225  
226         if(/^http/i){ 227              
228             if(!&ValidUrl($_)){ 229                 next; 230  } 231              
232             s/#.*//; 
233              
234             if(exists($url_seen{$_})){ 235                 next; 236  } 237              
238             $url_seen{$_} = 1; 239             push @url_queue,$_; 240             print $urlhash_log $_,"n"; 241  } 242  } 243 } 244  
245 sub RecordImgs 246 { 247     foreach (@tmpImgs){ 248         if(/^http/i){ 249             if(!&ValidImage($_)){ 250                 next; 251  } 252              
253             if(exists($img_seen{$_})){ 254                 next; 255  } 256              
257             $img_seen{$_} = 1; 258             print $imglist_log $_,"n"; 259      
260  } 261  } 262 } 263  
264  
265 sub LoadBefore 266 { 267     open(FILE,"urlprocessed.txt") or die; 268     while(<FILE>){ 269         chomp; 270         $url_processed{$_}++; 271  } 272      
273     open(FILE,"pagelist.txt") or die; 274     while(<FILE>){ 275         if(/(S+)s/){ 276             $url_processed{$1}++; 277  } 278  } 279  
280     open(FILE,"urlhash.txt") or die; 281     while(<FILE>){ 282         chomp; 283         $url_seen{$_}++; 284         if(!exists($url_processed{$_})){ 285             push @url_queue,$_; 286  } 287  } 288  
289     open(FILE,"imglist.txt") or die; 290     while(<FILE>){ 291         chomp; 292         $img_seen{$_}++; 293  } 294      
295 } 296  
297  
298 sub ValidUrl 299 { 300     my($url) = shift; 301     if($bRestrict){ 302         foreach (@restrictSite){ 303             if($url =~ /$_/){ 304                 return 1; 305  } 306  } 307         return 0; 308     }else{ 309         return 1; 310  } 311 } 312  
313 sub ValidImage 314 { 315     my($url) = shift; 316     if($url =~ /#/){ 
317         return 0; 318  } 319          
320     if(/spacer.gif/){ 321         return 0; 322  } 323  
324     return 1; 325 } 326  
327  
328 sub get_robotstxt 329 { 330     my $url   = shift; 331     $url .= "/robots.txt"; 332      
333     my $request = new HTTP::Request( 'HEAD',$url ); 334     return undef unless $request; 335      
336     my $response = $UA->request( $request ); 337     return undef unless defined $response; 338     return undef unless $response->is_success; 339      
340     my $content_type = $response->content_type(); 341     return undef unless defined $content_type; 342      
343     return undef if 'text/plain' ne $content_type; 344      
345     $request = new HTTP::Request( 'GET',$url ); 346     return undef unless $request; 347      
348     $response = $UA->request( $request ); 349     return undef unless defined $response; 350     return undef unless $response->is_success; 351       
352     return $response; 353 } 354  
355 sub OpenOutFile 356 { 357     $BucketNum ++; 358     my $fname = sprintf("PageBucket.%05d",$BucketNum); 359     open(PAGEBUCKET,">>$fname") or die; 360     binmode(PAGEBUCKET); 361     $| = 1,select $_ for select PAGEBUCKET; 362 } 363  
364 sub SavePage 365 { 366     my($urlR,$contR) = @_; 367     my $data = compress($$contR); 368     my $len = pack('I',length($$urlR)); 369     print PAGEBUCKET $len; 370     print PAGEBUCKET $$urlR; 371     $len = pack('I',length($data)); 372     print PAGEBUCKET $len; 373     print PAGEBUCKET $data; 374  
375     $pageNum++; 376     if($pageNum % 1000 == 0){ 377         print "$pageNum pages have been crawled!n"; 378  } 379     if($pageNum % 100000 == 0){ 380         &OpenOutFile; 381  } 382 }
点开查看代码
（编辑：李大同）
【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!