Perl爬虫研究
发布时间:2020-12-16 00:02:29 所属栏目:大数据 来源:网络整理
导读:这几天忙着做项目和一些W3A的测试,没啥时间研究别的. 今天趁着快放假,也给自己放放假吧.看了下云总写的Perl爬虫,发现有多处不懂. 但是部分地方算是理解了,看来目标还是很遥远的. 给代码加了下注释,不过太累了,准备睡觉了.写了部分,改天补全. 凑合着看吧.
#!/usr/bin/perl
# Multi-threaded breadth-first web crawler (annotated study copy).
# Crawls a single host, de-duplicating URLs with a Bloom filter.
# Workers are plain threads throttled by a counting semaphore.
use strict;
use warnings;
use threads;
use threads::shared;
use Thread::Queue;
use Thread::Semaphore;
use Bloom::Filter;
use URI::URL;
use Web::Scraper;

# Maximum number of concurrently running worker threads.
my $max_threads = 15;

# Crawl target; defaults to the author's site when no argument is given.
my $base_url = $ARGV[0] || 'http://www.icylife.net';

# Restrict the crawl to the target's host.
my $host = URI::URL->new($base_url)->host;

# Work queue of URLs still to be fetched.
my $queue = Thread::Queue->new();

# Counting semaphore capping the number of simultaneous workers.
my $semaphore = Thread::Semaphore->new($max_threads);

# Binary semaphore used as a mutex around the Bloom filter + queue updates.
my $mutex = Thread::Semaphore->new(1);

# Bloom filter recording every URL already seen (probabilistic dedup).
# shared_clone makes it visible to all threads.
my $filter = shared_clone(
    Bloom::Filter->new( capacity => 1000, error_rate => 0.0001 )
);

# Seed the crawl with the base URL.
$queue->enqueue($base_url);
$filter->add($base_url);

while (1) {
    # Reap any worker threads that have finished.
    foreach ( threads->list(threads::joinable) ) {
        $_->join();
    }

    # If there are no URLs waiting, decide whether we are done or just idle.
    my $pending = $queue->pending();
    if ( $pending == 0 ) {
        my $active = threads->list(threads::running);
        # No queued work and no running workers: the crawl is complete.
        if ( $active == 0 ) {
            print "All done!\n";    # fixed: original lost the backslash ("All done!n")
            last;
        }
        # Running workers may still enqueue new URLs; wait for them.
        else {
            sleep 1;
            next;
        }
    }

    # There is work to do: start another worker.
    # down() blocks once $max_threads workers are already running.
    $semaphore->down();
    # fixed: original had the garbled "&;ProcessUrl"; a code ref is required.
    threads->create( \&ProcessUrl );
}

# Join any remaining threads before exiting.
foreach ( threads->list() ) {
    $_->join();
}

# Worker thread body: drain the queue, scrape each page for <a href>
# links, and enqueue unseen same-host HTTP(S) links.
sub ProcessUrl {
    # Extract the @href of every anchor on the page into 'links'.
    my $scraper = scraper {
        process '//a', 'links[]' => '@href';
    };
    my $res;
    my $link;

    # dequeue_nb returns undef when the queue is empty, ending the worker.
    while ( my $url = $queue->dequeue_nb() ) {
        # Web::Scraper dies on fetch/parse failure; trap it and skip the URL.
        eval { $res = $scraper->scrape( URI->new($url) )->{'links'}; };
        if ($@) {
            warn "$@\n";    # fixed: original lost the backslash ("$@n")
            next;
        }
        # Pages with no anchors yield no 'links' key.
        next if ( !defined $res );

        foreach ( @{$res} ) {
            $link = $_->as_string;
            # Resolve relative links against the page they came from.
            $link = URI::URL->new( $link, $url );

            # Only follow http/https links.
            next if ( $link->scheme ne 'http' && $link->scheme ne 'https' );
            # Stay on the original host.
            next if ( $link->host ne $host );

            $link = $link->abs->as_string;
            # Strip any #fragment so variants of one page dedup together.
            if ( $link =~ /(.*?)#(.*)/ ) {
                $link = $1;
            }
            # Skip static/binary resources.
            # fixed: original regex had an unescaped dot (/.(jpg|...)$/),
            # which matched ANY character before the extension.
            next if ( $link =~ /\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|doc|js|css|docx|xls|xlsx)$/i );

            # Serialize filter/queue access across workers.
            $mutex->down();
            if ( !$filter->check($link) ) {
                # key_count() doubles as a running total of discovered URLs.
                print $filter->key_count(), " ", $link, "\n";
                $filter->add($link);
                $queue->enqueue($link);
            }
            $mutex->up();
            undef $link;
        }
        undef $res;
    }
    undef $scraper;
    # Release this worker's slot so the main loop can spawn another thread.
    $semaphore->up();
}