Perl 关键词搜索机器人
发布时间:2020-12-16 00:02:26 所属栏目:大数据 来源:网络整理
导读:这段代码在网上找的。觉得很不错,准备弄来分析下。 看别人的代码也是一种另类的学习方法。在学习的过程当中多看别人的代码能够提升自己的理解。 特别是一些自己没有用过的模块,通过这些实例就能知道怎么去使用。 当然,你也可以自己去研究官方那些文档。
这段代码在网上找的。觉得很不错,准备弄来分析下。 看别人的代码也是一种另类的学习方法。在学习的过程当中多看别人的代码能够提升自己的理解。 特别是一些自己没有用过的模块,通过这些实例就能知道怎么去使用。 当然,你也可以自己去研究官方那些文档。但是对于我来说,我觉得最快的方法就是看别人写的代码实例。 或许每个人都有点不同吧。

#!/usr/bin/perl
# siteindexingbot.pl — crawl a site starting from the URL given on the
# command line, honoring robots.txt, and write one line per page
# (URL <TAB> title <TAB> keywords) to indexed.txt.
#
# NOTE: the scraped original had every "\n"/"\t" escape stripped ("n","t");
# they are restored here.

use warnings;
use strict;

use LWP::Simple;
use LWP::RobotUA;
use WWW::RobotRules;
use HTML::Parse;
use HTML::HeadParser;
use URI::URL;

my %scanned;                        # URLs already visited (cycle guard)
my (@pages, %titles, %keywords);    # per-page data collected by get_info()

my $url        = $ARGV[0] or die "Usage: siteindexingbot [url]\n";
my $base_url   = globalize_url('/', $url);
my $robots_txt = $base_url . '/robots.txt';

# FIX: indirect-object syntax (new WWW::RobotRules ...) replaced with ->new.
my $robot_rules = WWW::RobotRules->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)"
);

# Look for and parse the site's robots.txt file.
if (head($robots_txt)) {
    print "robots.txt file found OK.\n";
    $robot_rules->parse($robots_txt, get($robots_txt));
}
else {
    print "robots.txt file not found.\n";
}

# Build the user agent.
# FIX: the original called LWP::UserAgent->new with (agent, from, rules)
# positional arguments — that constructor signature belongs to
# LWP::RobotUA, which is the polite-crawler class this script imports.
my $ua = LWP::RobotUA->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
    'me@here.com',
    $robot_rules,
);
#$ua->proxy('http' => 'http://proxy.mylan.com/');
$ua->timeout(30);
$ua->max_size(1024 * 100);    # never download more than 100 KB per page
$ua->parse_head('TRUE');      # so X-Meta-* response headers get populated

scan($base_url);

# FIX: bareword filehandle + 2-arg open replaced with lexical 3-arg open;
# write errors are surfaced by checking close().
open my $out, '>', 'indexed.txt' or die "Opening indexed.txt: $!";
foreach my $page (@pages) {
    print {$out} join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
}
close $out or die "Closing indexed.txt: $!";
exit;

# scan($url) — recursively crawl $url: record its metadata, then follow
# every robots-allowed, site-local, live HTML link found on the page.
sub scan {
    my $url = shift;
    print "Scanning '$url':\n";
    return if $scanned{$url};    # don't visit the same page twice

    get_info($url);
    $scanned{$url} = 'TRUE';

    # FIX: loop variable is now lexical instead of the shared global $link.
    foreach my $link (get_links($url)) {
        if (!$robot_rules->allowed($link)) {
            print "Access to $link is not allowed by robots.txt\n";
            next;
        }
        # FIX: \Q...\E so metacharacters in the base URL are matched
        # literally instead of being interpreted as regex syntax.
        if ($link !~ /^\Q$base_url\E/i) {
            print "$link is not local to $base_url\n";
            next;
        }

        # HEAD first: cheap liveness + content-type check before recursing.
        my $request      = HTTP::Request->new(HEAD => $link);
        my $response     = $ua->request($request);
        my $content_type = $response->header('Content-type');

        if ($response->is_error) {
            print "Dead link to $link found on $url\n";
        }
        else {
            print "$url links to $link\n";
            # FIX: match the media type prefix — real servers send
            # "text/html; charset=...", which the original eq rejected.
            if (defined $content_type && $content_type =~ m{^text/html}i) {
                scan($link);
            }
            else {
                print "$link is not HTML\n";
            }
        }
    }
    return;
}

# globalize_url($link, $referring_url) — resolve $link relative to
# $referring_url into an absolute URL string, stripping any #fragment
# (the fragment never changes which resource is fetched).
sub globalize_url {
    my ($link, $referring_url) = @_;
    my $url_obj      = URI::URL->new($link, $referring_url);
    my $absolute_url = $url_obj->abs->as_string;
    # FIX: the original s/^(.+?)#(.+?)$/$1/ missed bare trailing '#' and
    # fragment-only URLs; drop everything from the first '#' onward.
    $absolute_url =~ s/#.*\z//s;
    return $absolute_url;
}

# get_links($url) — GET $url and return the absolute URLs of every
# <a>, <frame> and <iframe> link on the page, sorted.
sub get_links {
    my $url     = shift;
    my $request = HTTP::Request->new(GET => $url);
    $request->header(Accept => 'text/html');
    my $response = $ua->request($request);

    my $tree      = HTML::Parse::parse_html($response->content);
    my $links_ref = $tree->extract_links('a', 'frame', 'iframe');

    my @links;
    foreach my $link (sort @$links_ref) {
        push @links, globalize_url($link->[0], $url);
    }
    # FIX: HTML::Element trees are self-referential and leak unless
    # explicitly deleted.
    $tree->delete;
    return @links;
}

# get_info($url) — GET $url and record its URL, <title> and keywords in
# @pages / %titles / %keywords for the final report.
sub get_info {
    my $url     = shift;
    my $request = HTTP::Request->new(GET => $url);
    $request->header(Accept => 'text/html');
    my $response = $ua->request($request);
    my $html     = $response->content;

    my $parser = HTML::HeadParser->new;
    $parser->parse($html);

    my $title = $parser->header('Title') || 'Untitled Document';
    # NOTE(review): this reads the *description* meta header but stores it
    # as "keywords" — confirm whether X-Meta-keywords was intended.
    my $keywords = $response->header('X-Meta-description') || 'none';

    push @pages, $url;
    $titles{$url}   = $title;
    $keywords{$url} = $keywords;
    return;
}

(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |