Learn Web.Crawling of Perl
发布时间:2020-12-13 20:15:33 所属栏目:PHP教程 来源:网络整理
导读:######Overview of Web.Crawling related modules.#Note that,below codes can not be executed just for overview intention.######!/usr/bin/perl######HTTP::Thin#####use 5.12.1;use HTTP::Request::Common;use HTTP::Thin;say HTTP::Thin->new()->request
#####
#Overview of Web.Crawling related modules.
#Note: the code below is for overview purposes only and is not meant to be executed as-is.
#####
#!/usr/bin/perl
#####
#HTTP::Thin
#####
use 5.12.1;
use HTTP::Request::Common;
use HTTP::Thin;

# Build the client, send a GET request built by HTTP::Request::Common,
# and print the response rendered as a string.
my $thin_client   = HTTP::Thin->new();
my $thin_response = $thin_client->request(GET 'http://example.com');
say $thin_response->as_string;
#####
#HTTP::Tiny
#####
use HTTP::Tiny;

# Fetch a page; the response is a hash reference with the keys used below
# (success, status, reason, headers, content).
my $response = HTTP::Tiny->new->get('http://example.com/');
die "Failed!\n" unless $response->{success};

# Status line, e.g. the numeric status followed by the reason phrase.
print "$response->{status} $response->{reason}\n";

# Each header value is either a plain scalar or an array reference of
# values; normalise both cases to a list and print one line per value.
while (my ($k, $v) = each %{ $response->{headers} }) {
    for (ref $v eq 'ARRAY' ? @$v : $v) {
        print "$k: $_\n";
    }
}

# Only print the body when there is one.
print $response->{content} if length $response->{content};
#new
# NOTE: $url, $method, $form_data, $data, $file_path, %attributes and %options
# are placeholders in this overview; the reader is expected to supply them.
my $http = HTTP::Tiny->new( %attributes );
#valid attributes include:
#-agent
#-cookie_jar
#-default_headers
#-local_address
#-keep_alive
#-max_redirect
#-max_size
#-https_proxy
#-proxy
#-no_proxy
#-timeout
#-verify_SSL
#-SSL_options
#get|head|put|post|delete
$response = $http->get($url);
$response = $http->get($url, %options);
$response = $http->head($url);
#post_form
$response = $http->post_form($url, $form_data);
$response = $http->post_form($url, $form_data, %options);
#request
$response = $http->request($method, $url);
$response = $http->request($method, $url, %options);
# Credentials may be embedded in the URL as "user:password@host".
$http->request('GET','http://user:pwd hk.mars@aol.com');
#or
# Here the "@" inside the user name is percent-escaped as %40.
$http->request('GET','http://mars%40:pwd hk.mars@aol.com');
#www_form_urlencode
my $params = $http->www_form_urlencode( $data );
$response = $http->get("http://example.com/query?$params");
#SSL support
# SSL_options is a constructor attribute (see the attribute list above),
# so it is passed to new().
$http = HTTP::Tiny->new(
    SSL_options => {
        SSL_ca_file => $file_path,
    },
);
#proxy support
#####
#WWW::Mechanize
#
#Stateful programmatic web browsing,used for automating interaction with websites.
#####
# NOTE: $url, $expected, $number, $name, $value, $field_values, @criteria and
# $button are placeholders in this overview; the reader is expected to supply them.
use WWW::Mechanize;
my $mech = WWW::Mechanize->new();
$mech->get( $url );

# Follow a link selected by position, by link text, or by exact URL.
$mech->follow_link( n => 3 );
$mech->follow_link( text_regex => qr/download this/i );
$mech->follow_link( url => 'http://host.com/index.html' );

# Fill in and submit a form selected by its number on the page.
$mech->submit_form(
    form_number => 3,
    fields      => {
        username => 'banana',
        password => 'lost-and-alone',
    },
);

# Fill in and submit a form selected by name, using a specific button.
$mech->submit_form(
    form_name => 'search',
    fields    => { query => 'pot of gold', },
    button    => 'search now',
);

#testing web applications
use Test::More;
like( $mech->content(), qr/$expected/, "Got expected content" );

#page traverse
$mech->back();

#finer control over page
$mech->find_link( n => $number );
$mech->form_number( $number );
$mech->form_name( $name );
$mech->field( $name, $value );
$mech->set_fields( $field_values );
$mech->set_visible( @criteria );
$mech->click( $button );

#subclass of LWP::UserAgent,eg:
$mech->add_header( $name => $value );

#page-fetching methods
#status methods
#content-handling methods
#link methods
#image methods
#form methods
#field methods
#miscellaneous methods
#overridden LWP::UserAgent methods
#inherited unchanged LWP::UserAgent methods
#With these pieces in place, it is easy to implement a spider project for future integration use.
Mars(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |