利用perl基础库抓取百度博客，生成wp导入文件

发布时间：2020-12-15 21:10:04　所属栏目：大数据　来源：网络整理

导读：今天PHP站长网 52php.cn把收集自互联网的代码分享给大家，仅供参考。该脚本（作者：thicket，日期：2013/01/31，目标站点：hi.baidu.com）利用 LWP::Simple 等 perl 基础库抓取百度博客，在当前文件夹生成以日期为文件名的xml文件，可以导入wordpress。

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家，仅供参考

# Author : thicket
# Date : 2013/01/31
# WebSite : hi.baidu.com
# 在当前文件夹生成以日期为文件名的xml文件，可以导入wordpress

use LWP::Simple;
use HTML::Parse;
use HTML::Element;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use URI::Escape;
use POSIX;


# Main script: crawl every list page of a hi.baidu.com blog, fetch each
# article it links to, and write the posts to ./baidu<date>.xml in the
# WordPress export (WXR) format.
#
# Usage: perl <script> <blog-name>
#
# The bare block keeps strict/warnings scoped to this section only.
{
    use strict;
    use warnings;

    # Package variable rather than a lexical: getLink() reads $website as
    # the base URL when it resolves relative links.
    our $website = $ARGV[0];

    if (!$website) {
        print "=== add website ! ===\n";
        exit;
    }

    $website = 'http://hi.baidu.com/' . $website;

    # The output file is named after the current date (year-month-day).
    my (undef, undef, undef, $mday, $mon, $year) = localtime(time());
    my $format_time = sprintf("%d-%d-%d", $year + 1900, $mon + 1, $mday);
    my $file_name   = './baidu' . $format_time . '.xml';

    my $out;
    if (!open($out, '>', $file_name)) {
        print "open file error: $!\n";
        exit;
    }

    my $ua = LWP::UserAgent->new;

    # The blog URL goes into the channel <link>, so escape XML metacharacters.
    my $channel_link = $website;
    $channel_link =~ s/&/&amp;/g;
    $channel_link =~ s/</&lt;/g;
    $channel_link =~ s/>/&gt;/g;

    # Channel header: written exactly once, before any <item>.
    # wp:wxr_version is included because the WordPress importer checks for it.
    print {$out} '<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
    xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
    xmlns:content="http://purl.org/rss/1.0/modules/content/"
    xmlns:wfw="http://wellformedweb.org/CommentAPI/"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:wp="http://wordpress.org/export/1.2/"
>
<channel>
    <link>' . $channel_link . '</link>
    <wp:wxr_version>1.2</wp:wxr_version>
';

    # The first list page tells us how many posts exist and how many are
    # shown per page; from that we derive the number of list pages.
    my $first_page = get("$website?page=1");
    my ($pages_totle, $pages_row) =
        defined $first_page ? getPageNum($first_page) : (0, 0);
    my $len = $pages_row ? ceil($pages_totle / $pages_row) : 0;
    if (!$len) {
        print "no pages found for $website\n";
    }

    my %seen;    # article URLs already exported, to avoid duplicate posts

    for my $count (1 .. $len) {
        my $url_ind = "$website?page=$count";
        print $url_ind . "\n";

        my $response = $ua->request(HTTP::Request->new('GET', $url_ind));
        if (!$response->is_success) {
            print $response->error_as_HTML;
            next;
        }
        my $page = $response->content;

        # Article links are the anchors whose href carries "#reply".
        # Strip backslashes (presumably the markup is embedded in escaped
        # form -- TODO confirm) and the "#reply" fragment to get plain URLs.
        my @words = ($page =~ m/(<a[^>]+?#reply[^>]+?>)/gi);
        my $words = join('', @words);
        $words =~ s/\\|#reply//ig;

        for my $url (getLink($words)) {
            next if $seen{$url}++;

            my $article = $ua->request(HTTP::Request->new('GET', $url));
            if (!$article->is_success) {
                print $article->error_as_HTML;
                next;
            }

            my ($date, $title, $tag, $content) = getContent($article->content);
            for ($date, $title, $tag, $content) {
                $_ = '' unless defined $_;
            }

            print '===============================';
            print $url . "\n";
            print $date . "\n" . $title . "\n" . $tag . "\n";
            my @tag = split(' ', $tag);

            # A literal "]]>" inside the body would terminate the CDATA
            # section early; split it across two sections instead.
            $content =~ s/\]\]>/]]]]><![CDATA[>/g;

            # NOTE: pubDate and wp:post_date_gmt are fixed placeholder
            # values; only wp:post_date carries the scraped date.
            # NOTE(review): $title is inserted as scraped (inner HTML of the
            # heading); verify it never contains markup that breaks the XML.
            my $rss = qq{<item>
    <title>$title</title>
    <link></link>
    <pubDate>Tue,15 Jan 2013 12:53:41 +0000</pubDate>
    <dc:creator>thicket</dc:creator>
    <guid isPermaLink="false"></guid>
    <description></description>
    <content:encoded><![CDATA[$content]]></content:encoded>
    <excerpt:encoded><![CDATA[]]></excerpt:encoded>
    <wp:post_id></wp:post_id>
    <wp:post_date>$date</wp:post_date>
    <wp:post_date_gmt>2013-01-15 12:53:41</wp:post_date_gmt>
    <wp:comment_status>open</wp:comment_status>
    <wp:ping_status>open</wp:ping_status>
    <wp:post_name>} . uri_escape($title) . qq{</wp:post_name>
    <wp:status>publish</wp:status>
    <wp:post_parent>0</wp:post_parent>
    <wp:menu_order>0</wp:menu_order>
    <wp:post_type>post</wp:post_type>
    <wp:post_password></wp:post_password>
    <wp:is_sticky>0</wp:is_sticky>};

            for my $name (@tag) {
                $rss .= qq{
        <category domain="post_tag" nicename="}
                    . uri_escape($name)
                    . qq{"><![CDATA[$name]]></category>};
            }

            $rss .= qq{
    <wp:postmeta>
        <wp:meta_key>_edit_last</wp:meta_key>
        <wp:meta_value><![CDATA[1]]></wp:meta_value>
    </wp:postmeta>
</item>};

            print {$out} "$rss\n";
        }
    }

    # Channel footer: written exactly once, after the last <item>.
    print {$out} "</channel></rss>\n";

    # Buffered write errors only surface at close time.
    close($out) or print "close file error: $!\n";
}



##########################################################################################

# Extract article links.
#
# Parses the given HTML fragment, collects the target of every <a> element
# and resolves it against the blog base URL held in the package variable
# $website.
#
# Parameters: $_[0] - HTML text containing the anchors.
# Returns:    list of absolute URL objects, in document order
#             (empty list for empty or undefined input).
sub getLink {
    my ($html) = @_;
    our $website;    # base URL, set by the main script

    return () unless defined $html && length $html;

    my @full_url;
    my $tree = HTML::Parse::parse_html($html);
    for my $entry (@{ $tree->extract_links("a") }) {
        my $link = $entry->[0];    # first slot of each entry is the link text
        push(@full_url, URI::URL->new($link)->abs($website));
    }

    # Release the parse tree explicitly once the links are extracted.
    $tree->delete;

    return @full_url;
}


# Extract the interesting parts of one article page.
#
# Parameters: $_[0] - HTML source of the article page.
# Returns:    ($date, $title, $tag, $content)
#               $date    - text of the "content-other-info" block, tags removed
#               $title   - inner HTML of the title heading
#               $tag     - tag names (without the leading '#'), space separated
#               $content - inner HTML of the "content" div up to the first
#                          closing </div>
#             Any part that is not found comes back as an empty string.
sub getContent {
    my ($html) = @_;
    $html = '' unless defined $html;

    # /s lets '.' cross line breaks so multi-line blocks still match.
    my ($date) = $html =~ m{<div[^>]+class=content-other-info>\s*(.+?)\s*</div>}is;
    $date = '' unless defined $date;
    $date =~ s/<[^>]*>//g;

    my ($title) = $html =~ m{<h2 class="title content-title">(.+?)</h2>}is;
    $title = '' unless defined $title;

    my ($content) = $html =~ m{<div id=content[^>]+>(.+?)</div>}is;
    $content = '' unless defined $content;

    my @tag = ($html =~ m{<a class="tag"[^>]+>#(.+?)</a>}gis);
    my $tag = join(' ', @tag);
    $tag =~ s/<[^>]*>//g;

    # Order must match the caller: ($date, $title, $tag, $content).
    return ($date, $title, $tag, $content);
}

# Read the paging information from a blog list page.
#
# Looks for the first number that follows "allCount" and the first number
# that follows "pageSize" on the same line (case-insensitive).
#
# Parameters: $_[0] - HTML source of a list page.
# Returns:    ($pages_totle, $pages_row) - total number of posts and number
#             of posts per page; 0 for a value that is not found.
sub getPageNum {
    my ($html) = @_;
    $html = '' unless defined $html;

    # [^\d\n]* skips quotes, colons and spaces between the key and its
    # number without running on to another line.
    my ($pages_totle) = $html =~ m/allCount[^\d\n]*(\d+)/i;
    my ($pages_row)   = $html =~ m/pageSize[^\d\n]*(\d+)/i;

    $pages_totle = 0 unless defined $pages_totle;
    $pages_row   = 0 unless defined $pages_row;

    return ($pages_totle, $pages_row);
}

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助，欢迎收藏、点赞、推荐、分享。

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！