利用perl基础库抓取百度博客,生成wp导入文件
发布时间:2020-12-15 21:10:04 所属栏目:大数据 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 # Author : thicket # Date : 2013/01/31 # WebSite : hi.baidu.com # 在当前文件夹生成以日期为文件名的xml文件,可以导入wordpress use LWP::Simple; use …
# Notice from the hosting page (translated): the code below was collected from
# the internet by PHP Webmaster Network (PHP站长网, 52php.cn) and is shared for
# reference only.
#
# Author  : thicket
# Date    : 2013/01/31
# WebSite : hi.baidu.com
#
# Scrapes a Baidu blog (hi.baidu.com/<name>) and writes, in the current
# directory, an XML file named after the current date that can be imported
# into WordPress.
#
# Usage: perl script.pl <blog-name>
use strict;
use warnings;

use LWP::Simple;
use HTML::Parse;
use HTML::Element;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use URI::Escape;
use POSIX;

# Base URL of the blog being exported; set by main() and used by getLink()
# as the default base for resolving relative links.
my $website;

# Run only when executed as a script, so the helper subs can be loaded
# (e.g. via require) without triggering a crawl.
main(@ARGV) unless caller;

# Entry point: crawls every index page of the blog named by the first
# argument and writes one <item> per post into ./baidu<Y-M-D>.xml.
sub main {
    my ($blog_name) = @_;

    if (!$blog_name) {
        print "=== add website ! ===\n";
        exit 1;
    }
    $website = 'http://hi.baidu.com/' . $blog_name;

    # The output file is named after today's date (year-month-day).
    my (undef, undef, undef, $mday, $mon, $year) = localtime(time());
    my $format_time = sprintf("%d-%d-%d", $year + 1900, $mon + 1, $mday);
    my $file_name   = './baidu' . $format_time . '.xml';

    my $agent = LWP::UserAgent->new;    # create the UserAgent object
    print "$website\n";

    # The first index page carries the counters needed to compute how many
    # index pages exist.
    my $first_page = _fetch($agent, "$website?page=1");
    defined $first_page
        or die "cannot fetch $website?page=1\n";
    my ($pages_totle, $pages_row) = getPageNum($first_page);
    $pages_row
        or die "cannot determine the page size from $website?page=1\n";
    my $page_count = ceil($pages_totle / $pages_row);

    open(my $out, '>', $file_name)
        or die "open file error: $file_name: $!\n";

    # The XML declaration must be the very first thing in the file, and the
    # channel is opened exactly once.
    print {$out} _rss_header(), "\n";

    for my $page (1 .. $page_count) {
        my $index_url = "$website?page=$page";
        print $index_url . "\n";

        # Page 1 was already downloaded above; fetch the others on demand.
        my $index_html = $page == 1 ? $first_page : _fetch($agent, $index_url);
        next unless defined $index_html;

        # Every post on an index page has a comment link whose href ends in
        # "#reply"; stripping that suffix yields the post URL.
        my @anchors     = ($index_html =~ m/(<a[^>]+?#reply[^>]+?>)/gi);
        my $anchor_html = join('', @anchors);

        # NOTE(review): the first alternative of this pattern was damaged in
        # the published listing (it showed an empty character class); it is
        # reconstructed here as "strip backslashes" -- confirm against a real
        # page.
        $anchor_html =~ s/\\|#reply//ig;

        for my $url (getLink($anchor_html)) {
            my $post_html = _fetch($agent, $url);
            next unless defined $post_html;

            my ($date, $title, $tag, $content) = getContent($post_html);
            print '===============================';
            print $url . "\n";
            print $date . "\n" . $title . "\n" . $tag . "\n";

            print {$out} _build_item($title, $date, $tag, $content), "\n";
        }
    }

    print {$out} "</channel></rss>\n";
    close($out)
        or die "close file error: $file_name: $!\n";
    return;
}

# Performs a GET request; returns the response body on success, or prints the
# error page and returns nothing on failure.
sub _fetch {
    my ($agent, $url) = @_;

    my $request  = HTTP::Request->new('GET', $url);    # build the request
    my $response = $agent->request($request);          # fetch the page
    return $response->content if $response->is_success;

    print $response->error_as_HTML;
    return;
}

# Returns the XML declaration plus the opening <rss>/<channel> tags of a
# WordPress export (WXR 1.2) document.
sub _rss_header {
    return join("\n",
        '<?xml version="1.0" encoding="UTF-8" ?>',
        '<rss version="2.0"',
        ' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"',
        ' xmlns:content="http://purl.org/rss/1.0/modules/content/"',
        ' xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
        ' xmlns:dc="http://purl.org/dc/elements/1.1/"',
        ' xmlns:wp="http://wordpress.org/export/1.2/"',
        '>',
        '<channel>',
    );
}

# Wraps text in a CDATA section, splitting any embedded "]]>" so that scraped
# content cannot terminate the section early.
sub _cdata {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/\]\]>/]]]]><![CDATA[>/g;
    return "<![CDATA[$text]]>";
}

# Builds one WXR <item> element for a post. $tag is a space-separated list of
# tag names; each becomes a post_tag <category>.
sub _build_item {
    my ($title, $date, $tag, $content) = @_;
    for my $field ($title, $date, $tag, $content) {
        $field = '' unless defined $field;
    }

    my @lines = (
        '<item>',
        "<title>$title</title>",
        '<link></link>',
        # pubDate and post_date_gmt are fixed placeholder values carried over
        # from the original script; only wp:post_date comes from the page.
        '<pubDate>Tue,15 Jan 2013 12:53:41 +0000</pubDate>',
        '<dc:creator>thicket</dc:creator>',
        '<guid isPermaLink="false"></guid>',
        '<description></description>',
        '<content:encoded>' . _cdata($content) . '</content:encoded>',
        '<excerpt:encoded><![CDATA[]]></excerpt:encoded>',
        '<wp:post_id></wp:post_id>',
        "<wp:post_date>$date</wp:post_date>",
        '<wp:post_date_gmt>2013-01-15 12:53:41</wp:post_date_gmt>',
        '<wp:comment_status>open</wp:comment_status>',
        '<wp:ping_status>open</wp:ping_status>',
        '<wp:post_name>' . uri_escape($title) . '</wp:post_name>',
        '<wp:status>publish</wp:status>',
        '<wp:post_parent>0</wp:post_parent>',
        '<wp:menu_order>0</wp:menu_order>',
        '<wp:post_type>post</wp:post_type>',
        '<wp:post_password></wp:post_password>',
        '<wp:is_sticky>0</wp:is_sticky>',
    );

    for my $tag_name (split(' ', $tag)) {
        push @lines,
              '<category domain="post_tag" nicename="'
            . uri_escape($tag_name) . '">'
            . _cdata($tag_name)
            . '</category>';
    }

    push @lines,
        '<wp:postmeta>',
        '<wp:meta_key>_edit_last</wp:meta_key>',
        '<wp:meta_value><![CDATA[1]]></wp:meta_value>',
        '</wp:postmeta>',
        '</item>';

    return join("\n", @lines);
}

##########################################################################################
# Get the article links.
# Takes an HTML fragment and returns the href of every <a> in it as an
# absolute URI::URL object. Links are resolved against $base when given,
# otherwise against the blog URL set by main().
sub getLink {
    my ($html, $base) = @_;
    $base = $website unless defined $base;

    my @full_url;
    return @full_url unless defined $html && length $html;

    my $parsed_html = HTML::Parse::parse_html($html);
    for my $link_info (@{ $parsed_html->extract_links("a") }) {
        my $url = URI::URL->new($link_info->[0]);
        push(@full_url, defined $base ? $url->abs($base) : $url);
    }

    # HTML::Element trees must be deleted explicitly to release their memory.
    $parsed_html->delete;
    return @full_url;
}

# Get the post fields from the HTML of a single post page.
# Returns ($date, $title, $tag, $content): the date with markup removed, the
# title, the tag names joined by single spaces, and the raw body HTML. Fields
# that are not found come back as empty strings.
# The body capture stops at the first </div>, so a body that itself contains
# nested <div> elements is truncated there.
sub getContent {
    my ($html) = @_;
    $html = '' unless defined $html;

    # /s lets "." cross newlines: post bodies normally span several lines.
    my @date = ($html =~ m{<div[^>]+class=content-other-info>\s*(.+?)\s*</div>}is);
    my $date = join('', @date);
    $date =~ s/<[^>]*>//g;

    my @title = ($html =~ m{<h2 class="title content-title">(.+?)</h2>}is);
    my $title = join('', @title);

    my @content = ($html =~ m{<div id=content[^>]+>(.+?)</div>}is);
    my $content = join('', @content);

    my @tag = ($html =~ m{<a class="tag"[^>]+>#(.+?)</a>}gis);
    my $tag = join(' ', @tag);
    $tag =~ s/<[^>]*>//g;

    return ($date, $title, $tag, $content);
}

# Get the page counters from the HTML of an index page.
# Returns ($pages_totle, $pages_row): the first integer following "allCount"
# and the first integer following "pageSize". Either value is 0 when the
# marker is missing.
sub getPageNum {
    my ($html) = @_;
    $html = '' unless defined $html;

    my ($pages_totle) = ($html =~ m/allCount\D*(\d+)/i);
    my ($pages_row)   = ($html =~ m/pageSize\D*(\d+)/i);

    $pages_totle = 0 unless defined $pages_totle;
    $pages_row   = 0 unless defined $pages_row;
    return ($pages_totle, $pages_row);
}

# Notice from the hosting page (translated): the content above was collected
# and organised by PHP Webmaster Network (52php.cn) for reference and study.
# If it helped you, feel free to bookmark, like, recommend and share it.
# (Editor: Li Datong)
# Disclaimer (translated): all content on the site comes from the internet;
# the opinions expressed are those of the authors only and do not represent
# the site's position. If your rights have been infringed unintentionally,
# please contact the site administrator promptly to have the content removed.