perl获取AppAnnie数据
。。。。。。需求: 本blog主要用来记录coding中遇到的一些问题,相关知识点如下:
目录用
before start
简单开始
use strict;
use warnings;
use LWP::Simple qw(get);
# Download the App Annie "top iPhone games, United States" chart page.
my $html = get( "https://www.appannie.com/apps/ios/top/united-states/games/?device=iphone" );
# LWP::Simple::get returns undef on failure; stop here instead of matching against undef.
die "failed to fetch the App Annie chart page\n" unless defined $html;
print $html;

# Declared outside the loop because later statements read them after the loop;
# once the loop ends they hold the values of the last match.
my ( $app_name, $app_url );
# m{...} is used as the delimiter because the pattern itself contains "/".
# [^"]* keeps each capture inside its own attribute value.
while ( $html =~ m{span title="([^"]*)" class="oneline-info title-info">\s*<a href="/([^"]*)">}g ) {
    $app_name = $1;    # $1 is the first capture group: the title attribute
    $app_url  = $2;    # $2 is the second capture group: the link path without its leading "/"
}
html转义字符处理
# Decode the HTML entities left in the scraped title.
# NOTE(review): the entity names were lost when this snippet was published;
# apostrophe is assumed to arrive as &#39; or &apos; - confirm against the page source.
$app_name =~ s/&(?:#39|apos);/'/g;
# "&amp;" is decoded last so that text such as "&amp;#39;" is not decoded twice.
$app_name =~ s/&amp;/&/g;
处理登陆
# A UserAgent is used to emulate a browser, so cookies persist across requests.
my $ua = LWP::UserAgent->new();
# Cookie jar that saves itself to disk automatically.
my $cookie_jar = HTTP::Cookies->new(
    file     => "testcookies.txt",
    autosave => 1,
);
$ua->cookie_jar( $cookie_jar );
$ua->agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/44.0.2403.155 Safari/537.36");

# Fetch the login page first.
my $res = $ua->get("https://www.appannie.com/account/login/", Referer => "https://www.appannie.com/");

# This is the key point: log in with Firefox or Chrome and watch what is sent.
# Besides the user name and password a token must be sent as well, otherwise
# the login never succeeds. The token is taken from the Set-Cookie response
# header with a regex and POSTed together with the credentials.
my $c = $res->header('set-cookie');
my ($token) = defined $c ? $c =~ m{csrftoken=(\w+);} : ();
# Without the token the POST below cannot succeed, so fail loudly here.
die "csrftoken cookie not found in the login page response\n" unless defined $token;

$res = $ua->post("https://www.appannie.com/account/login/", Content => [
    csrfmiddlewaretoken => $token,
    next                => "/",
    # Replace these with your own user name and password. Inside double quotes
    # some characters, such as @, are special and have to be escaped.
    username            => $cfg{username},
    password            => $cfg{password},
]);
print $res->content;

# Request the app's detail page only after $ua exists and the login was sent.
my $innerHtml = $ua->get("https://www.appannie.com/".$app_url);
# The "?" makes each group lazy, so it matches as little as possible.
# m{...} is used as the delimiter because the pattern contains "/".
my($size,$empty,$language) = ($innerHtml->content =~ m{Size:</b>(.*?)</p><p><b>(.*?)</b>(.*?)</p>});
获取chart数据
# Request the rank-chart data with AJAX-style request headers.
# One request is enough: the result of the earlier header-less GET was
# overwritten straight away. $res is reused rather than re-declared, because
# the login code already declares it in this scope.
$res = $ua->get(
    "https://www.appannie.com/".$tempUrl."rank-chart",
    Accept             => "application/json,text/plain,*/*",    # response types we accept
    'X-Requested-With' => "XMLHttpRequest",
    'X-NewRelic-ID'    => "VwcPUFJXGwEBUlJSDgc=",
);
json utf8 unicode
# Raw JSON text of the rank-chart response.
my $chart = $res->content;
# The \uXXXX escapes are deliberately left alone: the JSON decoder expands them
# itself. Rewriting them by hand before decoding could produce invalid JSON
# (for example when an escape encodes a double quote), and a pattern without
# the leading backslash would also rewrite ordinary text such as "udead".
my $json = JSON->new;
# Parse the JSON document.
my $obj = $json->decode($chart);
# Concatenate the category name of every entry in "data"; a missing "data"
# list yields an empty string.
my $type = join '', map { $_->{category}->{name} } @{ $obj->{data} || [] };
#下面贴一下json数据
'{"meta": {"end": "2015-08-22","vertical": "apps","countries": "US","f": null,"app_id": "648668184","start": "2015-07-24","market": "ios"},"data": [{"category": {"name": "\u52a8\u4f5c\u6e38\u620f (\u6e38\u620f)","id": 7001},"country": {"code": "US","name": "\u7f8e\u56fd"},"ranks": [0,0,91,3,1,0]},{"category": {"name": "\u6240\u6709\u7c7b\u522b","id": 36},"ranks":
[0,984,10,{"category": {"name": "\u8d5b\u8f66\u6e38\u620f (\u6e38\u620f)","id": 7013},"country":
{"code": "US",27,{"category": {"name": "\u6e38\u620f","id": 6014},"country":
{"code": "US",301,0]}],"events": ["","","Initial release",""],"success": true}';
Excel
# writeExcel()
#
# Writes every element of the file-level @data array as one spreadsheet row.
# Each element is split on ",," and the pieces fill consecutive columns.
# Takes no arguments; reads $cfg{outfilename}, $cfg{firstsheetname} and @data.
# Dies if the workbook file cannot be created. Returns nothing.
sub writeExcel {
    # Open, write and close in one go; do not keep the workbook open for long.
    my $workbook = Spreadsheet::WriteExcel->new( $cfg{outfilename} )
        or die "cannot create workbook '$cfg{outfilename}': $!\n";
    my $worksheet = $workbook->add_worksheet( $cfg{firstsheetname} );

    # @data is a global array.
    print Dumper @data;
    for my $row ( 0 .. $#data ) {
        my @values = split( /,,/, $data[$row] );
        for my $col ( 0 .. $#values ) {
            # The top-left cell of the sheet is (0, 0).
            $worksheet->write( $row, $col, $values[$col] );
        }
    }

    # Close explicitly so the file is complete on disk when this sub returns,
    # instead of relying on object destruction.
    $workbook->close();
    return;
}
Threads 节约时间
#需要用到的模块
use threads;
use threads::shared;
。。。
# First argument: the name of the function to run; the rest are its arguments. Put this statement inside the loop so that every URL to be fetched gets its own thread.
threads->create('getInnerData',$app_name,$app_url,$count/3);
。。。
# Then walk over every thread and join it, waiting for each one to finish.
$_->join() for threads->list();
# ... Then, inside getInnerData, this is how the end of the threads is detected:
# the spreadsheet is written once threads->list(threads::running) returns nothing.
# NOTE(review): when this runs inside a worker thread the caller itself is
# presumably still counted as running - confirm the condition can become true.
writeExcel() unless threads->list(threads::running);
# Rows collected for the spreadsheet, shared between all threads.
# The ":shared" attribute already shares the array, so the separate
# share(@data) call was redundant and has been dropped.
my @data :shared;
细节和总结
# Load the settings from yzj.cfg in the current directory.
# Three-arg open with a lexical handle; a missing file is fatal, because
# reading from an unopened handle would only yield an empty %cfg.
open my $cfg_fh, '<', "yzj.cfg"
    or die "配置文件丢失 yzj.cfg 必须放在一起: $!\n";
# %cfg holds the key/value pairs of yzj.cfg. Spaces around "=" are optional,
# lines without "=" are skipped, and trailing whitespace (including a stray
# carriage return) is not part of the value.
my %cfg = map { m/(\w+?)\s*=\s*(.*\S)/ } <$cfg_fh>;
close $cfg_fh;
Annie Api !!!!!!
(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容! |