加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

Perl程序抓取TopCoder的题目和解答

发布时间:2020-12-15 21:01:23 所属栏目:大数据 来源:网络整理
导读:TopCoder是一个非常好的学习网站,但由于服务器在国外,访问常常很慢,在线查看也不太方便。因此我用Perl写了个小爬虫,批量抓取想要的题目和解答,非常好用。注意抓取需要cookie:自己登录后,用Fiddler抓个包就可以看到cookie了。

TopCoder是一个非常好的学习网站,但由于服务器在国外,访问常常很慢;在线查看也不太方便。因此我用Perl写了个小爬虫,批量抓取想要的题目和解答,非常好用。

注意抓取是需要cookie的:自己登录后,用Fiddler抓个包就可以看到cookie了。脚本会读取 ../data/cookie.txt 的第一行,并把它作为命令行参数传给curl(例如 -b "name=value" 这样的形式),所以请先把cookie按这种格式保存到该文件中。

#!/usr/bin/perl
use strict;

# Entry point: for every SRM returned by get_SRM_url_list, create
# ../archive/SRM_<idx>/Div_1 and Div_2 and download each problem's statement
# and solutions into the matching division directory.
sub main {
	my @SRM_url_list = ();
	my @SRM_idx = ();
	# Pass references: get_SRM_url_list dereferences both arguments and
	# pushes onto them, so flattened (empty) lists would never be filled.
	get_SRM_url_list(\@SRM_url_list, \@SRM_idx);

	# $#SRM_idx is the last valid index; iterating up to scalar(@SRM_idx)
	# would process one undefined entry past the end.
	foreach my $index (0 .. $#SRM_idx) {
		my $SRM_idx = $SRM_idx[$index];
		my $SRM_url = $SRM_url_list[$index];
		foreach my $div (1 .. 2) {
			my $div_dir = "../archive/SRM_$SRM_idx/Div_$div";
			# List-form system() bypasses the shell entirely.
			if (system('mkdir', '-p', $div_dir) != 0) {
				warn "cannot create directory $div_dir\n";
				next;
			}

			my @plm_url = ();
			my @plm_file_name = ();
			# Both arrays are filled in place by the callee, hence references.
			get_div_problem($SRM_idx, $SRM_url, $div, \@plm_url, \@plm_file_name);

			foreach my $j (0 .. $#plm_url) {
				get_plm_stat($div_dir, $plm_file_name[$j], $plm_url[$j]);
				# The statement file name is handed over as a trailing argument so
				# the solution files can be named after it; Perl silently ignores a
				# trailing argument that the callee does not unpack.
				get_plm_solve($div_dir, $plm_url[$j], $div, $plm_file_name[$j]);
			}
		}
	}
	return;
}

# Download one problem statement page into a division directory.
#   $div_dir       - directory the page is saved in
#   $plm_file_name - file name to save the page under
#   $plm_url       - URL of the problem statement page
# Returns whatever get_html_to_file returns.
sub get_plm_stat {
	my ($div_dir,$plm_file_name,$plm_url) = @_;
	return get_html_to_file($plm_url, join("/", $div_dir, $plm_file_name));
}

# Download the solutions linked from a problem's "ProblemDetail" page.
#   $div_dir       - directory the solution pages are saved in
#   $plm_url       - URL of the problem statement page
#   $div           - division number (1 or 2); selects which group of five
#                    solution links is used when the detail page lists ten or more
#   $plm_file_name - optional: file name of the saved statement. Each solution
#                    file name is derived from it by replacing "problem" with
#                    "<Language>_solve". When omitted, "problem_<pm id>.html"
#                    is built from the pm= parameter of $plm_url.
# Solution links are taken in groups of five, in the order
# Java, Cpp, Csharp, VB, Overall. A link whose cr= value is 0 is skipped.
sub get_plm_solve {
	my ($div_dir, $plm_url, $div, $plm_file_name) = @_;

	# The file name used to be read from an undeclared variable, which does
	# not compile under "use strict"; accept it as an optional argument.
	if (!defined $plm_file_name) {
		my ($pm) = $plm_url =~ /pm=(\d+)/;
		$plm_file_name = "problem_" . (defined $pm ? $pm : "unknown") . ".html";
	}

	# Locate the link to the detail page inside the statement page.
	# NOTE(review): the '">Single' anchor was reconstructed from a garbled
	# copy of this script - confirm against the live page markup.
	my $plm_str = get_html_to_string($plm_url);
	return unless defined $plm_str && $plm_str =~ /ProblemDetail(.+)">Single/;
	my $plm_detail_url = "http://community.topcoder.com/tc?module=ProblemDetail" . $1;

	my $plm_detail = get_html_to_string($plm_detail_url);
	return unless defined $plm_detail && length $plm_detail;

	# Strip the "amp;" left over from "&amp;" in the links, then break the page
	# at every ';' so that each (.+) below stays within one short line.
	my $solve_list = $plm_detail;
	$solve_list =~ s/amp;//g;
	$solve_list =~ tr/;/\n/;

	# In list context a /g match returns every captured link tail.
	my @temp_list = $solve_list =~ /problem_solution(.+)/g;

	# With ten or more links the page carries one group of five per division;
	# otherwise only the first group is used.
	my $base = (scalar(@temp_list) >= 10) ? $div : 1;
	my $first = ($base - 1) * 5;

	my @language = qw(Java Cpp Csharp VB Overall);
	foreach my $offset (0 .. $#language) {
		my $link_tail = $temp_list[$first + $offset];
		next unless defined $link_tail;    # fewer links than expected

		my $temp_url = "http://community.topcoder.com/" . "stat?c=problem_solution" . $link_tail;
		# Cut the URL at the closing quote of the HREF attribute.
		next unless $temp_url =~ /(.+)" class=/;
		my $solve_url = $1;

		# Only links with a positive cr= value are downloaded.
		next unless $solve_url =~ /cr=(\d+)/ && $1 > 0;

		my $label = $language[$offset] . "_solve";
		(my $solve_name = $plm_file_name) =~ s/problem/$label/g;

		get_html_to_file($solve_url, $div_dir . "/" . $solve_name);
	}
	return;
}

# Collect the problem statement URLs and target file names of one division
# of one SRM.
#   $SRM_idx       - SRM number (not used here; kept for the call signature)
#   $SRM_url       - URL of the SRM's statistics page
#   $div           - division number: 1 or 2
#   $plm_url       - array reference; problem statement URLs are pushed onto it
#   $plm_file_name - array reference; one file name per URL is pushed onto it,
#                    "Level_One_problem_<name>.html", "Level_Two_...",
#                    "Level_Three_...", then "Level_None_..." for any further
#                    problem
# Returns nothing; results are delivered through the two array references.
sub get_div_problem {
	# All five arguments must be unpacked: $SRM_url, $div and $plm_url are
	# used below and would otherwise be undeclared under "use strict".
	my ($SRM_idx, $SRM_url, $div, $plm_url, $plm_file_name) = @_;

	my $srm_div_list = get_html_to_string($SRM_url);
	return unless defined $srm_div_list;

	# Join the page into a single line so that (.+) below can span what
	# were separate lines of the table.
	$srm_div_list =~ s/\n/;/g;

	my $plm_list_str;
	if ($div == 1) {
		if ($srm_div_list =~ /Division I Problem Stats(.+)Division II Problem Stats/) {
			$plm_list_str = $1;
		}
	} elsif ($div == 2) {
		if ($srm_div_list =~ /Division II Problem Stats(.+)submitForm/) {
			$plm_list_str = $1;
		}
	}
	# Unknown division, or the expected section markers were not found.
	return unless defined $plm_list_str;

	# Split the section back into lines so each link is matched on its own line.
	$plm_list_str =~ s/;/\n/g;

	my @level_name = qw(One Two Three);
	my $cnt = 0;
	# m{...} avoids escaping the '/' of the closing tags.
	# NOTE(review): the tag markup was reconstructed from a garbled copy of
	# this script - confirm against the live page markup.
	while ($plm_list_str =~ m{HREF="(.+)" class="statText">(.+)</A></td>}g) {
		my ($href, $name) = ($1, $2);
		push @$plm_url, "http://community.topcoder.com" . $href;

		my $level = ($cnt < scalar(@level_name)) ? $level_name[$cnt] : "None";
		$cnt++;
		push @$plm_file_name, "Level_" . $level . "_problem_" . $name . ".html";
	}
	return;
}

# Walk the paginated match list and collect every SRM found on it.
#   $SRM_url_list - array reference; the absolute URL of each SRM is pushed
#   $SRM_idx      - array reference; the SRM number is pushed, in the same order
# Pages are requested 50 entries at a time (sr=1, 51, 101, ...) until the site
# answers with its error page or the start index reaches $MAX_SRM_CNT.
# Returns nothing; results are delivered through the two array references.
sub get_SRM_url_list {
	my ($SRM_url_list, $SRM_idx) = @_;
	my $pre_url = "http://community.topcoder.com/tc?module=MatchList&sc=&sd=&nr=50&sr=";
	my $MAX_SRM_CNT = 5000;
	my $PAGE_SIZE = 50;    # matches nr=50 in $pre_url

	for (my $index = 1; $index < $MAX_SRM_CNT; $index += $PAGE_SIZE) {
		my $SRM_list_page = get_html_to_string($pre_url . $index);
		# A failed download simply yields no matches for this page.
		$SRM_list_page = "" unless defined $SRM_list_page;

		# The site returns this message once the start index is past the
		# last match, so it marks the end of the list.
		if ($SRM_list_page =~ /An error has occurred when attempting to process your request/) {
			print "index=$index,ALL list has been got\n";
			print "$_\n" for @$SRM_url_list;
			last;
		}

		# m{...} avoids escaping the '/' of the closing tags.
		# NOTE(review): the tag markup was reconstructed from a garbled copy
		# of this script - confirm against the live page markup.
		while ($SRM_list_page =~ m{<td class="value" nowrap="nowrap"><a href="(.+)">SRM (\d+)</a></td>}g) {
			my ($suf_url, $srm_idx) = ($1, $2);
			push @$SRM_url_list, "http://community.topcoder.com" . $suf_url;
			push @$SRM_idx, $srm_idx;
		}
	}
	return;
}

# Fetch a URL and return the response body as one string.
#   $url - address to download
# The page is downloaded through get_html_to_file into the scratch file
# ../archive/file.tmp.txt, read back, and the scratch file is removed.
# Returns the page content, or an empty string when nothing could be read.
sub get_html_to_string {
	my ($url) = @_;
	my $temp_file = "../archive/file.tmp.txt";
	get_html_to_file($url, $temp_file);

	# Read the file in Perl instead of spawning `cat` and `rm`.
	open(my $fh, '<', $temp_file) or return "";
	my $str = do { local $/; <$fh> };    # undefined $/ reads the whole file at once
	close $fh;
	unlink $temp_file;

	return defined $str ? $str : "";
}

# Download a URL into a file by running curl.
#   $url         - address to download
#   $output_file - path handed to curl's -o option
# The first line of ../data/cookie.txt is split into words using shell-like
# quoting rules and inserted into the curl command line as extra arguments.
# If the file cannot be read, a warning is printed and curl runs without them.
# Returns the value of system(): 0 when curl exited successfully.
sub get_html_to_file {
	my ($url, $output_file) = @_;
	my $cookie_file = "../data/cookie.txt";

	my $cookie = "";
	if (open(my $fv, '<', $cookie_file)) {
		my $line = <$fv>;
		close $fv;
		if (defined $line) {
			chomp $line;
			$cookie = $line;
		}
	} else {
		warn "cannot read $cookie_file: $!\n";
	}

	# The cookie line is a fragment of a command line, so split it the way a
	# shell would, then call curl in list form: the URL and the output path
	# are passed as single arguments and never interpreted by a shell.
	require Text::ParseWords;
	my @cookie_args = Text::ParseWords::shellwords($cookie);

	return system('curl', $url, @cookie_args, '-o', $output_file);
}

# Script entry point: run the scraper.
main();

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读