新年新气象!爬取一个代理服务器列表
发布时间:2020-12-17 17:21:43 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 #!/usr/bin/python3 import os import os.path import urllib.request import chardet import time import re from multiprocessing.dummy import Pool as Th…
#!/usr/bin/python3
# Source note (translated from the hosting page): the following code was
# collected from the internet by PHP站长网 (52php.cn) and is shared for
# reference only.
"""Scrape a list of proxy servers.

Downloads the proxy list pages of www.proxy.com.ru concurrently, decodes each
page to text, extracts every ``ip:port`` pair from the HTML table cells and
writes the pairs, one per line, to ``OUTPUT_PATH``.
"""

import os
import os.path
import re
import time
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool

try:
    import chardet
except ImportError:
    # chardet is third-party.  Keep the module importable without it (so the
    # parsing helper can be reused and tested) and fall back to a simple
    # "UTF-8, else GBK" strategy when decoding.
    chardet = None

# Pages list_1.html .. list_41.html are fetched.
LIST_URL_TEMPLATE = "http://www.proxy.com.ru/list_%d.html"
PAGE_NUMBERS = range(1, 42)

# Number of threads used to download pages concurrently.
FETCH_WORKERS = 4

# Destination file for the extracted "ip:port" entries.
OUTPUT_PATH = r'/home/mobilefzb/socket_list.txt'

# Two adjacent table cells: a dotted-quad IP address followed by a port.
# The backslashes are essential: without them "d+" would match literal 'd'
# characters and "." would match any character.
_PROXY_ROW_RE = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>')


def url_content_read(url):
    """Fetch *url* and return the raw response body as bytes.

    Any exception raised by ``urllib.request.urlopen`` propagates to the
    caller.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read()


def url_socket_list_pharse(socket_raw_info_data):
    """Extract proxy endpoints from the HTML text of one list page.

    Args:
        socket_raw_info_data: Decoded HTML (``str``) of a proxy list page.

    Returns:
        A list of ``"ip:port"`` strings, in the order they appear in the
        page.  The list is empty when no ``<td>ip</td><td>port</td>`` pair
        is found.
    """
    return [
        "%s:%s" % (host, port)
        for host, port in _PROXY_ROW_RE.findall(socket_raw_info_data)
    ]


def _decode_page(raw):
    """Decode the raw bytes of one downloaded page to ``str``.

    With chardet available, the detected encoding selects the codec: UTF-8
    and GB2312 are decoded as such, anything else (including "unknown") is
    decoded as GBK.  Undecodable bytes are dropped in all cases.
    """
    if chardet is None:
        # Fallback without chardet: strict UTF-8 first, GBK otherwise.
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            return raw.decode('gbk', 'ignore')

    # detect() may report None as the encoding; the name is compared
    # case-insensitively so casing differences in the reported name do not
    # change which branch is taken.
    detected = (chardet.detect(raw)['encoding'] or '').lower()
    if detected == 'utf-8':
        return raw.decode('utf-8', 'ignore')
    if detected == 'gb2312':
        return raw.decode('GB2312', 'ignore')
    return raw.decode('gbk', 'ignore')


def main():
    """Download all list pages, extract the proxies and write them out."""
    urls = [LIST_URL_TEMPLATE % page for page in PAGE_NUMBERS]

    # Downloading is I/O-bound, so a small thread pool overlaps the waits.
    start_time = time.time()
    pool = ThreadPool(FETCH_WORKERS)
    try:
        raw_pages = pool.map(url_content_read, urls)
    finally:
        # Release the worker threads even when a download raised.
        pool.close()
        pool.join()
    print("time spent: %f" % (time.time() - start_time))

    # Decode the fetched pages from bytes to text.
    pages = [_decode_page(raw) for raw in raw_pages]

    # Parse the decoded pages.  Regex matching is CPU-bound, so a plain loop
    # is used here instead of a second thread pool; the result is the same.
    start_time = time.time()
    results = [url_socket_list_pharse(page) for page in pages]
    print("time spent: %f" % (time.time() - start_time))

    # Finally write the results, one "ip:port" entry per line.
    with open(OUTPUT_PATH, 'w') as fp:
        fp.writelines(
            "%s\n" % entry for page_entries in results for entry in page_entries
        )


if __name__ == "__main__":
    main()

# Site footer (translated from the hosting page): the content above was
# collected and organized by PHP站长网 [52php.cn] for reference and study.  If
# it was helpful to you, you are welcome to bookmark, like, recommend and
# share it.  (Editor: 李大同)
# [Disclaimer] All content on the site comes from the internet; the views
# expressed represent only the author's personal opinion and not the position
# of the site.  If your rights have been unintentionally infringed, please
# contact the webmaster promptly to have the content removed.