【Python requests多页面爬取案例】 -- 2019-08-07 10:34:05
发布时间:2020-12-20 12:45:21 所属栏目:Python 来源:网络整理
导读:原创: http://106.13.73.98/__/26/ import requests from fake_useragent import UserAgent # 随机ua库 class Boring(): def __init__(self,page_scope=(4,7)): """ :param page_scope: 页码范围 """ self.page_scope = page_scope self.all_id = self.get_all_company_id() ……
"""Multi-page scraping example built on ``requests``.

Original article: http://106.13.73.98/__/26/
"""
import requests
from fake_useragent import UserAgent  # random User-Agent library


class Boring():
    """Fetch company IDs page by page, then fetch and print each company's details.

    Instantiating the class runs the whole pipeline: it collects the IDs for
    the requested pages, looks up every ID, and prints the result.
    """

    # Seconds to wait for the server before giving up on a request; without a
    # timeout, requests blocks indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, page_scope=(4, 7)):
        """
        :param page_scope: (first_page, last_page) page-number range; both
            ends are included when fetching.
        """
        self.page_scope = page_scope
        self.all_id = self.get_all_company_id()
        self.enterprise_info = self.get_all_company_info()
        self.show_enterprise_info()

    @property
    def firefox_ua(self):
        """Return request headers carrying a random Firefox User-Agent."""
        ua = UserAgent(use_cache_server=False)
        return {'User-Agent': ua.Firefox}  # ua.Firefox: randomly generated Firefox UA string

    def get_all_company_id(self):
        """Return the company IDs listed on every page in ``self.page_scope``.

        :return: dict mapping each page number to the list of ``ID`` values
            taken from the ``list`` field of that page's JSON response.
        """
        all_id = {}
        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'  # this URL: see figure 1 of the original article
        for page in range(self.page_scope[0], self.page_scope[1] + 1):
            response = requests.post(
                url,
                data=self.post_data(page),
                headers=self.firefox_ua,
                timeout=self.REQUEST_TIMEOUT,
            )
            json_text = response.json()
            # Page numbers from range() are unique, so plain assignment is enough.
            all_id[page] = [entry['ID'] for entry in json_text['list']]
        return all_id

    def get_all_company_info(self):
        """Fetch the details of every company ID stored in ``self.all_id``.

        :return: dict mapping ``businessPerson`` to ``epsName`` for each
            response that was served as JSON.
        """
        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'  # see figure 3 of the original article
        expected_content_type = 'application/json;charset=UTF-8'
        enterprise_info = {}
        for company_ids in self.all_id.values():
            for company_id in company_ids:
                response = requests.post(
                    url,
                    data={'id': company_id},  # form payload: see figure 4 of the original article
                    headers=self.firefox_ua,
                    timeout=self.REQUEST_TIMEOUT,
                )
                # Skip anything that is not the expected JSON payload; .get()
                # also covers a response with no Content-Type header at all.
                if response.headers.get('Content-Type') != expected_content_type:
                    continue
                json_text = response.json()
                # Only the person in charge and the company name are kept;
                # setdefault keeps the first company seen for a given person.
                enterprise_info.setdefault(
                    json_text.get('businessPerson'),
                    json_text.get('epsName'),
                )
        return enterprise_info

    def show_enterprise_info(self):
        """Print each collected (businessPerson, epsName) pair on its own line."""
        for person, company in self.enterprise_info.items():
            print(person, company)

    def post_data(self, page):
        """Return the form submitted when requesting one page of the company list.

        :param page: page number to request.
        """
        # Form fields: see figure 2 of the original article.
        return {
            'on': 'true',
            'page': page,
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }


# go
Boring()

# Original article: http://106.13.73.98/__/26/
# (Editor: Li Datong)
# [Disclaimer] All content on this site comes from the internet; the views
# expressed are the authors' own and do not represent the position of this
# site. If your rights have been unintentionally infringed, please contact
# the site administrator promptly to have the content removed.