加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 编程开发 > Python > 正文

【Python requests多页面爬取案例】 -- 2019-08-07 10:34:05

发布时间:2020-12-20 12:45:21 所属栏目:Python 来源:网络整理
导读:原创: http://106.13.73.98/__/26/ import requests from fake_useragent import UserAgent # 随机ua库 class Boring(): def __init__(self,page_scope=(4,7)): """ :param page_scope: 页码范围 """ self.page_scope = page_scope self.all_id = self.get_all…

原创: http://106.13.73.98/__/26/

import requests
from fake_useragent import UserAgent  # 随机ua库


class Boring():
    """Multi-page scraper for the company portal at ``125.35.6.84:81``.

    Constructing an instance runs the whole job: it downloads the company IDs
    for the requested pages, downloads the detail record of every company,
    and prints the collected person-in-charge / enterprise-name pairs.
    """

    # Seconds to wait for the server on each request. Without a timeout,
    # ``requests`` can block forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    # ``UserAgent`` helper, created on first use and then reused so that it
    # is not rebuilt for every single request.
    _ua = None

    def __init__(self, page_scope=(4, 7)):
        """
        :param page_scope: (first page, last page) to scrape, both inclusive
        """
        self.page_scope = page_scope
        self.all_id = self.get_all_company_id()
        self.enterprise_info = self.get_all_company_info()
        self.show_enterprise_info()

    @property
    def firefox_ua(self):
        """Return request headers carrying a random Firefox User-Agent."""
        if self._ua is None:
            self._ua = UserAgent(use_cache_server=False)
        # ua.Firefox yields a randomly chosen Firefox UA string on each access.
        return {'User-Agent': self._ua.Firefox}

    def get_all_company_id(self):
        """
        Collect the company IDs listed on every page in ``self.page_scope``.

        :return: dict mapping each page number to the list of ``ID`` values
            taken from that page's ``list`` entries
        """
        # List endpoint (figure 1 of the original article).
        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
        first_page, last_page = self.page_scope[0], self.page_scope[1]
        all_id = {}
        for page in range(first_page, last_page + 1):
            response = requests.post(
                url,
                data=self.post_data(page),
                headers=self.firefox_ua,
                timeout=self.REQUEST_TIMEOUT,
            )
            all_id[page] = [entry['ID'] for entry in response.json()['list']]
        return all_id

    def get_all_company_info(self):
        """
        Fetch the detail record of every company ID in ``self.all_id``.

        Only the person in charge (``businessPerson``) and the enterprise
        name (``epsName``) are kept.

        :return: dict mapping ``businessPerson`` to ``epsName``; when the same
            person appears more than once, the first enterprise seen is kept
        """
        # Detail endpoint (figure 3 of the original article).
        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
        enterprise_info = {}
        for company_ids in self.all_id.values():
            for company_id in company_ids:
                # Form payload {'id': ...} (figure 4 of the original article).
                response = requests.post(
                    url,
                    data={'id': company_id},
                    headers=self.firefox_ua,
                    timeout=self.REQUEST_TIMEOUT,
                )
                # Skip anything that is not the expected JSON reply; .get()
                # avoids a KeyError when the header is missing altogether.
                if response.headers.get('Content-Type') != 'application/json;charset=UTF-8':
                    continue
                detail = response.json()
                enterprise_info.setdefault(
                    detail.get('businessPerson'), detail.get('epsName'))
        return enterprise_info

    def show_enterprise_info(self):
        """Print one ``person-in-charge enterprise-name`` pair per line."""
        for person, enterprise in self.enterprise_info.items():
            print(person, enterprise)

    def post_data(self, page):
        """
        Build the form submitted when requesting one page of the company list.

        :param page: page number to request
        :return: dict of form fields (figure 2 of the original article)
        """
        return {
            'on': 'true',
            'page': page,
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }


# Entry point: instantiating Boring runs the whole scrape, because __init__
# fetches the company IDs, fetches each company's details, and prints them.
Boring()

原创: http://106.13.73.98/__/26/

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与站长联系删除相关内容!

    推荐文章
      热点阅读