python爬虫:初始爬虫一
发布时间:2020-12-20 10:44:31 所属栏目:Python 来源:网络整理
导读:初始爬虫 什么是爬虫:通过编写程序模拟浏览器上网,然后让其去互联网上爬取数据的过程。 爬虫的分类: - 通用爬虫:抓取互联网中的一整张页面数据 - 聚焦爬虫:抓取页面中的局部数据 - 增量式爬虫:用来监测
# Intro to crawlers: a crawler is a program that simulates a browser to fetch
# data from the internet.  This section demonstrates basic use of the
# `requests` module.
# NOTE(review): the original text used typographic quotes (‘…‘), which are a
# Python syntax error — replaced with ASCII quotes throughout.

# --- Fetch the Sogou homepage page source ---
import requests

# 1. Specify the URL.
url = 'https://www.sogou.com/'
# 2. Send a GET request; the return value is a response object.
response = requests.get(url=url)
# 3. Extract the response data; `.text` returns it as a string.
page_text = response.text
# 4. Persist it to disk.
with open('sogou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

# --- A minimal web collector: make the URL's query parameter dynamic ---
url = 'https://www.sogou.com/web'
wd = input('enter a key:')
params = {
    'query': wd
}
# The parameter dict is applied through the `params` argument of requests.get.
response = requests.get(url=url, params=params)
page_text = response.text
fileName = wd + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Running the above shows the saved page is garbled (encoding mismatch) —
# fixed in the next snippet.
# Fix the mojibake: override the response encoding before reading `.text`.
url = 'https://www.sogou.com/web'
wd = input('enter a key:')
params = {
    'query': wd
}
# Apply the parameter dict through the `params` argument of requests.get.
response = requests.get(url=url, params=params)
response.encoding = 'utf-8'  # force the response data's encoding to UTF-8
page_text = response.text
fileName = wd + '.html'
# BUG FIX: the original opened the file without mode 'w' (default 'r'),
# so fp.write() would fail; open for writing.
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Defeat User-Agent detection: send a browser-like UA header with the request.
url = 'https://www.sogou.com/web'
wd = input('enter a key:')
params = {
    'query': wd
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
# Apply the parameter dict through `params` and the UA through `headers`.
response = requests.get(url=url, params=params, headers=headers)
response.encoding = 'utf-8'  # force the response data's encoding to UTF-8
page_text = response.text
fileName = wd + '.html'
# BUG FIX: the original opened the file without mode 'w' (default 'r'),
# so fp.write() would fail; open for writing.
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Next topic: dynamically loaded page data.
# KFC restaurant lookup (dynamically loaded data via POST):
# http://www.kfc.com.cn/kfccda/storelist/index.aspx
import requests

# BUG FIX: `headers` was used below but never defined in this snippet
# (it relied on an earlier cell's state); define it here so the code is
# self-contained.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
address = input('请输入查询的关键字')  # fixed typo: `adderss` -> `address`
# Walk the first few result pages (pageIndex 1..4).
for i in range(1, 5):
    dic = {
        'cname': '',
        'pid': '',
        'keyword': address,
        'pageIndex': str(i),
        'pageSize': '10',
    }
    response = requests.post(url=url, headers=headers, data=dic)
    page_text = response.json()
    for a in page_text['Table1']:
        print('餐厅名称:', a['storeName'] + '餐厅', '---- 餐厅地址:', a['addressDetail'])

# Exercise: scrape company detail info from the NMPA site
# (国家药品监督管理局) http://125.35.6.84:81/xk/
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
dic = {
    'on': 'true',
    'page': '1',
    'pageSize': '15',
    'productName': '',
    'conditionType': '1',
    'applyname': '',
    'applysn': '',
}
# BUG FIX: the original defined `headers` but forgot to pass it to the POST.
response = requests.post(url=url, headers=headers, data=dic)
page_text = response.json()
for page in page_text['list']:
    # Each list entry's ID keys a second request for the company detail.
    _url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    _dic = {
        'id': page['ID']
    }
    _response = requests.get(url=_url, headers=headers, params=_dic)
    _page_text = _response.json()
    print(_page_text['epsName'], '-----', _page_text['legalPerson'])

# (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,
# 不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!