request+正则表达式爬猫眼
发布时间:2020-12-14 06:44:18 所属栏目:百科 来源:网络整理
导读:import json import requests from requests.exceptions import RequestException import re import time def get_one_page (url) : try : response = requests.get(url) if response.status_code == 200 : return response.text return None except Request
import json
import requests
from requests.exceptions import RequestException
import re
import time
def get_one_page(url):
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
return None
except RequestException:
return None
def parse_one_page(html):
pattern = re.compile('<dd>.*?board-index.*?>(d+)</i>.*?src="(.*?)".*?name"><a'
+ '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
+ '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',re.S)
items = re.findall(pattern,html)
for item in items:
yield {
'index': item[0],'image': item[1],'title': item[2],'actor': item[3].strip()[3:],'time': item[4].strip()[5:],'score': item[5] + item[6]
}
def write_to_file(content):
with open('result.txt','a',encoding='utf-8') as f:
f.write(json.dumps(content,ensure_ascii=False) + 'n')
def main(offset):
url = 'http://maoyan.com/board/4?offset=' + str(offset)
html = get_one_page(url)
for item in parse_one_page(html):
print(item)
write_to_file(item)
if __name__ == '__main__':
for i in range(10):
main(offset=i * 10)
time.sleep(1)
转自:http://cuiqingcai.com/ (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |