加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 百科 > 正文

分析Ajax来爬取今日头条街拍美图并保存到MongoDB

发布时间:2020-12-16 02:59:18 所属栏目:百科 来源:网络整理
导读:前提:需要安装MongoDB 注:因今日头条网页发生变更,如下代码不保证能正常使用 # !/usr/bin/env python # -*- coding: utf-8 -*- import json import os from urllib.parse import urlencode import pymongo import requests from bs4 import BeautifulSoup fr

前提:需要安装MongoDB

注:因今日头条网页发生变更,如下代码不保证能正常使用

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl Toutiao (今日头条) street-photo galleries via its Ajax search API
and store the parsed results in MongoDB.

NOTE(review): the Toutiao site has changed since this was written, so the
endpoint and response shapes may no longer match — verify before relying
on it. Reconstructed from an extraction that stripped all string quotes.
"""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Pagination: pages GROUP_START..GROUP_END, 20 results per page.
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'  # search keyword ("street snap" / street photography)

# connect=False defers the actual connection until first use, which keeps
# the client safe to share with multiprocessing workers (each process
# connects lazily after fork instead of inheriting a live socket).
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of Ajax search results.

    Returns the raw response text (expected to be JSON) on HTTP 200,
    or None on a non-200 status or connection failure.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download a single image and hand its bytes to save_image().

    Always returns None; connection errors are swallowed deliberately so
    one bad image does not abort the whole gallery.
    """
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Write image bytes to <cwd>/<md5>.jpg.

    Naming the file after the MD5 of its content deduplicates images:
    an already-existing file is simply skipped.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # `with` closes the file automatically; the original's explicit
        # f.close() inside the with-block was redundant and is dropped.
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield article URLs from one Ajax search-result JSON payload.

    Malformed JSON yields nothing (generator simply ends), matching the
    original silent-skip behaviour.
    """
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if data and 'data' in data:
        for item in data['data']:
            yield item.get('article_url')


def get_page_detail(url):
    """Fetch an article detail page; return its HTML text or None."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Downloads every image as a side effect. Returns a dict with keys
    'title', 'url' and 'images', or None when the page contains no
    recognizable gallery JSON.
    """
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # The gallery data is embedded in the page as: gallery: JSON.parse("...")
    # Use a raw string and escape the literal dot/parens so the regex matches
    # the text itself rather than treating them as metacharacters.
    images_pattern = re.compile(r'gallery: JSON\.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # Strip the backslash escaping introduced by the JS string literal
        # before decoding the embedded JSON. (The extraction had dropped the
        # '\\' argument of replace entirely.)
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images,
            }
    return None


def save_to_mongo(result):
    """Insert one result document into MongoDB; return True on success."""
    # Collection.insert() was deprecated and removed in PyMongo 4.x;
    # insert_one() is the supported single-document equivalent and its
    # InsertOneResult is truthy on success.
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process one search-result page: fetch index, parse, download, persist."""
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        print(html)
        result = parse_page_detail(html, url)
        print(result)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    # Fan the result pages out across a process pool; offsets are the
    # Ajax paging parameter (20 items per page).
    pool = Pool()
    offsets = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, offsets)
    pool.close()
    pool.join()

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读