加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 百科 > 正文


发布时间:2020-12-16 02:59:18 所属栏目:百科 来源:网络整理
导读:前提:需要安装MongoDB 注:因今日头条网页发生变更,如下代码不保证能正常使用 # !/usr/bin/env python # -*- coding: utf-8 -*- import json import os from urllib.parse import urlencode import pymongo import requests from bs4 import BeautifulSoup fr



#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of result-page groups; each group is multiplied by 20
# to obtain the search offset passed to get_page_index().
GROUP_START = 1
GROUP_END = 20

# Search keyword sent to the index endpoint.
KEYWORD = '街拍'

# connect=False defers the actual connection until first use, so the client
# object can be created before the worker processes are forked.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of search results as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers 200, otherwise
        ``None`` (also ``None`` when a connection error occurs).
    """
    data = {
        # Kept as the string 'true': urlencode() would render the Python
        # boolean as 'True'.
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download *url* and store the body via save_image() on a 200 response.

    Always returns ``None``; connection errors are swallowed so one bad
    image does not abort the rest of the crawl.
    """
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Write *content* (bytes) to ``<cwd>/<md5 of content>.jpg``.

    The file name is derived from the content hash, so identical images map
    to the same path and are written only once.
    """
    file_path = '{0}/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the ``article_url`` of each item in an index response.

    Args:
        text: JSON text as returned by get_page_index(); may be ``None``
            or empty when the request failed.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data``
        key. Nothing is yielded for missing, empty or malformed input.
    """
    # get_page_index() returns None on failure; json.loads(None) would raise
    # TypeError, which the JSONDecodeError handler below does not cover.
    if not text:
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    # 'data' may be present but null, hence the `or []`.
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        # Items without a URL would make the caller request None.
        if article_url:
            yield article_url

def get_page_detail(url):
    """Fetch the article page at *url*.

    Returns:
        The response body as text when the server answers 200, otherwise
        ``None`` (also ``None`` when a connection error occurs).
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery images from an article page.

    Every image URL found is also downloaded via download_image().

    Args:
        html: Page source as returned by get_page_detail(); may be ``None``.
        url: The article URL, copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        embeds a gallery containing ``sub_images``; otherwise ``None``.
    """
    # get_page_detail() returns None on failure; nothing to parse then.
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # NOTE(review): the escapes in this pattern were reconstructed; the
    # published listing had its backslashes stripped. The greedy (.*) with
    # re.S is kept as published - confirm against a real page.
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    try:
        # The payload is a JSON string embedded in a JS string literal;
        # presumably the backslash escapes are removed before decoding
        # (first replace() argument reconstructed - TODO confirm).
        data = json.loads(result.group(1).replace('\\', ''))
    except JSONDecodeError:
        return None
    if isinstance(data, dict) and 'sub_images' in data:
        sub_images = data.get('sub_images')
        images = [item.get('url') for item in sub_images]
        for image in images:
            download_image(image)
        return {
            'title': title,
            'url': url,
            'images': images,
        }
    return None


def save_to_mongo(result):
    """Insert *result* into the MONGO_TABLE collection.

    Returns:
        ``True`` when the write was acknowledged, ``False`` otherwise.
    """
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # single-document replacement.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.acknowledged:
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Crawl one index page at *offset* and store every gallery found."""
    text = get_page_index(offset, KEYWORD)
    if not text:
        # Index request failed; nothing to parse for this offset.
        return
    urls = parse_page_index(text)
    for url in urls:
        if not url:
            # Some index items carry no article URL.
            continue
        html = get_page_detail(url)
        print(html)
        result = parse_page_detail(html, url)
        print(result)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    # One task per search offset: 20 results per page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()


