加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 编程开发 > Python > 正文

爬取和保存豆瓣小组图片

发布时间:2020-12-17 17:31:19 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。代码开头如下: #!/usr/bin/env python3 / #-*- coding=utf-8 -*- / import requests / import time / import random / import re / import configparser / import logging / import logging.handlers ……

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家,仅供参考

#!/usr/bin/env python3
#-*- coding=utf-8 -*-

import requests
import time
import random
import re
import configparser
import logging
import logging.handlers
import lxml.etree as etree
import threading
import queue
import os.path


# HTTP request headers sent when fetching douban HTML pages.
DOUBAN_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.douban.com/search?cat=1019&q=%E5%AE%B3%E7%BE%9E',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/38.0.2125.104 Safari/537.36',
    'Accept-Encoding': 'gzip,deflate',
    'Host': 'www.douban.com',
    'Connection': 'Keep-Alive',
}
# HTTP request headers sent when downloading image files.
# The previous single 'Accept' entry was a mangled fusion of an Accept value
# and the tail of a User-Agent string; both are restored here from the
# matching DOUBAN_HEADERS entries.
# NOTE(review): no 'Host' entry on purpose - images are presumably served
# from a different host than www.douban.com; confirm.
IMAGE_HEADERS = {
    'Accept': DOUBAN_HEADERS['Accept'],
    'User-Agent': DOUBAN_HEADERS['User-Agent'],
}
# File names used by the crawler and the rotation policy of its log file.
CNFG_FILE = 'douban_crawler.cfg'
LOG_FILE = 'douban_crawler.log'
MAX_LOG_SIZE = 1024 * 1024  # 1 MB per log file before rotation
LOG_BACKUP_COUNT = 3

# One record layout shared by both handlers.
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(lineno)d - %(message)s")

# Rotating file handler: keeps everything from DEBUG upwards.
fh = logging.handlers.RotatingFileHandler(
    LOG_FILE,
    maxBytes=MAX_LOG_SIZE,
    backupCount=LOG_BACKUP_COUNT,
    encoding='utf-8',
)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

# Stream handler: only INFO and above.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

# The 'crawler' logger passes every level on; the handlers do the filtering.
logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)
logger.addHandler(fh)
logger.addHandler(ch)

# Short aliases used throughout the module.
DEBUG, INFO = logger.debug, logger.info
WARNING, ERROR = logger.warning, logger.error


class Parser_Douban_Group(threading.Thread):
    """Producer thread that turns one group page into topic items.

    The page at ``url`` is fetched with ``open_douban_page`` and every row
    matched by ``//tr[@class='']`` is put on ``queue`` as a dict with the
    keys ``title``, ``url``, ``auth``, ``reply`` and ``time``.  An empty
    dict is always put last as the end-of-stream marker for consumers.

    Args:
        url: Address of the group page to parse.
        queue: Queue that receives the topic dicts.
        t_name: Thread name, used as prefix in log messages.
    """

    def __init__(self,url,queue,t_name = 'Parser Group'):
        threading.Thread.__init__(self,name=t_name)
        self.data = queue
        self.url = url
        self.s = requests.Session()

    def run(self):
        """Fetch and parse the group page, then queue the end marker."""
        INFO("{0} started!".format(self.name))
        co = 0
        try:
            htm = open_douban_page(self.url,self.s)
            # open_douban_page returns an empty string when the download
            # failed; there is nothing to hand to the parser in that case.
            text_dom = None
            if htm:
                parser = etree.HTMLParser(recover=True)
                text_dom = etree.fromstring(htm,parser)
            if text_dom is None:
                ERROR('Parse douban page error: {0}'.format(
                    'no parsable content from ' + self.url))
            else:
                group_name = ''.join(text_dom.xpath("//div[@id='group-info']/h1//text()")).strip()
                INFO('Group name: {0}'.format(group_name))
                for row in text_dom.xpath("//tr[@class='']"):
                    co = co + 1
                    item = {
                        'title': ''.join(row.xpath("child::td[@class='title']/a//text()")),
                        'url': ''.join(row.xpath("child::td[@class='title']/a/attribute::href")),
                        'auth': ''.join(row.xpath("child::td[@nowrap='nowrap']/a[@class='']//text()")),
                        'reply': ''.join(row.xpath("child::td[@class='']//text()")),
                        'time': ''.join(row.xpath("child::td[@class='time']//text()")),
                    }
                    # Hand the topic over to the consumer threads.
                    self.data.put(item,block=True)
                    DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name,co,item['title'][:20]))
        except Exception as e:
            ERROR('Parse douban page error: {0}'.format(e))
        finally:
            # The end marker must be queued even after a failure, otherwise
            # the consumers keep waiting for topics that never arrive.
            self.data.put({})
            INFO("{0} finished! put {1} topic to queue.".format(self.name,co))

class Parser_Douban_Topic(threading.Thread):
    """Worker thread that turns topic items into image items.

    Topic dicts (keys ``title`` and ``url`` are read) are taken from
    ``topic_queue``.  Each topic page is fetched with ``open_douban_page``
    and every ``<img src>`` found below the first
    ``div[@class='topic-content']`` is put on ``content_queue`` as a dict
    with the keys ``title`` and ``url``.  An empty dict on ``topic_queue``
    ends the worker; an empty dict is put on ``content_queue`` when the
    worker stops.

    Args:
        topic_queue: Queue the topic dicts are read from.
        content_queue: Queue that receives the image dicts.
        t_name: Thread name, used as prefix in log messages.
    """

    def __init__(self,topic_queue,content_queue,t_name = 'Parser Topic'):
        threading.Thread.__init__(self,name=t_name)
        self.topic_queue = topic_queue
        self.content_queue = content_queue
        self.s = requests.Session()

    def _extract_images(self,url):
        """Fetch one topic page and return ``(topic_name, image_urls)``."""
        htm = open_douban_page(url,self.s)
        if not htm:
            # Download failed (already logged by open_douban_page).
            return '',[]
        parser = etree.HTMLParser(recover=True)
        text_dom = etree.fromstring(htm,parser)
        if text_dom is None:
            return '',[]
        # Strip line breaks from the heading, not the letter "n".
        topic_name = ''.join(text_dom.xpath("//div[@id='content']/h1//text()")).replace('\n','').strip()
        DEBUG('Topic name: {0}'.format(topic_name))
        div_node = text_dom.xpath("//div[@class='topic-content']")
        if not div_node:
            # Page without a topic body: nothing to download.
            return topic_name,[]
        return topic_name,div_node[0].xpath("descendant::img/attribute::src")

    def run(self):
        """Consume topics until the end marker arrives or the queue times out."""
        INFO("{0} started!".format(self.name))
        co = 0
        coo = 0
        while True:
            try:
                # Wait at most 5 minutes for the next topic.
                val = self.topic_queue.get(True,300)
            except queue.Empty as e:
                ERROR("{0} timeout! {1}".format(self.name,e))
                break
            if not val:
                # Put the end marker back so sibling workers stop as well.
                self.topic_queue.put({})
                INFO("{0} finished! get {1} topic from queue.".format(self.name,co))
                break
            co = co + 1
            try:
                DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name,co,val['title'][:20]))
                topic_name,img_list = self._extract_images(val['url'])
            except Exception as e:
                # One broken topic must not stop the whole worker.
                ERROR('Parse douban page error: {0}'.format(e))
                continue
            for src in img_list:
                coo = coo + 1
                item = {'title': topic_name + str(coo),'url': src}
                self.content_queue.put(item)
                DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name,coo,item['title'][:20]))
        # End marker for the consumers of the image queue.
        self.content_queue.put({})
        INFO("{0} finished! put {1} image to queue.".format(self.name,coo))

class Save_Douban_Group(threading.Thread):
    """Consumer thread that downloads queued images into a folder.

    Image dicts (keys ``title`` and ``url`` are read) are taken from
    ``queue``.  Each image is fetched with ``open_douban_page`` and written
    to ``folder_name`` under the last path segment of its URL, unless a
    file of that name already exists.  An empty dict ends the thread and is
    put back so that sibling threads stop too.

    Args:
        queue: Queue the image dicts are read from.
        folder_name: Directory the images are written to.
        t_name: Thread name, used as prefix in log messages.
    """

    def __init__(self,queue,folder_name = 'image',t_name = 'Storage'):
        threading.Thread.__init__(self,name=t_name)
        # ``queue`` is the constructor argument here, not the module.
        self.data = queue
        self.folder = folder_name
        self.s = requests.Session()

    def run(self):
        """Download images until the end marker arrives."""
        INFO("{0} started!".format(self.name))
        co = 0
        coo = 0
        try:
            os.makedirs(self.folder,exist_ok=True)
        except OSError as e:
            # Keep running: the failure shows up again per image below.
            ERROR("{0} cannot create folder {1}: {2}".format(self.name,self.folder,e))
        while True:
            try:
                # Wait at most 5 minutes for the next image.
                val = self.data.get(True,300)
            except queue.Empty as e:
                # Keep waiting: only the end marker stops this thread.
                ERROR("{0} timeout! {1}".format(self.name,e))
                continue
            if not val:
                self.data.put({})    # leave the end marker for the others
                break
            co = co + 1
            try:
                DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name,co,val['title'][:20]))
                img_dt = open_douban_page(val['url'],self.s,ret_raw = True)
                img_nm = val['url'].split('/')[-1]
                if img_dt and img_nm:
                    fn = '{0}/{1}'.format(self.folder,img_nm)
                    if not os.path.exists(fn):
                        with open(fn,'wb') as fp:
                            fp.write(img_dt)
                        coo = coo + 1
            except Exception as e:
                # One failed image must not stop the thread.
                ERROR("{0} save image error: {1}".format(self.name,e))
        INFO("{0} finished! save image({1}/{2}).".format(self.name,coo,co))

def open_douban_page(group_url,s,retries=3,ret_raw = False):
    """Download ``group_url`` through the session ``s``.

    Args:
        group_url: Address to request.
        s: ``requests.Session`` used for the request.
        retries: Number of attempts made when the connection fails; at
            least one attempt is always made.
        ret_raw: When true the request uses ``IMAGE_HEADERS`` and the body
            is returned as bytes; otherwise ``DOUBAN_HEADERS`` plus the
            ``bid`` cookie are sent and the decoded text is returned.

    Returns:
        The response body, or an empty string when the download failed.
        Failures are logged, never raised.
    """
    cookies = dict(bid="RmFNKKPAd0s")
    attempts = max(retries,1)
    for remaining in range(attempts,0,-1):
        try:
            # A timeout keeps a stalled server from blocking the thread.
            if ret_raw:
                r = s.get(group_url,headers=IMAGE_HEADERS,stream=True,timeout=30)
            else:
                r = s.get(group_url,headers=DOUBAN_HEADERS,cookies=cookies,timeout=30)
            r.raise_for_status()
            # ``content`` applies the transfer decoding that ``raw.read()``
            # skips and releases the streamed connection once it is read.
            ret = r.content if ret_raw else r.text
            # Random pause between requests.
            time.sleep(random.uniform(0.3,1.5))
        except requests.ConnectionError as e:
            ERROR('Connect douban error({0}): {1}'.format(remaining,e))
            if remaining > 1:
                time.sleep(0.5)
        except Exception as e:
            ERROR('Open douban url({0}) error: {1}'.format(group_url,e))
            break
        else:
            DEBUG('Request url: {0}'.format(group_url))
            return ret
    return ''



def crawler_douban(group_url,folder_name,task_name):
    """Crawl one group page and save the images of its topics.

    Starts one group parser, one topic parser and two storage threads that
    are connected by two queues, then waits until all of them finished.

    Args:
        group_url: Address of the group page.
        folder_name: Directory the images are saved to.
        task_name: Name prefix of the group parser thread.
    """
    q_topic = queue.Queue()
    q_content = queue.Queue()

    parser_group_obj = []
    parser_topic_obj = []
    storage_pic_obj = []

    for i in range(1,2):
        parser_group_obj.append(Parser_Douban_Group(group_url,q_topic,'{0} {1}'.format(task_name,i)))

    for i in range(1,2):
        parser_topic_obj.append(Parser_Douban_Topic(q_topic,q_content,'Parser Topic {0}'.format(i)))

    for i in range(1,3):
        storage_pic_obj.append(Save_Douban_Group(q_content,folder_name,'Storage {0}'.format(i)))

    # Same start and join order as the pipeline: group, topic, storage.
    workers = parser_group_obj + parser_topic_obj + storage_pic_obj

    for obj in workers:
        obj.start()

    for obj in workers:
        obj.join()

if __name__ == '__main__':
    # Alternative group address; not crawled by this script.
    haixiu_hangzhou_url = 'http://www.douban.com/group/505137/'
    haixiu_url = 'http://www.douban.com/group/haixiuzu/'
    co =0
    try:
        # Crawl the group again and again, pausing 2 seconds before each run.
        while True:
            co = co + 1
            time.sleep(2.0)
            crawler_douban(haixiu_url,'image','Parser HaiXiu Group ({0})'.format(co))
    except KeyboardInterrupt:
        # Ctrl-C is the only way out of the loop; without this handler the
        # prompt below could never be reached.
        print('Interrupted after {0} run(s).'.format(co))

    input('Press any key to exit!')

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助,欢迎收藏、点赞、推荐、分享。

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读