爬取豆瓣照片

发布时间：2020-12-17 17:28:28 所属栏目：Python 来源：网络整理

导读：今天PHP站长网 52php.cn把收集自互联网的代码分享给大家，仅供参考。本文的 Python 脚本使用 urllib.request 和正则表达式，逐页抓取豆瓣线上活动相册中的照片并保存到本地。

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家，仅供参考

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2014-12-22 14:46:40
# @Author  : Your Name ([email?protected])
# @Link    : http://example.org
# @Version : $Id$

import os
import urllib.request
import re
import time

# Browser-like request headers used by getHtml1().
# NOTE: "Accept-Encoding" advertises compressed responses, so whoever reads
# the body must be prepared to decompress it.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Cache-Control": "max-age=0",
    "Accept-Language": "zh-cn,zh;q=0.8;",
    "Connection": "keep-alive",
    "Host": "www.douban.com",
    "Referer": "http://www.douban.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko)"
        " Chrome/39.0.2171.95 Safari/537.36"
    ),
}


def getHtml1(url):
    """Fetch *url* using the module-level ``header`` dict and return the page text.

    ``header`` advertises ``Accept-Encoding: gzip,deflate,sdch`` and urllib
    does not decompress responses on its own, so the body is decompressed
    here according to the ``Content-Encoding`` response header before it is
    decoded.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
    """
    import gzip
    import zlib

    req = urllib.request.Request(url, headers=header)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        body = response.read()
        encoding = (response.headers.get("Content-Encoding") or "").lower()

    if encoding == "gzip":
        body = gzip.decompress(body)
    elif encoding == "deflate":
        try:
            body = zlib.decompress(body)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            body = zlib.decompress(body, -zlib.MAX_WBITS)
    return body.decode('utf-8')


def getHtml(url):
    """Fetch *url* with a few browser-like headers and return the page text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # build_opener() replaces urllib.request.URLopener, which has been
    # deprecated since Python 3.3.
    opener = urllib.request.build_opener()
    opener.addheaders = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko)"
                       " Chrome/39.0.2171.95 Safari/537.36"),
        # NOTE(review): the original literal was truncated after "zh-cn,"
        # (unterminated string); the q-value below is a restoration - confirm.
        ("Accept-Language", "zh-cn,zh;q=0.8"),
        ("Accept", "text/html,*/*;q=0.8"),
    ]
    # The context manager closes the response even if read()/decode() raises.
    with opener.open(url) as response:
        return response.read().decode('utf-8')


def getPicURL(html):
    """Return the photo-page URLs of online event 11865076 found in *html*.

    Args:
        html: Text of an album page.

    Returns:
        A list of every matching URL in document order (duplicates are kept);
        an empty list when nothing matches.
    """
    # \d+ is the numeric photo id; the dots and the "?" that starts the query
    # string are escaped so they match literally instead of acting as regex
    # operators.  Both http and https links are accepted.
    pattern = r"https?://www\.douban\.com/online/11865076/photo/\d+/\?sortby=time"
    return re.findall(pattern, html)

def openPic(picURL):
    """Resolve a photo page URL to the URL of the full-size image on that page.

    Args:
        picURL: Address of a single photo page.

    Returns:
        The ``src`` URL of the first ``<img>`` tag that points at
        ``/view/photo/photo/public/p<10 digits>.jpg`` on an ``img<digit>``
        douban host.

    Raises:
        LookupError: If the page contains no such image tag.
        Exception: Whatever ``getHtml`` raised while downloading the page.
            A warning line is printed before any error is re-raised.
    """
    # \d is one digit (image host number), p\d{10} the photo id; dots are
    # escaped so they only match a literal ".".  The capture group is the URL.
    pattern = (r'<img src="(https?://img\d\.douban\.com'
               r'/view/photo/photo/public/p\d{10}\.jpg)"')
    try:
        html = getHtml(picURL)
        found = re.search(pattern, html)
        if found is None:
            raise LookupError("no full-size image found on %s" % picURL)
    except Exception:
        print("!!!!!!!!!!!!!!!!!!!!!WARING:AN ERROR HAPPENED while openPic!!!!!!!!!!!!!!!!!!!!!")
        # Re-raise the real cause instead of failing later on an unbound local.
        raise
    return found.group(1)

def picDownload(picURLs, page_num, save_dir=r"C:\Users\Lyle\Desktop\douPIC"):
    """Download the full-size image behind every photo page in *picURLs*.

    Each image is saved under the last 15 characters of its URL
    (``p<10 digits>.jpg``).  If a file of that name is already present in
    *save_dir* the new copy is saved with the prefix "副本" instead.  A failure
    on one picture is reported and the remaining pictures are still processed.

    Args:
        picURLs: Photo page URLs, as returned by ``getPicURL``.
        page_num: Zero-based album page index; only used in progress output.
        save_dir: Directory that receives the images; created when missing.

    Returns:
        The ``(filename, headers)`` result of the last successful
        ``urllib.request.urlretrieve`` call, or ``''`` if nothing was saved.
    """
    download_img = ''
    try:
        os.makedirs(save_dir, exist_ok=True)
        # A set gives O(1) "already downloaded?" checks.
        existing = set(os.listdir(save_dir))
    except OSError:
        print("!!!!!!!!!!!!!!!!!!!!!WARING:AN ERROR happened wile picDownload!!!!!!!!!!!!!!!!!!!!!")
        return download_img

    # enumerate() gives the correct position even when a URL occurs twice.
    for position, picURL in enumerate(picURLs, start=1):
        try:
            picURL_new = openPic(picURL)
            base_name = picURL_new[-15:]
            if base_name in existing:
                file_name = "副本" + base_name
            else:
                file_name = base_name
            # os.path.join inserts the separator between directory and file.
            download_img = urllib.request.urlretrieve(
                picURL_new, os.path.join(save_dir, file_name))
        except Exception:
            # Best effort: report the failure and move on to the next picture.
            print("!!!!!!!!!!!!!!!!!!!!!WARING:AN ERROR happened wile picDownload!!!!!!!!!!!!!!!!!!!!!")
        else:
            existing.add(file_name)
            print("第%d页 第%d张 ......%s......... downloaded" % (page_num+1, position, base_name))
        # Pause between requests so the server is not hammered.
        time.sleep(1)
    return download_img

if __name__ == '__main__':
    # Walk the album page by page; the "start" query parameter advances by 90
    # for every page.
    num = 0
    page_num = 0
    while True:
        html = getHtml(r'http://www.douban.com/online/11865076/album/137771083/?start=%d&sortby=time' % (num+page_num*90))
        picURLs = getPicURL(html)
        if not picURLs:
            # A page without photo links means the album is exhausted; without
            # this check the loop would keep requesting pages forever.
            break
        print("**************第%d页下载开始***************" % (page_num+1))
        picDownload(picURLs,page_num)
        print("**************第%d页下载完成***************" % (page_num+1))
        page_num += 1

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助，欢迎收藏、点赞、推荐、分享。

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意中侵犯了您的权利，请及时联系站长删除相关内容！