结果演示

?

代码:
<pre class="has"><code class="language-python">#encoding:utf-8
# Script metadata. The creation timestamp is kept as a comment: written as an
# assignment it is not valid Python, and it would also reuse the name of the
# `time` module imported below.
author = 'donghao'
# time: 2018/12/24 15:20
import requests
import threading
import urllib.request
import urllib3
import os
import re
import time
from lxml import etree
from queue import Queue
# Parses listing pages and queues the images found on them.
class Producer(threading.Thread):
    """Worker thread that turns listing-page URLs into download tasks.

    Each thread takes page URLs from ``page_queue``, fetches and parses the
    page, and puts one ``(img_url, filename)`` tuple on ``img_queue`` for
    every usable image element it finds.
    """

    # Request headers sent with every page fetch.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'
    }

    # Characters removed from the pieces that make up a local filename.
    _unsafe_chars = re.compile(r'[,。??,/·]')

    def __init__(self, img_queue, page_queue, *args, **kwargs):
        """Store the two work queues; extra arguments go to ``Thread``.

        Args:
            img_queue: queue receiving ``(img_url, filename)`` tuples.
            page_queue: queue of page URLs still to be parsed.
        """
        super(Producer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.page_queue = page_queue

    def run(self):
        """Parse pages until ``page_queue`` has nothing left."""
        from queue import Empty

        while True:
            # A non-blocking get avoids the hang that "check empty(), then
            # blocking get()" causes when another producer takes the last
            # URL between the two calls.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch ``url`` and queue every image on it that has a source URL."""
        resp = requests.get(url=url, headers=self.headers)
        html = etree.HTML(resp.text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            entry = self._image_entry(img)
            if entry is not None:
                # Hand the image URL and its local filename to the consumers.
                self.img_queue.put(entry)

    @classmethod
    def _image_entry(cls, img):
        """Build the ``(img_url, filename)`` task for one image element.

        Args:
            img: any object with a mapping-style ``get(name)`` method that
                returns the attribute value or ``None``.

        Returns:
            ``(img_url, filename)``, or ``None`` when the element has no
            ``data-original`` attribute.
        """
        # Image URL.
        img_url = img.get('data-original')
        if not img_url:
            return None
        # Extension taken from the image URL, with special characters removed.
        stem, end = os.path.splitext(img_url)
        end = cls._unsafe_chars.sub('', end)
        # The description becomes the filename, so it needs the same clean-up:
        # a '/' or '?' left in it would make the download path unusable.
        alt = cls._unsafe_chars.sub('', img.get('alt') or '')
        if not alt:
            # No usable description: fall back to the last URL path segment.
            alt = cls._unsafe_chars.sub('', os.path.basename(stem))
        return img_url, alt + end
# Downloads the queued images.
class Consumer(threading.Thread):
    """Worker thread that downloads the images queued by the producers.

    Items on ``img_queue`` are ``(img_url, filename)`` tuples; each file is
    saved under the ``images/`` directory.
    """

    # Seconds to wait for a new image before checking whether to stop.
    idle_timeout = 3

    def __init__(self, img_queue, page_queue, *args, **kwargs):
        """Store the two work queues; extra arguments go to ``Thread``.

        Args:
            img_queue: queue of ``(img_url, filename)`` tuples to download.
            page_queue: queue of page URLs still to be parsed; only its
                emptiness is inspected here.
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.page_queue = page_queue

    def run(self):
        """Download queued images until no more work shows up.

        The thread stops once ``img_queue`` has stayed empty for
        ``idle_timeout`` seconds while ``page_queue`` is empty too. The
        bounded wait replaces a blocking ``get()``, which could hang forever
        after another consumer took the last item. It also gives producers
        that are still parsing a page time to queue its images.
        NOTE(review): a page fetch slower than ``idle_timeout`` can still
        make consumers stop early; an explicit "producers finished" signal
        would be needed to rule that out.
        """
        from queue import Empty

        while True:
            try:
                img_url, filename = self.img_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            try:
                self.download(img_url, filename)
            except (OSError, ValueError) as exc:
                # One bad URL or network error must not end the whole thread.
                print(filename + '下载失败: %s' % exc)
                continue
            print(filename+'张图片下载完成')

    def download(self, img_url, filename):
        """Save ``img_url`` as ``images/<filename>``, creating the directory."""
        os.makedirs('images', exist_ok=True)
        urllib.request.urlretrieve(img_url, 'images/' + filename)
def main(page_count=10):
    """Crawl the listing pages with 5 producer and 5 consumer threads.

    Args:
        page_count: number of listing pages to crawl, starting at page 1.

    Prints the elapsed wall-clock time once every thread has finished.
    """
    # Crawl 10 pages by default.
    start = time.time()
    image_queue = Queue(1000)
    # All page URLs are queued before any thread starts, so the queue must be
    # able to hold every one of them without blocking.
    page_queue = Queue(max(100, page_count))
    tsk = []
    # range() excludes its upper bound, hence the + 1 to include the last page.
    for x in range(1, page_count + 1):
        url = 'http://www.doutula.com/photo/list/?page=%d'%x
        # Queue the page URL for the producers.
        page_queue.put(url)
    for x in range(5):
        t = Producer(image_queue,page_queue)
        t.start()
        tsk.append(t)
    for x in range(5):
        t = Consumer(image_queue,page_queue)
        t.start()
        tsk.append(t)
    # Wait for every worker to finish, then report the elapsed time.
    for t in tsk:
        t.join()
    end = time.time()
    print('耗时:%0.002fs' % (end - start))
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
(编辑:李大同)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
|