import re
import urllib.request

import requests
from bs4 import BeautifulSoup


def get_html_text(url):
    '''Fetch the page at *url* and return its HTML source as a string.

    Returns '' (after printing a notice) when the request fails or the
    server answers with an error status, so callers can treat the
    result uniformly as "no page".
    '''
    try:
        res = requests.get(url, timeout=6)
        res.raise_for_status()
        # Guess the real encoding from the body; the header charset is
        # often wrong on this site.
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        # The original returned before printing, so the message was
        # unreachable; print first, then return the '' sentinel.
        # Catch only request-related errors, not every exception.
        print('请求异常')
        return ''
def get_grupic_url(page_url, grupic_url_list, key_url, key_word):
    '''Collect the URL of every photo album linked from *page_url*.

    Album links on a listing page share a distinctive URL prefix and a
    caption mentioning the model's name; matching hrefs are appended to
    *grupic_url_list* by select_atag().

    NOTE(review): *key_url* (the album URL prefix) is accepted here for
    interface compatibility but is not forwarded — select_atag() applies
    its own prefix filter.  Confirm the two stay in sync.
    '''
    page_html = get_html_text(page_url)
    # Parse the page's HTML and pull out every <a> tag carrying an href.
    soup = BeautifulSoup(page_html, 'html.parser')
    a_tags = soup.find_all('a', attrs={'href': True})
    # Filter the anchors down to actual album links.
    select_atag(grupic_url_list, a_tags, key_word)
def get_allpages_url(cover_url, pages_url_list):
    '''Follow the "next page" links starting at *cover_url*, appending
    every discovered page URL to *pages_url_list* in order.

    Stops when a page has no <a class="next"> tag (the last page).
    The original implementation recursed once per page, which risks
    RecursionError on long albums; this iterative walk is equivalent
    and additionally guards against pager loops.
    '''
    url = cover_url
    while True:
        html = get_html_text(url)
        soup = BeautifulSoup(html, 'html.parser')
        # The pager marks the forward link with class="next".
        a_tags = soup.find_all('a', class_='next')
        if not a_tags:
            # No "next" link: this was the last page.
            return None
        url = a_tags[0].get('href')
        if url is None or url in pages_url_list:
            # Missing href or a cycle in the pager — stop safely.
            return None
        pages_url_list.append(url)
def download_each_page(grupic_url_list, file_path1, page):
    '''Download every album listed on one result page.

    *grupic_url_list* holds the album URLs found on page number *page*;
    *file_path1* is the per-page filename prefix.  Each album is handed
    to download_each_group(); a failure in one album is reported and
    skipped so the rest of the page still downloads.
    '''
    # NOTE(review): the extracted source showed 'nn…n'; the intended
    # escapes were presumably real newlines — restored here.
    print('\n\n第 {0} 页开始下载:\n'.format(str(page)))
    gup = 1  # running album (group) counter within this page
    for grupic_url in grupic_url_list:
        # Per-album file prefix: <page prefix>_<group number>
        file_path2 = file_path1 + '_{0}'.format(str(gup))
        # The album's <h1> title identifies its pages while walking it.
        h1_string = get_h1_string(grupic_url)
        try:
            download_each_group(grupic_url, file_path2, h1_string, gup)
        except Exception:
            # Keep going: one broken album must not stop the page.
            # (Narrowed from a bare except, which also caught
            # KeyboardInterrupt/SystemExit.)
            print("下载异常")
        gup += 1
def download_all_page(pages_url_list, file_path, key_word,
                      key_url=r'http://www.win4000.com/meinv'):
    '''Download the pictures from every page in *pages_url_list*.

    *file_path* is the filename prefix for saved images and *key_word*
    the caption keyword used to recognise album links.  The new
    *key_url* keyword parameter (backward-compatible default: the
    site's album URL prefix) supplies the fourth argument that
    get_grupic_url() requires — the original code called both
    get_grupic_url() and download_each_page() with too few arguments,
    which raised TypeError at runtime.
    '''
    pages_num = len(pages_url_list)
    print('\n相册一共有 {0} 页,已经开始下载请您耐心等待...'.format(str(pages_num)))
    # enumerate() replaces the hand-rolled page counter.
    for page, page_url in enumerate(pages_url_list, start=1):
        grupic_url_list = []
        # Fill grupic_url_list with the album links found on this page.
        get_grupic_url(page_url, grupic_url_list, key_url, key_word)
        file_path1 = file_path + r'{0}'.format(str(page))
        download_each_page(grupic_url_list, file_path1, page)
def download_each_group(grupic_url, file_path, h1_string, gup, n=1):
    '''Download one album, following its internal "next image" links.

    Starting at *grupic_url*, saves the large picture on each album
    page to "<file_path>_<n>.jpg" and recurses into the next page,
    stopping as soon as the page's <h1> title differs from *h1_string*
    (which means the walk has crossed into the next album).

    Fixes vs. the original: the signature ``(grupic_url, gup, n=1)``
    did not match its caller (which passes url, path, title, group) and
    read *file_path*/*h1_string* as undefined names — both are now
    explicit parameters; the progress message had three placeholders
    but only two arguments; the recursive call dropped three arguments;
    and a page without an <h1> no longer raises AttributeError.
    '''
    new_file_path = file_path + '_{0}.jpg'.format(str(n))
    n += 1
    html = get_html_text(grupic_url)
    soup = BeautifulSoup(html, 'html.parser')
    # Same <h1> as the album cover => still inside this album.
    if soup.h1 is not None and h1_string == soup.h1.string:
        # The target picture is the <img class="pic-large"> tag.
        img_tags = soup.find_all('img', class_='pic-large')
        img_tag = img_tags[0]
        # Its data-original attribute holds the full-size image URL.
        urllib.request.urlretrieve(img_tag.get('data-original'), new_file_path)
        # The enclosing <a> of the image points at the next album page.
        next_url = img_tag.parent.get('href')
        print('第 {0} 组:{1},第 {2} 张下载完成啦'.format(
            str(gup), h1_string, str(n - 1)))
        # Recurse into the next page of this album.
        download_each_group(next_url, file_path, h1_string, gup, n)
    # Different (or missing) <h1>: another album begins — end recursion.
    return None
def get_h1_string(url):
    '''Return the text of the first <h1> tag on the page at *url*,
    or '' when the page cannot be fetched or has no <h1>.
    '''
    try:
        html = get_html_text(url)
        soup = BeautifulSoup(html, 'html.parser')
        return soup.h1.string
    except AttributeError:
        # soup.h1 is None when the page has no <h1> (e.g. the fetch
        # returned '').  Narrowed from a bare except: get_html_text()
        # already swallows request errors, so nothing else can raise.
        print('h1标题获取异常')
        return ''
def select_atag(grupic_url_list, atags, key_word,
                key_url=r'http://www.win4000.com/meinv'):
    '''Append to *grupic_url_list* the href of every tag in *atags*
    that looks like an album cover link.

    A qualifying <a> tag contains an <img> (the cover thumbnail) and a
    <p> whose text mentions *key_word*, and its href starts with the
    album prefix *key_url* (matched as a regex via re.match).

    Fixes vs. the original: *key_url* was read as an undefined global
    (NameError at runtime) and is now a parameter with the site's album
    prefix as a backward-compatible default; a <p> with no string no
    longer crashes re.search(); and re-parsing str(atag) through a
    second BeautifulSoup pass is replaced by direct .find() calls.
    '''
    for atag in atags:
        url = atag.get('href')
        p = atag.find('p')
        if (atag.find('img') is not None
                and p is not None and p.string
                and re.search(key_word, p.string)
                and url and re.match(key_url, url)):
            grupic_url_list.append(url)
def main():
    '''Entry point: crawl the 杨紫 gallery on win4000.com and download
    every picture of every album on every listing page.
    '''
    # First listing page (the "cover").
    cover_url = 'http://www.win4000.com/mt/yangzi.html'
    # URL prefix that distinguishes album links from other links.
    # Currently unused here: download_all_page() carries the same
    # prefix as its default key_url parameter — keep them in sync.
    key_url = r'http://www.win4000.com/meinv'
    key_word = '杨紫'
    # Directory prefix where pictures are saved.
    # NOTE(review): the extracted source lost its backslashes
    # ('G:picturesyangzi'); the intended Windows path was presumably
    # G:\pictures\yangzi\ — confirm before running.
    file_path = 'G:\\pictures\\yangzi\\'
    # The cover itself is page 1; discover the remaining pages.
    pages_url_list = [cover_url]
    get_allpages_url(cover_url, pages_url_list)
    # Download everything.  (The original dropped the file_path
    # argument here, which raised TypeError.)
    download_all_page(pages_url_list, file_path, key_word)


if __name__ == '__main__':
    # Guarded so importing this module does not start a crawl.
    main()