python爬虫入门---第三篇：自动下载图片

发布时间：2020-12-16 23:58:27 ｜ 所属栏目：Python ｜ 来源：美桌 ｜ 源代码：import requests …

导读：本文给出适用的图片网站及完整源代码。源代码基于 requests 和 bs4，例如函数 get_html_text(url) 用于获取网址 url 的 HTML 代码，并以字符串形式返回。

适用的图片网站：美桌（http://www.win4000.com）

源代码：

def get_html_text(url):
    """Fetch a web page and return its HTML as a string.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, or non-2xx status).
    """
    # Imported locally: the listing's import header did not survive
    # extraction, so the function brings its own dependency into scope.
    import requests

    try:
        res = requests.get(url, timeout=6)
        res.raise_for_status()
        # Let requests guess the charset from the body; the pages are not
        # guaranteed to declare it in the HTTP headers.
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        # Report before returning -- in the original listing the message
        # was placed after the return and could never be printed.
        print('请求异常')
        return ''

def get_grupic_url(page_url, grupic_url_list, key_url, key_word):
    """Collect the album links found on one listing page.

    Every album link shares a URL prefix that other links on the page do
    not have; that prefix is passed in as ``key_url`` and used to tell
    album links apart from everything else.

    Args:
        page_url: URL of the listing page to scan.
        grupic_url_list: List that receives the album URLs (mutated in
            place).
        key_url: Regular expression that must match at the start of an
            album URL.
        key_word: Regular expression handed to ``select_atag`` to pick
            anchors by their caption text.

    Returns:
        None.
    """
    import re

    from bs4 import BeautifulSoup

    page_html = get_html_text(page_url)
    # Parse the page and gather every <a> tag that carries an href.
    soup = BeautifulSoup(page_html, 'html.parser')
    a_tags = soup.find_all('a', attrs={'href': True})

    # select_atag picks the anchors that look like album covers; the URL
    # prefix check is applied here so that key_url is always honoured.
    candidates = []
    select_atag(candidates, a_tags, key_word)
    grupic_url_list.extend(
        url for url in candidates if re.match(key_url, url)
    )

def get_allpages_url(cover_url, pages_url_list):
    """Follow the "next" links from a listing page and record each page URL.

    Starting at ``cover_url``, the function looks for an ``<a class="next">``
    tag, appends its target to ``pages_url_list`` and continues from there
    until a page has no such link.

    Args:
        cover_url: URL of the first listing page. It is not appended here.
        pages_url_list: List that receives the URLs of the following pages
            (mutated in place).

    Returns:
        None.
    """
    from urllib.parse import urljoin

    from bs4 import BeautifulSoup

    current_url = cover_url
    # Remember visited pages so a "next" link that loops back cannot keep
    # the walk going forever.
    seen = set(pages_url_list)
    seen.add(cover_url)

    while True:
        soup = BeautifulSoup(get_html_text(current_url), 'html.parser')
        next_tag = soup.find('a', class_='next')
        href = next_tag.get('href') if next_tag is not None else None
        if not href:
            # No "next" link: this was the last page.
            return None

        # Resolve relative links; absolute links pass through unchanged.
        next_url = urljoin(current_url, href)
        if next_url in seen:
            return None

        seen.add(next_url)
        pages_url_list.append(next_url)
        current_url = next_url

def download_each_page(grupic_url_list, file_path1, page):
    """Download every album listed on one page.

    Each album is handed to ``download_each_group``; a failure in one album
    is reported and does not stop the remaining albums.

    Args:
        grupic_url_list: URLs of the albums found on the page.
        file_path1: Path prefix for this page; the album number is appended
            to it as ``_<number>``.
        page: Page number, used only in the progress message.

    Returns:
        None.
    """
    print('\n\n第 {0} 页开始下载：\n'.format(page))

    # gup numbers the albums from 1 and advances whether or not the
    # download of an album succeeds.
    for gup, grupic_url in enumerate(grupic_url_list, start=1):
        file_path2 = '{0}_{1}'.format(file_path1, gup)
        # The album's <h1> title is what download_each_group compares
        # against to detect the end of the album.
        h1_string = get_h1_string(grupic_url)
        try:
            download_each_group(grupic_url, file_path2, h1_string, gup)
        except Exception:
            # Best effort: report and move on to the next album.
            print("下载异常")

def download_all_page(pages_url_list, file_path, key_word, key_url=''):
    """Download the albums of every listing page.

    Args:
        pages_url_list: URLs of all listing pages, in order.
        file_path: Directory in which the pictures are stored.
        key_word: Regular expression an album caption must contain.
        key_url: Regular expression an album URL must start with. The
            default (empty pattern) accepts every URL.

    Returns:
        None.
    """
    import os

    print('\n相册一共有 {0} 页，已经开始下载请您耐心等待...'.format(len(pages_url_list)))

    # Create the target directory up front so the first save cannot fail
    # merely because it is missing.
    if file_path:
        os.makedirs(file_path, exist_ok=True)

    for page, page_url in enumerate(pages_url_list, start=1):
        grupic_url_list = []
        get_grupic_url(page_url, grupic_url_list, key_url, key_word)
        # Files are named <file_path>/<page>_<album>_<picture>.jpg; the
        # page number is the first component of the file name.
        file_path1 = os.path.join(file_path, str(page))
        download_each_page(grupic_url_list, file_path1, page)


def download_each_group(grupic_url, file_path, h1_string, gup, n=1):
    """Download every large picture of one album.

    Starting at ``grupic_url``, the function saves the picture shown on the
    page and follows the link wrapped around it to the next page, until the
    page's ``<h1>`` title no longer equals ``h1_string`` (which means the
    link led into a different album).

    Args:
        grupic_url: URL of the album's first page.
        file_path: Path prefix for the saved files; ``_<n>.jpg`` is
            appended for each picture.
        h1_string: The album's ``<h1>`` title.
        gup: Album number, used only in the progress message.
        n: Number given to the first picture saved.

    Returns:
        None.

    Raises:
        IndexError: A page of the album has no ``img.pic-large`` tag.
    """
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

    from bs4 import BeautifulSoup

    page_url = grupic_url
    while page_url:
        soup = BeautifulSoup(get_html_text(page_url), 'html.parser')
        heading = soup.h1
        # A missing or different title ends the album.
        if heading is None or heading.string != h1_string:
            break

        img_tag = soup.find('img', class_='pic-large')
        if img_tag is None:
            raise IndexError(
                'no img.pic-large found on {0}'.format(page_url)
            )

        # The data-original attribute holds the URL of the target picture.
        new_file_path = '{0}_{1}.jpg'.format(file_path, n)
        urlretrieve(img_tag.get('data-original'), new_file_path)
        print('第 {0} 组：{1},第 {2} 张下载完成啦'.format(gup, h1_string, n))
        n += 1

        # The picture is wrapped in a link that leads to the next page.
        next_href = img_tag.parent.get('href')
        page_url = urljoin(page_url, next_href) if next_href else None

    return None


def get_h1_string(url):
    """Return the content of the ``<h1>`` tag of the page at ``url``.

    Args:
        url: Address of the page to inspect.

    Returns:
        The ``string`` of the first ``<h1>`` tag, or an empty string (after
        printing a message) when the page has no ``<h1>`` tag.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(get_html_text(url), 'html.parser')
    heading = soup.h1
    if heading is None:
        print('h1标题获取异常')
        return ''
    return heading.string


def select_atag(grupic_url_list, atags, key_word, key_url=''):
    """Append the hrefs of anchors that look like album covers.

    An anchor qualifies when it contains an ``<img>``, contains a ``<p>``
    whose text matches ``key_word``, and has an href that matches
    ``key_url`` at its start.

    Args:
        grupic_url_list: List that receives the selected hrefs (mutated in
            place).
        atags: Anchor tags to examine.
        key_word: Regular expression searched for in the caption text.
        key_url: Regular expression matched against the start of the href.
            The default (empty pattern) accepts every href.

    Returns:
        None.
    """
    import re

    for atag in atags:
        caption = atag.p
        url = atag.get('href')
        if atag.img is None or caption is None or not url:
            continue

        # .string is None when the <p> holds more than one child node;
        # fall back to the concatenated text in that case.
        text = caption.string
        if text is None:
            text = caption.get_text()

        if re.search(key_word, text) and re.match(key_url, url):
            grupic_url_list.append(url)


def main():
    """Entry point: download all albums reachable from the cover page."""
    # URL of the cover page, i.e. the first listing page.
    cover_url = 'http://www.win4000.com/mt/yangzi.html'
    # Prefix that distinguishes album links from all other links.
    key_url = r'http://www.win4000.com/meinv'
    key_word = '杨紫'
    # Directory in which the pictures are stored.
    file_path = r'G:\pictures\yangzi'

    # The list of listing pages starts with the cover page itself ...
    pages_url_list = [cover_url]
    # ... followed by every page reachable through the "next" links.
    get_allpages_url(cover_url, pages_url_list)

    # Download all pictures of all pages.
    download_all_page(pages_url_list, file_path, key_word, key_url)


if __name__ == '__main__':
    main()

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！