python-爬虫-史书典籍
发布时间:2020-12-20 12:45:07 所属栏目:Python 来源:网络整理
导读:import requests import os from lxml import html import time def get_title_url(tree): '''一级 获取标题''' # 史书典籍 # 格式:/book/sanguoyanyi.html History_book_url_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href") ……
import os
import time

# Root output folder. Both checkpoint files live directly inside it.
OUTPUT_DIR = '史书典籍'
# Index of the book (level 2) that was being crawled most recently.
BOOK_CHECKPOINT = os.path.join(OUTPUT_DIR, 'police_2.txt')
# Index of the chapter (level 3) that was being crawled most recently.
CHAPTER_CHECKPOINT = os.path.join(OUTPUT_DIR, 'police_3.txt')


def get_title_url(tree):
    """Level 1: extract the book links of the classics section.

    Args:
        tree: Parsed home page; any object exposing ``xpath(query)``.

    Returns:
        A ``(urls, names)`` tuple of parallel lists, e.g.
        ``/book/sanguoyanyi.html`` and ``三国演义``.
    """
    History_book_url_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href")
    History_book_name_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/text()")
    return History_book_url_list, History_book_name_list


def get_article_url(tree):
    """Level 2: extract the chapter links from a book's table of contents.

    Args:
        tree: Parsed book page; any object exposing ``xpath(query)``.

    Returns:
        A ``(urls, names)`` tuple of parallel lists, e.g.
        ``/book/sanguoyanyi/1.html`` and the chapter title.
    """
    book_url_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/@href")
    book_name_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/text()")
    return book_url_list, book_name_list


def get_article(tree):
    """Level 3: return the chapter text as one string.

    Args:
        tree: Parsed chapter page; any object exposing ``xpath(query)``.

    Returns:
        The text nodes of every paragraph in the chapter body, concatenated
        without a separator (empty string when nothing matches).
    """
    article_list = tree.xpath("//div[@class='chapter_content']/p/text()")
    return ''.join(article_list)


def get_request(url, headers, timeout=10):
    """Download *url* and return it as a parsed lxml HTML tree.

    Args:
        url: Absolute URL of the page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (so error pages are never saved as chapters).
    """
    # Imported here because this is the only function that needs the
    # third-party packages; the parsing and checkpoint helpers stay usable
    # (and testable) with the standard library alone.
    import requests
    from lxml import html

    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return html.fromstring(response.text)


def save_mkdir(two):
    """Create the output folder for one book (and the root folder if needed).

    Args:
        two: Name of the second-level folder, i.e. the book title.
    """
    os.makedirs(os.path.join(OUTPUT_DIR, two), exist_ok=True)


def _read_checkpoint(path):
    """Return the index stored in *path*, or None if it is missing or invalid."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except FileNotFoundError:
        return None
    try:
        return int(content)
    except ValueError:
        # Empty or corrupted file: behave as if no checkpoint existed.
        return None


def _write_checkpoint(path, index):
    """Store *index* in *path*, creating the parent folder when necessary."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(index))


def _clear_checkpoint(path):
    """Delete the checkpoint file at *path* if it exists."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _should_process(path, index):
    """Shared resume check used by police_2 and police_3.

    Returns False when *index* lies before the saved checkpoint (the item was
    already handled in an earlier run). Otherwise records *index* as the new
    checkpoint and returns True. The item equal to the checkpoint is processed
    again because the previous run may have died in the middle of it.
    """
    saved = _read_checkpoint(path)
    if saved is not None and index < saved:
        return False
    _write_checkpoint(path, index)
    return True


def police_2(a):
    """Level 2 interruption check: should the book with index *a* be crawled?

    Args:
        a: Zero-based index of the book in the home page list.

    Returns:
        False if the book precedes the saved checkpoint, True otherwise (the
        checkpoint is then updated to *a*).
    """
    return _should_process(BOOK_CHECKPOINT, a)


def police_3(a):
    """Level 3 interruption check: should the chapter with index *a* be crawled?

    Args:
        a: Zero-based index of the chapter inside the current book.

    Returns:
        False if the chapter precedes the saved checkpoint, True otherwise
        (the checkpoint is then updated to *a*).
    """
    return _should_process(CHAPTER_CHECKPOINT, a)


def main():
    """Crawl every book of the classics section, one text file per chapter.

    Progress is checkpointed on disk, so an interrupted run continues from
    the book and chapter it was working on.
    """
    # Site root; the links extracted from the pages are relative to it.
    root = 'http://www.shicimingju.com'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }

    # Level 1: home page -> book links and titles.
    tree1 = get_request(root, headers)
    History_book_url_list, History_book_name_list = get_title_url(tree1)

    # Book that the previous run was working on (None on a fresh start).
    resume_book = _read_checkpoint(BOOK_CHECKPOINT)

    for i, (book_path, book_name) in enumerate(
            zip(History_book_url_list, History_book_name_list)):
        # The chapter checkpoint only describes the book being resumed. For
        # any later book it must be dropped, otherwise the first chapters of
        # that book would be skipped. It is cleared *before* the book
        # checkpoint advances, so a crash in between can only cause a
        # re-download, never a skipped chapter.
        if resume_book is None or i > resume_book:
            _clear_checkpoint(CHAPTER_CHECKPOINT)
        if not police_2(i):
            continue

        # Level 2: book page -> chapter links and titles.
        url2 = root + book_path
        print("爬取>>>" + book_name + '开始')
        tree2 = get_request(url2, headers)
        book_url_list, book_name_list = get_article_url(tree2)
        save_mkdir(book_name)

        for j, (chapter_path, chapter_name) in enumerate(
                zip(book_url_list, book_name_list)):
            if not police_3(j):
                continue
            # Be polite to the server: at most one chapter per second.
            time.sleep(1)

            # Level 3: chapter page -> text.
            url3 = root + chapter_path
            print("爬取:" + chapter_name)
            tree3 = get_request(url3, headers)
            txt = get_article(tree3)

            # File name is the chapter title without spaces and middle dots.
            file_name = chapter_name.replace(' ', '').replace('·', '') + '.txt'
            file_path = os.path.join(OUTPUT_DIR, book_name, file_name)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)

        print("爬取>>>" + book_name + '结束')


if __name__ == '__main__':
    main()

# (Editor: 李大同) [Disclaimer] All content on this site is collected from the
# internet; the views expressed are those of the original author only and do
# not represent the position of this site. If your rights have been
# unintentionally infringed, please contact the site administrator promptly
# to have the content removed.