加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 编程开发 > Python > 正文

python-爬虫-史书典籍

发布时间:2020-12-20 12:45:07 所属栏目:Python 来源:网络整理
导读:import requests import os from lxml import html import time def get_title_url(tree): ''' 一级 获取标题 ''' # 史书典籍 # 格式:/book/sanguoyanyi.html History_book_url_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href")
import requests
import os
from lxml import html
import time


def get_title_url(tree):
    """Level 1: extract the book links and titles from the site's root page.

    Args:
        tree: Parsed lxml HTML tree of the root page.

    Returns:
        A ``(url_list, name_list)`` tuple. ``url_list`` holds the ``href``
        values (site-relative, e.g. ``/book/sanguoyanyi.html``) and
        ``name_list`` holds the link texts (the book titles), both taken from
        the third ``div`` with class ``index-li``.
    """
    # The XPath literals need ASCII quotes; typographic quotes make the
    # expression invalid.
    link_xpath = "//div[@class='index-li'][3]/ul/li/a"
    history_book_url_list = tree.xpath(link_xpath + "/@href")
    history_book_name_list = tree.xpath(link_xpath + "/text()")
    return history_book_url_list, history_book_name_list


def get_article_url(tree):
    """Level 2: extract the chapter links and chapter titles of one book.

    Args:
        tree: Parsed lxml HTML tree of a book's table-of-contents page.

    Returns:
        A ``(url_list, name_list)`` tuple. ``url_list`` holds the ``href``
        values (site-relative, e.g. ``/book/sanguoyanyi/1.html``) and
        ``name_list`` holds the link texts (the chapter titles), both taken
        from the ``div`` with class ``book-mulu``.
    """
    # The XPath literals need ASCII quotes; typographic quotes make the
    # expression invalid.
    link_xpath = "//div[@class='book-mulu']/ul/li/a"
    book_url_list = tree.xpath(link_xpath + "/@href")
    book_name_list = tree.xpath(link_xpath + "/text()")
    return book_url_list, book_name_list


def get_article(tree):
    """Level 3: extract the body text of a single chapter page.

    Args:
        tree: Parsed lxml HTML tree of a chapter page
            (e.g. ``/book/sanguoyanyi/1.html``).

    Returns:
        The text nodes of every ``<p>`` directly under the ``div`` with class
        ``chapter_content``, concatenated with no separator. An empty string
        is returned when nothing matches.
    """
    paragraphs = tree.xpath("//div[@class='chapter_content']/p/text()")
    return "".join(paragraphs)

def get_request(url, headers, timeout=10):
    """Fetch a page and return it as a parsed lxml HTML tree.

    Args:
        url: Absolute URL of the page to download.
        headers: Mapping of HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The tree produced by ``lxml.html.fromstring`` from the response text.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (raised by ``raise_for_status``), so that error
            pages are not silently parsed as content.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return html.fromstring(response.text)

def save_mkdir(two):
    """Ensure the output folder for one book exists.

    Creates the first-level folder ``史书典籍`` and, inside it, the
    second-level folder named ``two``. Folders that already exist are left
    untouched.

    Args:
        two: Name of the second-level (per-book) folder.
    """
    # makedirs creates both levels in one call, and exist_ok avoids the
    # check-then-create race of a separate exists()/mkdir() pair.
    os.makedirs(os.path.join("史书典籍", two), exist_ok=True)

def _checkpoint_allows(path, a):
    """Shared resume check: decide whether index ``a`` should be processed.

    The file at ``path`` stores the last index that was started. An index
    lower than the stored one was already handled in an earlier run and is
    skipped; any other index is recorded as the new checkpoint.

    Args:
        path: Location of the checkpoint file.
        a: Zero-based index about to be processed.

    Returns:
        False when ``a`` is below the stored checkpoint, True otherwise.
    """
    try:
        with open(path, "r") as f:
            saved = f.read().strip()
    except FileNotFoundError:
        saved = ""

    if saved:
        try:
            last_index = int(saved)
        except ValueError:
            # Unreadable checkpoint content: treat it as "no checkpoint".
            last_index = None
        if last_index is not None and a < last_index:
            return False

    # Record the index as the new checkpoint. The parent folder may not exist
    # yet on a first run, so create it before writing.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as f:
        f.write(str(a))
    return True


def police_2(a):
    """Level-2 (book) interruption check.

    Args:
        a: Zero-based index of the book about to be crawled.

    Returns:
        False if the book was already passed in an earlier run (its index is
        below the one stored in ``史书典籍/police_2.txt``); otherwise the index
        is stored and True is returned.
    """
    return _checkpoint_allows("史书典籍/police_2.txt", a)


def police_3(a):
    """Level-3 (chapter) interruption check.

    Args:
        a: Zero-based index of the chapter about to be crawled.

    Returns:
        False if the chapter was already passed in an earlier run (its index
        is below the one stored in ``史书典籍/police_3.txt``); otherwise the
        index is stored and True is returned.
    """
    return _checkpoint_allows("史书典籍/police_3.txt", a)


def main():
    """Crawl every listed book and save each chapter as a UTF-8 text file.

    The root page yields the book list, each book page yields its chapter
    list, and each chapter page yields the text, which is written to
    ``史书典籍/<book name>/<chapter name>.txt``. ``police_2`` and ``police_3``
    provide book-level and chapter-level checkpoints so that an interrupted
    run can be resumed.
    """
    # Site root; the scraped hrefs are site-relative and are appended to it.
    root = "http://www.shicimingju.com"
    # Request headers
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/76.0.3809.87 Safari/537.36"
    }

    # The checkpoint files live in the root output folder, so it must exist
    # before the first checkpoint is written.
    os.makedirs("史书典籍", exist_ok=True)

    # Level 1: book links and titles from the root page.
    tree1 = get_request(root, headers)
    history_book_url_list, history_book_name_list = get_title_url(tree1)

    for i, (book_url, book_name) in enumerate(
        zip(history_book_url_list, history_book_name_list)
    ):
        # Skip books already completed in an earlier run.
        if not police_2(i):
            continue

        # Level 2: chapter links and titles of this book.
        print("爬取>>>" + book_name + "开始")
        tree2 = get_request(root + book_url, headers)
        book_url_list, book_name_list = get_article_url(tree2)

        # Make sure the per-book output folder exists.
        save_mkdir(book_name)

        for j, (chapter_url, chapter_name) in enumerate(
            zip(book_url_list, book_name_list)
        ):
            # Skip chapters already downloaded in an earlier run.
            if not police_3(j):
                continue
            # Pause between requests to avoid hammering the server.
            time.sleep(1)

            # Level 3: chapter text.
            print("爬取:" + chapter_name)
            tree3 = get_request(root + chapter_url, headers)
            txt = get_article(tree3)

            # Spaces and the middle dot are dropped from the chapter title
            # to form the file name.
            file_name = chapter_name.replace(" ", "").replace("·", "")
            file_path = "史书典籍/{}/{}.txt".format(book_name, file_name)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(txt)

        # The chapter checkpoint belongs to the book just finished. Clear it,
        # otherwise the next book would skip every chapter whose index is
        # below this book's last chapter index.
        try:
            os.remove("史书典籍/police_3.txt")
        except FileNotFoundError:
            pass

        print("爬取>>>" + book_name + "结束")



# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    推荐文章
      热点阅读