用 Python 写的有声小说爬虫
发布时间:2020-12-20 11:01:48 所属栏目:Python 来源:网络整理
导读:querybook.py from bs4 import BeautifulSoup from lxml import html import xml import requests import splider class QuName: def __init__(self,number): self.number = number def getPageNum(self,url): f = requests.get(url) # Get该网页从而获取该html内容……
# ---------------------------------------------------------------------------
# querybook.py
# ---------------------------------------------------------------------------
"""Walk the ting89.com category listings and download every book found."""
from bs4 import BeautifulSoup
from lxml import html
import xml
import requests
import splider

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would hang the crawl forever.
REQUEST_TIMEOUT = 30


class QuName:
    """Crawler for the category ("booklist") pages of the site."""

    def __init__(self, number):
        # Upper bound (exclusive) of the category ids that getBookList visits.
        self.number = number

    def getPageNum(self, url):
        """Return the page count shown on the listing page at *url*.

        Returns None when the pager element is missing or its text cannot be
        parsed as an integer.
        """
        f = requests.get(url, timeout=REQUEST_TIMEOUT)  # fetch the page HTML
        soup = BeautifulSoup(f.content, "lxml")
        try:
            pageNum = soup.find('div', class_="pagesnums").find('span').text
            print('getPageNum执行成功')
            # NOTE(review): the slice assumes the count sits at characters 3-4
            # of the pager text - confirm against the live page.
            return int(pageNum[3:5])
        except (AttributeError, ValueError):
            # AttributeError: find() returned None; ValueError: not a number.
            print('getPageNum执行失败')
            return None
        finally:
            print('___________________________')

    def getBookList(self):
        """Visit every category and every page of it, downloading each book."""
        # NOTE(review): range(1, self.number) stops at self.number - 1 and
        # range(2, pageNum) stops at pageNum - 1; confirm whether the last
        # category / last page are meant to be skipped.
        for num in range(1, self.number):
            first_page = f'http://www.ting89.com/booklist/{num}.html'
            # Fall back to a single page when the pager could not be read, so
            # the range() below does not fail on None.
            pageNum = self.getPageNum(first_page) or 1
            self.getBookInfo(first_page)
            print(first_page)
            for num1 in range(2, pageNum):
                page = f'http://www.ting89.com/booklist/{num}_{num1}.html'
                self.getBookInfo(page)
                print(page)

    def getBookInfo(self, url):
        """Print the details of every book listed at *url* and download it.

        NOTE(review): the published listing lost part of this method's header;
        the fetch/parse lines mirror getPageNum.
        """
        f = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(f.content, "lxml")
        try:
            bookList = soup.find('div', class_="clist").findAll('li')
            for i in bookList:
                imgUrl = i.find('img')
                print('书籍封面', imgUrl['src'])
                pList = i.findAll('p')
                for j in pList:
                    print(j.text)
                # Download the audio files of this book.
                splider.YsSpider(i.find('b').text).download_files()
        except (Exception, SystemExit):
            # Best-effort crawl: YsSpider.searchbook() exits the interpreter
            # when a title is not found, so SystemExit is caught here too.
            # KeyboardInterrupt is deliberately left alone.
            print('getBookInfo执行失败')
        finally:
            print('___________________________')


if __name__ == "__main__":
    # Number of categories on the site (hard-coded for simplicity).
    qn = QuName(13)
    qn.getBookList()


# ---------------------------------------------------------------------------
# splider.py
# ---------------------------------------------------------------------------
"""Search ting89.com for one book and download all of its chapters."""
import requests
import urllib
import urllib.parse
import re
import os
import sys
import time

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


class YsSpider:
    """Downloader for a single audio book, looked up by name."""

    def __init__(self, name):
        self.search_name = name
        self.search_url = "http://www.ting89.com/search.asp?searchword="
        self.home_url = "http://www.ting89.com/books/"
        self.index_pattern = r"""<a href="/books/([0-9]+).html" title="(.+?)" target='_blank'>"""
        # "\d+" matches the numeric chapter index; the published listing had
        # "d+" (backslash lost), which can never match a digit.
        self.chapter_pattern = r"""<a href='(/down/?[^-]+-\d+.html)' target="_blank">(.+?)</a>"""
        self.down_pattern = r"""url=(.*)/(.+?).mp3"""
        self.book_id = ''
        self.book_name = ''
        self.Chapter_list = []

    def searchbook(self):
        """Search for the book and return the id of the first hit.

        Also stores the hit in ``self.book_id`` / ``self.book_name``. Prints a
        message and raises SystemExit when the search has no results.
        """
        query = urllib.parse.quote(self.search_name, encoding='gb2312')
        response = requests.get(self.search_url + query, timeout=REQUEST_TIMEOUT)
        data = response.content.decode('gbk')
        result = re.findall(self.index_pattern, data)
        if not result:
            print('*******没有找到你输入的相关书籍,请更换后重新运行程序*******')
            sys.exit()
        for index, i in enumerate(result):
            print('%d.%s' % (index + 1, i[1]))
        # The first hit is always taken; an interactive prompt could replace
        # this constant.
        choice = '1'
        self.book_id, self.book_name = result[int(choice) - 1]
        return self.book_id

    def get_chapter_list(self):
        """Return ``(chapter_page_path, chapter_title)`` pairs for the book."""
        book_url = self.home_url + self.searchbook() + '.html'
        data = requests.get(book_url, timeout=REQUEST_TIMEOUT).content.decode('gbk')
        return re.findall(self.chapter_pattern, data)

    def _getAllUrl(self):
        """Return the download URL of every chapter, in chapter order.

        Raises:
            IndexError: no chapter links were found, or the first chapter page
                does not contain a download address.
        """
        chapter_list = self.get_chapter_list()
        if not chapter_list:
            raise IndexError('no chapter links found for book %r' % self.book_id)
        self.Chapter_list = [x[1] for x in chapter_list]
        first_page = "http://www.ting89.com" + chapter_list[0][0]
        data = requests.get(first_page, timeout=REQUEST_TIMEOUT).content.decode('gbk')
        result = re.findall(self.down_pattern, data)
        if not result:
            raise IndexError('no download address found on %s' % first_page)
        base_url, first_name = result[0]
        # A first file name such as "01" or "001" signals numbered files.
        numbered = re.search("^0.*1$", first_name)
        return self.sub_get_url(base_url, self.Chapter_list, numbered)

    def sub_get_url(self, down_url, _list, down_url_flag):
        """Build one quoted .mp3 URL per entry of *_list* under *down_url*.

        Args:
            down_url: directory URL that holds the audio files.
            _list: chapter titles, in order.
            down_url_flag: truthy when files are named by zero-padded sequence
                number, falsy when they are named after the chapter title. If
                it is the ``re.Match`` of the first file name, that name's
                length is used as the minimum padding width.
        """
        if not down_url_flag:
            return [
                urllib.parse.quote(down_url + '/' + item + ".mp3", safe='/:?=')
                for item in _list
            ]
        # Pad to the width of the largest number that will be generated ...
        width = len(str(len(_list)))
        # ... but never narrower than the file name actually seen on the page.
        if isinstance(down_url_flag, re.Match):
            width = max(width, len(down_url_flag.group(0)))
        return [
            urllib.parse.quote(
                down_url + '/' + str(number).zfill(width) + '.mp3', safe='/:?='
            )
            for number in range(1, len(_list) + 1)
        ]

    def save_a_file(self, url, path, chapter, max_retries=3):
        """Download *url* to *path* unless that file already exists.

        Args:
            url: address of the audio file.
            path: destination file path.
            chapter: chapter title, used only in progress messages.
            max_retries: how many extra attempts to make after a failure.
        """
        for attempt in range(max_retries + 1):
            try:
                print('尝试下载', chapter)
                if os.path.exists(path):
                    print('文件已经存在')
                    return
                with requests.get(url, timeout=REQUEST_TIMEOUT) as response:
                    # Do not store an HTTP error page as if it were audio.
                    response.raise_for_status()
                    content = response.content
                # Write to a temporary name first so that an interrupted write
                # is not mistaken for a finished download on the next attempt.
                partial = path + '.part'
                with open(partial, 'wb') as f:
                    f.write(content)
                os.replace(partial, path)
                print(chapter, '保存成功')
                time.sleep(1)  # be polite to the server between downloads
                return
            except (requests.RequestException, OSError):
                if attempt < max_retries:
                    print('爬取失败,已下载至', chapter, '即将重新尝试下载')
        print(chapter, '多次重试后仍然失败,已跳过')

    def download_files(self):
        """Download every chapter into ``<cwd>/<book name>/<chapter>.mp3``."""
        result = self._getAllUrl()  # download URL of every chapter
        root = os.path.join(os.getcwd(), self.book_name)
        os.makedirs(root, exist_ok=True)
        for url, chapter in zip(result, self.Chapter_list):
            path = os.path.join(root, chapter) + '.mp3'
            self.save_a_file(url, path, chapter)


# (Editor: Li Datong) [Disclaimer] All content on this site comes from the
# Internet; the views expressed are solely those of the author and do not
# represent the position of this site. If your rights have been inadvertently
# infringed, please contact the webmaster promptly to have the content removed.