半夜不睡觉,写个爬虫爬小说《完美世界》
发布时间:2020-12-17 17:28:39 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 __author__ = 'zouxiaoliang' import urllib import re import os import codecs def getBookMemu(url_path): menu_patten = '<dd>.*?</dd>' url_chapter = '<dd>…
"""Download a novel from a biquge.la index page, one text file per chapter.

The script reads the book's table of contents, fetches every chapter page,
extracts the text of ``<div id="content">`` and stores it as
``biquge/<chapter title>.txt`` (UTF-8).

Publication notes carried over from the page this script was posted on
(translated):
    * The code was collected from the internet and shared by the PHP
      webmaster site 52php.cn, for reference and study only.
    * If the content helped you, feel free to bookmark, like, recommend
      and share it. (Editor: Li Datong)
    * Disclaimer: all content on the site comes from the internet; the
      opinions expressed are the author's own and do not represent the
      site. If your rights were unintentionally infringed, contact the
      webmaster to have the content removed.
"""
__author__ = 'zouxiaoliang'

import codecs
import contextlib
import os
import re
import time

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2, which the script was originally written for
    from urllib2 import urlopen
    from urlparse import urljoin

BOOK_URL = 'http://www.biquge.la/book/14/'
OUTPUT_DIR = 'biquge'
PAGE_ENCODING = 'gbk'
REQUEST_TIMEOUT_SECONDS = 30
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 1

_MENU_ITEM_RE = re.compile(r'<dd>.*?</dd>')
# [^"]* keeps the href capture from running past its closing quote.
_CHAPTER_LINK_RE = re.compile(r'<dd><a href="([^"]*)">(.*?)</a></dd>')
_CONTENT_RE = re.compile(r'<div id="content">(.*)</div>')


def _fetch_page(url_path):
    """Return the document at *url_path* decoded as GBK text.

    The response is always closed, and the request is bounded by
    ``REQUEST_TIMEOUT_SECONDS`` so a stalled connection cannot hang the run.
    """
    response = urlopen(url_path, timeout=REQUEST_TIMEOUT_SECONDS)
    with contextlib.closing(response):
        return response.read().decode(PAGE_ENCODING)


def getBookMemu(url_path):
    """Return the table of contents found at *url_path*.

    Args:
        url_path: URL of the book's index page.

    Returns:
        dict mapping each chapter's URL to its title. Entries come from
        ``<dd><a href="...">title</a></dd>`` items; ``<dd>`` items of any
        other shape are ignored.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched, and
        ``UnicodeDecodeError`` if the page is not valid GBK.
    """
    page = _fetch_page(url_path)
    menu = dict()  # map<url, chapter title>
    for item in _MENU_ITEM_RE.findall(page):
        link = _CHAPTER_LINK_RE.match(item)
        if link:
            # urljoin equals plain concatenation for hrefs relative to a
            # directory URL, and also resolves root-relative/absolute hrefs.
            menu[urljoin(url_path, link.group(1))] = link.group(2)
    return menu


def getContent(url_path):
    """Return the plain text of the chapter page at *url_path*.

    Args:
        url_path: URL of a single chapter page.

    Returns:
        The inner text of ``<div id="content">`` with non-breaking-space
        padding removed and each ``<br /><br />`` turned into a newline, or
        an empty string when the page has no such div.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched, and
        ``UnicodeDecodeError`` if the page is not valid GBK.
    """
    page = _fetch_page(url_path)
    found = _CONTENT_RE.search(page)
    if not found:
        return ''
    text = found.group(1)
    # The published listing had ' ' and 'n' here, apparently the rendered
    # remains of '&nbsp;' and '\n': taken literally they delete every space
    # and put the letter "n" between paragraphs.
    text = text.replace(u'&nbsp;', u'').replace(u'\u00a0', u'')
    return text.replace(u'<br /><br />', u'\n')


def writeFile(dirname, filename, content):
    """Write *content* to ``<dirname>/<filename>.txt`` encoded as UTF-8.

    Args:
        dirname: existing directory to write into.
        filename: file name without the ``.txt`` extension.
        content: text to store.

    Raises:
        IOError/OSError: if the file cannot be created or written.
    """
    path = os.path.join(dirname, filename + ".txt")
    # The context manager closes the handle even if the write fails.
    with codecs.open(path, mode='wb', encoding='utf8') as w_handle:
        w_handle.write(content)


def _download_chapter(url):
    """Return the chapter text at *url*, or None once every attempt failed."""
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return getContent(url)
        # The network stack raises several unrelated exception types
        # (URLError, socket.timeout, HTTPException). Catching Exception,
        # unlike a bare except, still lets Ctrl-C stop the program.
        except Exception as err:
            print('attempt %d/%d failed for %s: %s'
                  % (attempt, MAX_ATTEMPTS, url, err))
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
    return None


def main():
    """Download every chapter listed at BOOK_URL into OUTPUT_DIR."""
    menu = getBookMemu(BOOK_URL)
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)
    for url, name in menu.items():
        print('%s,%s' % (url, name))
        content = _download_chapter(url)
        if content is None:
            # Skip the chapter instead of retrying forever.
            continue
        try:
            writeFile(OUTPUT_DIR, name, content)
        except (IOError, OSError) as err:
            # e.g. a chapter title that is not a valid file name.
            print('could not save %s: %s' % (name, err))
    print("get book over")


if __name__ == '__main__':
    main()