抓取知乎日报内容在本地另存为txt文档
发布时间:2020-12-17 17:12:21 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。本文提供一个 Python 脚本(getZhihu.py),用于抓取知乎日报内容并在本地另存为 txt 文档,完整代码见下文。
以下代码由PHP站长网 52php.cn收集自互联网 现在PHP站长网小编把它分享给大家,仅供参考 #Filename:getZhihu.py import re,os import urllib2 from bs4 import BeautifulSoup import sys import time reload(sys) sys.setdefaultencoding("utf-8") def getHtml(url): header={'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1','Referer' : '******'} request=urllib2.Request(url,None,header) response=urllib2.urlopen(request) text=response.read() return text def mkDir(): date=time.strftime('%Y-%m-%d',time.localtime(time.time())) os.mkdir(str(date)) def saveText(text): date=time.strftime('%Y-%m-%d',time.localtime(time.time())) dir_name="/home/wang/Documents/py/Zhihu/"+date soup=BeautifulSoup(text) # i=1 # for i in soup.h2: # i=i+1 if soup.h2.get_text()=='': filename=dir_name+"/ad.txt" fp=file(filename,'w') content=soup.find('div',"content") content=content.get_text() fp.write(content) fp.close() # elif i > 1: # filename=dir_name+"/kiding.txt" # contents=soup.findAll('div',"content")+soup.findAll("div","question") # contents=contents.get_text() # fp=file(filename,'w') # fp.write(contents) # fp.close() else: filename=dir_name+"/"+soup.h2.get_text()+".txt" fp=file(filename,"content") content=content.get_text() fp.write(content) fp.close() # print content #test def getUrl(url): html=getHtml(url) # print html soup=BeautifulSoup(html) urls_page=soup.find('div',"post-body") # print urls_page urls=re.findall('"((http)://.*?)"',str(urls_page)) return urls def main(): mkDir() page="http://zhihudaily.ahorn.me" urls=getUrl(page) for url in urls: text=getHtml(url[0]) saveText(text) if __name__=="__main__": main() 以上内容由PHP站长网【52php.cn】收集整理供大家参考研究 如果以上内容对您有帮助,欢迎收藏、点赞、推荐、分享。 (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |