Python 单线程爬虫代码(广度优先)
发布时间:2020-12-20 10:11:38 所属栏目:Python 来源:网络整理
导读:广度优先算法:下面给出一个基于 urllib.request 与 BeautifulSoup 的单线程广度优先(BFS)爬虫的完整实现代码,包含抓取页面、解析站内链接、维护已访问/未访问 URL 集合等逻辑。
# -*- coding: utf-8 -*-
"""Single-threaded breadth-first web crawler (reconstructed from a garbled
blog listing).

Starting from a seed URL, ``Crawler.analysis`` repeatedly pops an unvisited
URL, downloads the page, and harvests every in-site ``<a href>`` link into
the unvisited set until the frontier is empty.
"""
import urllib.request
import threading

from bs4 import BeautifulSoup

# Lock kept from the original listing; unused in this single-threaded
# version but retained for a future multi-threaded variant.
mylock = threading.RLock()


class Crawler:
    # NOTE(review): these are class-level (shared) containers in the original
    # listing — all Crawler instances share one frontier. Kept as-is for
    # backward compatibility.
    unVisitUrl = set()      # frontier: URLs discovered but not yet fetched
    visitedUrl = []         # URLs already fetched, in fetch order

    def getHtml(self, url):
        """Fetch *url* and return its body decoded as UTF-8.

        Returns '' on any network/decoding error (best-effort, errors are
        deliberately swallowed as in the original).
        """
        html = ''
        req = urllib.request.Request(url, headers={
            'Connection': 'Keep-Alive',
            'Accept': 'text/html,application/xhtml+xml,*/*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': ('Mozilla/5.0 (Windows NT 6.3; WOW64; '
                           'Trident/7.0; rv:11.0) like Gecko'),
        })
        try:
            # Close the response deterministically (the original leaked it).
            with urllib.request.urlopen(req, timeout=10) as response:
                html = response.read().decode('UTF-8').replace('&nbsp;', '')
        except Exception:
            # Best-effort fetch: any failure yields the empty string.
            pass
        return html

    def getUrlFromHtml(self, html, sitePath):
        """Extract every absolute in-site link from *html* into the frontier.

        A link qualifies when its href contains *sitePath* and starts with
        'http://' (same filter as the original listing).
        """
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            for a in soup.find_all('a'):
                try:
                    href = a['href']
                    if sitePath in href and href.startswith('http://'):
                        self.addUnVisitUrl(href)
                except KeyError:
                    # <a> tag without an href attribute — skip it.
                    pass

    def analysis(self, url, sitePath):
        """BFS crawl loop: seed with *url*, then drain the frontier."""
        self.initUnVisitUrl(url)
        while len(self.unVisitUrl) > 0:
            visitingUrl = self.getUnVisitUrl()
            print(visitingUrl)
            if visitingUrl:
                html = self.getHtml(visitingUrl)
                self.addVisitedUrl(visitingUrl)
                if html:
                    # Harvest this page's in-site links into the frontier.
                    self.getUrlFromHtml(html, sitePath)

    def initUnVisitUrl(self, url):
        """Seed the frontier with the root URL."""
        self.unVisitUrl.add(url)

    def addUnVisitUrl(self, url):
        """Queue *url* unless it is already queued or already visited."""
        if url not in self.unVisitUrl and url not in self.visitedUrl:
            self.unVisitUrl.add(url)

    def getUnVisitUrl(self):
        """Pop and return one URL from the frontier, or None if empty."""
        url = None
        unVisitUrlTmp = list(self.unVisitUrl)
        if unVisitUrlTmp:
            url = unVisitUrlTmp[0]
            self.unVisitUrl.remove(url)
        return url

    def addVisitedUrl(self, url):
        """Record *url* as fetched."""
        self.visitedUrl.append(url)

# (编辑:李大同)
# 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。
# 若无意侵犯到您的权利,请及时与联系站长删除相关内容!