Python 精简版搜索引擎
发布时间:2020-12-17 17:21:05 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 from html.parser import HTMLParser from urllib.request import urlopen from urllib import parse class LinkParser(HTMLParser): def handle_start…
# The following code was collected from the internet by PHP站长网 (52php.cn)
# and is shared for reference only.
#
# A minimal "search engine": a breadth-first crawler that follows <a href>
# links from a start page until it reaches a page containing a keyword.

from collections import deque
from html.parser import HTMLParser
from http.client import HTTPException
from urllib import parse
from urllib.request import urlopen


class LinkParser(HTMLParser):
    """HTML parser that collects the absolute URL of every ``<a href>``."""

    def __init__(self):
        super().__init__()
        # Initialised here so the parser can be fed before getLinks() runs.
        self.links = []
        self.baseUrl = ""

    def handle_starttag(self, tag, attrs):
        """Record the target of an anchor tag, resolved against ``baseUrl``."""
        if tag != 'a':
            return
        for key, value in attrs:
            # A bare ``href`` attribute has the value None: nothing to follow.
            if key == 'href' and value:
                self.links.append(parse.urljoin(self.baseUrl, value))

    def getLinks(self, url, timeout=10):
        """Fetch *url* and return ``(html_text, links)``.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait on the connection before giving up.

        Returns:
            A tuple of the decoded page text and the list of absolute link
            URLs found in it, or ``("", [])`` when the response is not
            ``text/html``.

        Raises:
            OSError: The page could not be fetched (``URLError`` and
                ``HTTPError`` are subclasses).
            ValueError: *url* is not a usable URL.
        """
        self.links = []
        self.baseUrl = url
        # The context manager closes the connection even when reading fails.
        with urlopen(url, timeout=timeout) as response:
            headers = response.headers
            # Compare only the media type; the charset parameter varies
            # between servers ("UTF-8", "utf-8", absent, ...).
            if headers.get_content_type() != 'text/html':
                return "", []
            htmlBytes = response.read()
        charset = headers.get_content_charset() or "utf-8"
        try:
            htmlString = htmlBytes.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset Python does not know.
            htmlString = htmlBytes.decode("utf-8", errors="replace")
        self.feed(htmlString)
        return htmlString, self.links


def spider(url, word, maxPages):
    """Crawl breadth-first from *url* until a page contains *word*.

    Progress and the final result are printed to standard output.

    Args:
        url: Page at which the crawl starts.
        word: Text to search for in each downloaded page.
        maxPages: Maximum number of pages to request.
    """
    pagesToVisit = deque([url])
    # URLs already queued, so that no page is requested twice.
    seen = {url}
    numberVisited = 0
    foundWord = False
    while numberVisited < maxPages and pagesToVisit and not foundWord:
        numberVisited += 1
        url = pagesToVisit.popleft()
        print(numberVisited, "搜索页:", url)
        try:
            data, links = LinkParser().getLinks(url)
        except (OSError, ValueError, HTTPException):
            # One unreachable or malformed page must not stop the crawl.
            print(" **错误!**")
            continue
        for link in links:
            if link not in seen:
                seen.add(link)
                pagesToVisit.append(link)
        if word in data:
            foundWord = True
        # Printed once per successfully fetched page, mirroring the error
        # message above.
        print(" **成功!**")
    if foundWord:
        print("该关键字", word, "搜索成功", url)
    else:
        print("没有找到任何有关的网页")


if __name__ == "__main__":
    spider("http://yuedu.fm/", "夏洛特", 100)

# The above content was collected and organised by PHP站长网 (52php.cn) for
# reference and study. If it helped you, feel free to bookmark, like,
# recommend and share it. (Editor: 李大同)
# Disclaimer: all content on the site comes from the internet; the views
# expressed are the authors' own and do not represent the site's position.
# If your rights have been unintentionally infringed, please contact the
# webmaster promptly to have the content removed.