简单的爬虫,从 html 中提取表格信息
发布时间:2020-12-17 17:13:28 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。完整代码见下文。
以下代码由PHP站长网 52php.cn收集自互联网 现在PHP站长网小编把它分享给大家,仅供参考 #!/usr/bin/env python #coding=utf8 try: import os import urllib import pycurl try: from cStringIO import StringIO except ImportError: from StringIO import StringIO from pyquery import PyQuery as pyq import sys reload(sys) sys.setdefaultencoding('utf-8') except ImportError: print >> sys.stderr,""" There was a problem importing one of the Python modules required. The error leading to this problem was: %s Please install a package which provides this module,or verify that the module is installed correctly. It's possible that the above module doesn't match the current version of Python,which is: %s """ % (sys.exc_info(),sys.version) sys.exit(1) __prog__ = "crawl" __site__ = "http://www.oschina.net/code" __version__ = "1.0" class HttpRequest(object): curl = None def __init__(self): self.url = None self.url_para = None self.curl = pycurl.Curl() self.curl.setopt(pycurl.VERBOSE,0) self.curl.setopt(pycurl.USERAGENT,'Miozilla/4.0 (compatible; MSIE 8.0; WindowsNT 6.1)') self.curl.setopt(pycurl.HEADER,1) self.curl.setopt(pycurl.FOLLOWLOCATION,1) self.curl.setopt(pycurl.MAXREDIRS,5) self.curl.setopt(pycurl.COOKIEFILE,'cookie.dat') self.curl.setopt(pycurl.COOKIEJAR,'cookie.dat') self.curl.setopt(pycurl.HTTPGET,1) self.curl.setopt(pycurl.ENCODING,'gzip,deflate') self.curl.setopt(pycurl.CONNECTTIMEOUT,60) self.curl.setopt(pycurl.TIMEOUT,300) def set_url_para(self,para): self.url_para = para url = self.url + para self.curl.setopt(pycurl.URL,url) def set_post_para(self,para): self.curl.setopt(pycurl.POST,1) self.curl.setopt(pycurl.POSTFIELDS,urllib.urlencode(para)) def set_cookie(self,cookie): self.curl.setopt(pycurl.COOKIE,cookie) def dry_write(self,buf): pass def download(self,url,file_path): dir = os.path.dirname(file_path) if not os.path.exists(dir): os.makedirs(dir) self.curl.setopt(pycurl.URL,url) self.curl.setopt(pycurl.HEADER,False) self.curl.setopt(pycurl.HEADERFUNCTION,self.dry_write) #忽略包头信息,否则会写入文件?! 
with open(file_path,'wb') as outfile: self.curl.setopt(pycurl.WRITEFUNCTION,outfile.write) try: self.curl.perform() except Exception,e: self.curl.close() def perform(self,referer=''): assert url,'url is null!' self.curl.setopt(pycurl.URL,url) referer and self.curl.setopt(pycurl.REFERER,referer) self.buf = StringIO() self.head = StringIO() self.curl.setopt(pycurl.WRITEFUNCTION,self.buf.write) self.curl.setopt(pycurl.HEADERFUNCTION,self.head.write) try: self.curl.perform() self.r = self.buf.getvalue() self.h = self.head.getvalue() self.code = self.curl.getinfo(pycurl.HTTP_CODE) self.info = self.curl.getinfo(pycurl.EFFECTIVE_URL) self.cookie = self.curl.getinfo(pycurl.INFO_COOKIELIST) self.curl.setopt(pycurl.REFERER,self.info) #AUTO REFERER except Exception,e: self.curl.close() self.buf.close() self.head.close() def __del__(self): self.curl.close() def get_body(self): return self.r def get_head(self): return self.h def get_code(self): return self.code def get_info(self): return self.info def get_cookie(self): return self.cookie if __name__ == '__main__': asp_range = xrange(1,10) page_range = xrange(1,10) crawl = HttpRequest() for i in asp_range: for j in page_range: url = 'http://www.nbbicycle.com/html/116/s%d.asp?i=1&page=%d' % (i,j) try: crawl.perform(url) doc = pyq(crawl.get_body()) content = doc('.contd') print content.children('div').eq(0).text() for tr in content.items('tr'): print tr.text() except Exception,e: print e 以上内容由PHP站长网【52php.cn】收集整理供大家参考研究 如果以上内容对您有帮助,欢迎收藏、点赞、推荐、分享。 (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |