
A simple crawler: extracting table data from HTML

Published: 2020-12-17 17:13:28 · Category: Python · Source: collected from the web

The script below, shared for reference, is a small crawler: it fetches pages with pycurl and uses pyquery to pull the contents of a table out of the returned HTML.

#!/usr/bin/env python
#coding=utf8

import sys  # imported before the try block so the except handler below can use it

try:
    import os
    import urllib
    import pycurl
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    from pyquery import PyQuery as pyq

    reload(sys)
    sys.setdefaultencoding('utf-8')

except ImportError:
    print >> sys.stderr, """

There was a problem importing one of the Python modules required.
The error leading to this problem was:

%s

Please install a package which provides this module, or
verify that the module is installed correctly.

It's possible that the above module doesn't match the current version of Python, which is:

%s

""" % (sys.exc_info(), sys.version)
    sys.exit(1)


__prog__ = "crawl"
__site__ = "http://www.oschina.net/code"
__version__ = "1.0"


class HttpRequest(object):
    def __init__(self):
        self.url = None
        self.url_para = None
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.VERBOSE, 0)

        self.curl.setopt(pycurl.USERAGENT, 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)')
        self.curl.setopt(pycurl.HEADER, 0)            # headers are captured separately via HEADERFUNCTION
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)    # follow redirects
        self.curl.setopt(pycurl.MAXREDIRS, 5)
        self.curl.setopt(pycurl.COOKIEFILE, 'cookie.dat')
        self.curl.setopt(pycurl.COOKIEJAR, 'cookie.dat')
        self.curl.setopt(pycurl.HTTPGET, 1)
        self.curl.setopt(pycurl.ENCODING, 'gzip,deflate')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 60)
        self.curl.setopt(pycurl.TIMEOUT, 300)

    def set_url_para(self, para):
        # append a query string to the base URL recorded by perform()
        assert self.url, 'call perform() first to set a base url'
        self.url_para = para
        self.curl.setopt(pycurl.URL, self.url + para)

    def set_post_para(self, para):
        self.curl.setopt(pycurl.POST, 1)
        self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(para))

    def set_cookie(self, cookie):
        self.curl.setopt(pycurl.COOKIE, cookie)

    def dry_write(self, buf):
        # discard the data passed in; used to suppress header output
        pass

    def download(self, url, file_path):
        dir = os.path.dirname(file_path)
        if dir and not os.path.exists(dir):
            os.makedirs(dir)
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.HEADER, False)
        # route headers to a no-op writer, otherwise they would be written into the file
        self.curl.setopt(pycurl.HEADERFUNCTION, self.dry_write)
        with open(file_path, 'wb') as outfile:
            self.curl.setopt(pycurl.WRITEFUNCTION, outfile.write)
            # let errors propagate to the caller: closing the handle here would break later requests
            self.curl.perform()

    def perform(self, url, referer=''):
        assert url, 'url is null!'
        self.url = url
        self.curl.setopt(pycurl.URL, url)
        if referer:
            self.curl.setopt(pycurl.REFERER, referer)

        self.buf = StringIO()
        self.head = StringIO()
        self.curl.setopt(pycurl.WRITEFUNCTION, self.buf.write)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.head.write)
        try:
            self.curl.perform()
            self.r = self.buf.getvalue()
            self.h = self.head.getvalue()
            self.code = self.curl.getinfo(pycurl.HTTP_CODE)
            self.info = self.curl.getinfo(pycurl.EFFECTIVE_URL)
            self.cookie = self.curl.getinfo(pycurl.INFO_COOKIELIST)

            self.curl.setopt(pycurl.REFERER, self.info)  # auto referer for the next request
        finally:
            self.buf.close()
            self.head.close()

    def __del__(self):
        self.curl.close()

    def get_body(self):
        return self.r

    def get_head(self):
        return self.h

    def get_code(self):
        return self.code

    def get_info(self):
        return self.info

    def get_cookie(self):
        return self.cookie

if __name__ == '__main__':

    asp_range = xrange(1, 10)   # s1.asp .. s9.asp
    page_range = xrange(1, 10)  # pages 1 .. 9
    crawl = HttpRequest()
    for i in asp_range:
        for j in page_range:
            url = 'http://www.nbbicycle.com/html/116/s%d.asp?i=1&page=%d' % (i, j)
            try:
                crawl.perform(url)
                doc = pyq(crawl.get_body())
                content = doc('.contd')                      # the div holding the data table
                print content.children('div').eq(0).text()   # page title above the table
                for tr in content.items('tr'):               # one line of text per table row
                    print tr.text()
            except Exception, e:
                print e
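
To see the extraction logic in isolation, here is a minimal, self-contained sketch that runs the same pyquery calls against an inline HTML snippet instead of the live site. The markup and its contents are invented for illustration; they only mirror the ".contd" structure the script assumes about the target pages. It needs no network and runs on Python 2 or 3.

from pyquery import PyQuery as pyq

# Hypothetical markup shaped like the pages the crawler targets:
# a ".contd" container holding a title div followed by a table.
html = """
<div class="contd">
  <div>Parts price list</div>
  <table>
    <tr><td>Frame</td><td>120.00</td></tr>
    <tr><td>Wheel</td><td>45.50</td></tr>
  </table>
</div>
"""

doc = pyq(html)
content = doc('.contd')
print(content.children('div').eq(0).text())  # -> Parts price list
for tr in content.items('tr'):               # -> Frame 120.00 / Wheel 45.50
    print(tr.text())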

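The script above is Python 2 throughout (cStringIO, reload(sys), print statements, xrange). As a rough guide only, here is a hedged sketch of how the same crawl loop might look on Python 3, swapping pycurl for the requests library; requests is not part of the original script, and the URL pattern, page ranges, and ".contd" selector are carried over as-is.

import requests
from pyquery import PyQuery as pyq

HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)'}

def crawl_page(url):
    # fetch one page and print the title div plus every table row
    resp = requests.get(url, headers=HEADERS, timeout=60)
    resp.raise_for_status()
    doc = pyq(resp.text)
    content = doc('.contd')
    print(content.children('div').eq(0).text())
    for tr in content.items('tr'):
        print(tr.text())

if __name__ == '__main__':
    for i in range(1, 10):
        for j in range(1, 10):
            url = 'http://www.nbbicycle.com/html/116/s%d.asp?i=1&page=%d' % (i, j)
            try:
                crawl_page(url)
            except Exception as e:
                print(e)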
