Python小爬虫实例

发布时间：2020-12-20 10:11:46 所属栏目：Python 来源：网络整理

导读：有几个注意点： # -*- coding: utf-8 -*- # func passport jw.qdu.edu.cn import re import urllib # python3后urllib.request代替urllib2 import urllib.request import json from bs4 import BeautifulSoup class taofen: def getHtml(self,pageurl): # 获取网站html代码 req = urllib.re……

有几个注意点：

# -*- coding: utf-8 -*- 
# func passport jw.qdu.edu.cn
import json
import re
import urllib
# In Python 3, urllib.request replaces urllib2.
import urllib.request

from bs4 import BeautifulSoup

class taofen:
    """Small scraper: download a page and pull product entries out of its ``<li>`` items."""

    def getHtml(self, pageurl):
        """Fetch the HTML source of a page.

        Args:
            pageurl: URL of the page to download.

        Returns:
            The response body decoded as UTF-8 with every ``&nbsp`` substring
            removed, or an empty string if the body cannot be read or is not
            valid UTF-8.

        Raises:
            urllib.error.URLError: propagated from ``urlopen`` when the
                request itself cannot be opened.
        """
        # Browser-like headers sent with the request.
        req = urllib.request.Request(pageurl, headers={
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        })
        # The context manager guarantees the response is closed on every path.
        with urllib.request.urlopen(req, timeout=2) as response:
            try:
                raw = response.read()
                # NOTE: only the literal "&nbsp" is stripped; a trailing ";"
                # of a full "&nbsp;" entity is left in place.
                return raw.decode('UTF-8').replace('&nbsp', '')
            except (OSError, UnicodeDecodeError):
                # Best effort: an unreadable or undecodable body yields an
                # empty page instead of falling through to an unbound local.
                return ''

    def getKind(self, html):
        """Extract product entries from page HTML.

        An entry is produced for every ``<li>`` that contains both an
        ``<img>`` and an element with CSS class ``change_price``.

        Args:
            html: HTML text to parse.

        Returns:
            A list of dicts with keys ``img`` (the image's ``original``
            attribute), ``name`` (its ``alt`` attribute) and ``price`` (the
            ``.string`` of the ``change_price`` element). A value is ``None``
            when the attribute or string is absent.
        """
        soup = BeautifulSoup(html, "html.parser")
        res = []
        for li in soup.find_all('li'):
            image = li.img
            # Look the price element up once and reuse it below.
            price_tag = li.find(class_='change_price')
            if image is None or price_tag is None:
                continue
            res.append({
                # .get() keeps one incomplete <img> from aborting the scrape.
                'img': image.attrs.get('original'),
                'name': image.attrs.get('alt'),
                'price': price_tag.string,
            })
        return res

if __name__ == '__main__':
    # Script entry point: download one listing page, extract its product
    # entries and print them as JSON.

    # Use a distinct local name so the class `taofen` is not rebound to an instance.
    scraper = taofen()
    html = scraper.getHtml('http://www.taofen8.com/promcat-4/cat-300/subcat-0/page-1/order-3/sp-2')

    res = scraper.getKind(html)
    # ensure_ascii=False emits non-ASCII text (e.g. Chinese) as-is instead of \uXXXX escapes.
    res = json.dumps(res, ensure_ascii=False)
    print(res)

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！