反爬虫-python3.6抓取猫眼电影信息

发布时间：2020-12-17 00:30:36 所属栏目：Python 来源：网络整理

导读：思路分解： 1.页面信息 url:http://maoyan.com/cinema/24311?poi=164257570 查看信息发现价格存在乱码现象：刷新页面找到乱码的URL，下载woff格式文件：方法：复制URL：右键单击转到下载完成，即为代码中的baseprice.woff文件，再次刷新网页，同样的方法再

思路分解：

1.页面信息

url:http://maoyan.com/cinema/24311?poi=164257570

查看信息发现价格存在乱码现象：

刷新页面找到乱码字体的URL，下载woff格式文件。方法：复制URL，右键单击“转到”，下载完成后的文件即为代码中的baseprice.woff；再次刷新网页，用同样的方法再次下载URL对应的文件作为待匹配的woff文件，即为代码中的maoprice.woff。

用下面这个网址打开保存的baseprice.woff文件，如下图：

FontEditor

fontstore.baidu.com

与代码行对应：

反爬虫字体解析原理：先在网页上下载乱码字体文件baseprice.woff，可以转化为xml，用pycharm打开可以看到其中的信息；刷新页面后再下载maoprice.woff文件，可以看到二者的字形存在对应关系，据此就可以编写代码。

进群“960410445”即可获取数十套PDF哦！

二者的对应关系：

2.字体解析代码：

# Reference font saved by hand from the page. The path is written as a raw
# string because the listing had lost its backslashes; adjust it to wherever
# the file was saved.
baseFont = TTFont(r'C:\Users\nanafighting\Desktop\baseprice.woff')
# Font downloaded after refreshing the page; its glyph names differ from the
# reference font, but the glyph outlines are compared below to pair them up.
maoyanFont = TTFont('maoprice.woff')
maoyan_unicode_list = maoyanFont.getGlyphOrder()

# Glyph names in the reference font and the character each one draws
# (baseUniCode[k] draws baseNumList[k]).
baseNumList = ['.', '6', '4', '7', '5', '2', '8', '0', '1', '9', '3']
baseUniCode = ['x', 'uniF76E', 'uniEACB', 'uniE8D1', 'uniE737', 'uniE9B7',
               'uniF098', 'uniF4DC', 'uniF85E', 'uniE2F1', 'uniEE4E']

# utf8List[k] is the UTF-8 encoded character used in the page text and
# maoyan_num_list[k] is the real character it stands for. Both lists are
# filled together so they stay aligned even if some glyph has no match.
maoyan_num_list = []
utf8List = []
# Index 0 of the glyph order is skipped; the next 11 entries are compared.
for glyph_name in maoyan_unicode_list[1:12]:
    maoyanGlyph = maoyanFont['glyf'][glyph_name]
    for base_name, real_char in zip(baseUniCode, baseNumList):
        if maoyanGlyph == baseFont['glyf'][base_name]:
            # 'uniF76E' -> U+F76E; a plain name such as 'x' is the character
            # itself. chr(int(...)) replaces the eval() of a '\u' literal.
            if glyph_name.startswith('uni'):
                page_char = chr(int(glyph_name[3:], 16))
            else:
                page_char = glyph_name
            utf8List.append(page_char.encode('utf-8'))
            maoyan_num_list.append(real_char)
            break

3.代码中容易出错的地方：字符串的转换

# Keep the scraped price text as str. Iterating over bytes in Python 3 yields
# ints, so encoding the text and joining it with '%s' produces a run of
# decimal byte values rather than characters.
moviewish = mw[i].get_text()
for encoded_char, real_char in zip(utf8List, maoyan_num_list):
    # utf8List holds UTF-8 encoded characters; decode each one so it can be
    # matched against the str text. The lists themselves are left untouched.
    moviewish = moviewish.replace(encoded_char.decode('utf-8'), real_char)
# Complete code
import re

import requests
from bs4 import BeautifulSoup as bs
from fontTools.ttLib import TTFont
from lxml import html  # kept from the original listing; not used below


# Scrape showtimes and prices from a Maoyan cinema page
class MaoyanSpider:
    """Fetch one Maoyan cinema page and print its rows with decoded prices.

    The price text on the page uses characters from a web font that the page
    links to. The glyph outlines of that font are compared with a reference
    font whose glyph-to-character mapping is known, which gives a mapping from
    the page's characters to the real ones.
    """

    CINEMA_URL = 'http://maoyan.com/cinema/24311?poi=164257570'
    # Reference font saved by hand from the page. Written as a raw string
    # because the listing had lost its backslashes; adjust to the real path.
    BASE_FONT_PATH = r'C:\Users\nanafighting\Desktop\baseprice.woff'
    # The page font is downloaded to, and parsed from, this single path.
    PAGE_FONT_PATH = 'maoprice.woff'
    # (glyph name in the reference font, character that glyph draws)
    BASE_GLYPH_DIGITS = (
        ('x', '.'),
        ('uniF76E', '6'),
        ('uniEACB', '4'),
        ('uniE8D1', '7'),
        ('uniE737', '5'),
        ('uniE9B7', '2'),
        ('uniF098', '8'),
        ('uniF4DC', '0'),
        ('uniF85E', '1'),
        ('uniE2F1', '9'),
        ('uniEE4E', '3'),
    )
    # Captures the protocol-relative woff URL from url('//...woff') format('woff').
    WOFF_URL_RE = re.compile(r"url\('(//[^']+\.woff)'\)\s*format\('woff'\)")

    # Page initialisation
    def __init__(self):
        """Store the browser-like request headers sent with the page request."""
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/59.0.3071.86 Safari/537.36",
        }

    # Fetch and print the price rows
    def getNote(self):
        """Download the cinema page and print one line per scraped row.

        Prints the movie name, hall, begin time and decoded price. Returns
        None.

        Raises:
            RuntimeError: if the page contains no woff font URL.
            requests.HTTPError: if the font download returns an error status.
        """
        # Merge the shared headers with the per-request ones. The second key
        # was 'refer' in the listing; the HTTP header name is 'Referer'.
        headers = {
            **self.headers,
            'host': 'maoyan.com',
            'Referer': 'http://maoyan.com/news',
        }
        page = requests.get(self.CINEMA_URL, headers=headers).text

        self._download_page_font(page)
        decode_table = str.maketrans(self._build_char_map())

        soup = bs(page, "html.parser")
        print('---------------Prices-----------------')
        # The listing looped over every 'show-list' div without using it and
        # re-ran these whole-document queries each time; they are run once.
        # NOTE(review): rows are paired by position across the whole page,
        # as in the original - confirm this matches the page markup.
        names = soup.find_all('h3', {'class': 'movie-name'})
        halls = soup.find_all('span', {'class': 'hall'})
        times = soup.find_all('span', {'class': 'begin-time'})
        prices = soup.find_all('span', {'class': 'stonefont'})
        for name, hall, begin, price in zip(names, halls, times, prices):
            # Decode in str space; characters without a mapping are kept.
            moviewish = price.get_text().translate(decode_table)
            print(name.get_text(), hall.get_text(), begin.get_text(), moviewish)

    def _download_page_font(self, page_html):
        """Save the woff font referenced by page_html to PAGE_FONT_PATH."""
        match = self.WOFF_URL_RE.search(page_html)
        if match is None:
            raise RuntimeError('no woff font URL found in the page')
        font_response = requests.get("http:" + match.group(1))
        font_response.raise_for_status()
        with open(self.PAGE_FONT_PATH, "wb") as font_file:
            font_file.write(font_response.content)

    def _build_char_map(self):
        """Return {character used in the page text: real character}."""
        # baseprice.woff is the font downloaded by hand from the page.
        base_font = TTFont(self.BASE_FONT_PATH)
        page_font = TTFont(self.PAGE_FONT_PATH)
        reference = [
            (base_font['glyf'][name], real_char)
            for name, real_char in self.BASE_GLYPH_DIGITS
        ]
        char_map = {}
        # Index 0 of the glyph order is skipped, as in the original listing.
        for glyph_name in page_font.getGlyphOrder()[1:]:
            page_char = self._glyph_char(glyph_name)
            if page_char is None:
                continue
            page_glyph = page_font['glyf'][glyph_name]
            for base_glyph, real_char in reference:
                if page_glyph == base_glyph:
                    char_map[page_char] = real_char
                    break
        return char_map

    @staticmethod
    def _glyph_char(glyph_name):
        """Return the character a glyph name stands for, or None if unknown."""
        if glyph_name.startswith('uni'):
            # 'uniF76E' -> U+F76E, without eval() on a '\u' literal.
            try:
                return chr(int(glyph_name[3:], 16))
            except ValueError:
                return None
        # A one-character name such as 'x' is the character itself.
        return glyph_name if len(glyph_name) == 1 else None


if __name__ == '__main__':
    spider = MaoyanSpider()
    # getNote() prints its rows itself and returns None, so it is not printed.
    spider.getNote()

运行结果：

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!