获取慕课网视频教程
发布时间:2020-12-17 17:10:16 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 # -*- coding: utf-8 -*-from scrapy.spider import Spiderfrom scrapy.http.request import Requestfrom scrapy.selector import Selectorfrom scrap
以下代码由PHP站长网 52php.cn收集自互联网 现在PHP站长网小编把它分享给大家,仅供参考 # -*- coding: utf-8 -*- from scrapy.spider import Spider from scrapy.http.request import Request from scrapy.selector import Selector from scrapy.exceptions import NotSupported from scrapy import log import urlparse import json import urllib from meizi.items import ImoocItem __author__ = 'lpe234' class ImoocSpider(Spider): """ 本爬虫旨在帮助不方便联网观看视频教程的同学。 其实“慕课网”还是蛮不错,希望各位不要乱跑本脚本!!! """ name = 'immoc' start_urls = ['http://www.imooc.com/course/list'] def parse(self,response): """ 爬虫默认入口,旨在获取所有视频教程的 标题 以及 详细页 数据 :param response: :return: """ sel = Selector(response) # 获取本页所有视频节点 nodes = sel.xpath('//ul/li[@class="course-one"]') for node in nodes: try: title = node.xpath('./a/h5/span/text()').extract()[0] href = node.xpath('./a/@href').extract()[0] href = urlparse.urljoin(response.url,href) # 将 url 中的 view -> learn 即可得到 视频详细页列表 href = href.replace(u'view',u'learn') if title and href: yield Request( url=href,callback=self.parse_list,meta={'title': title} ) except(IndexError,TypeError): continue # 多页情况 pages = sel.xpath('//div[@class="page"]/a[@href]') for page in pages: try: href = page.xpath('./@href').extract()[0] href = urlparse.urljoin(response.url,href) if href and href.startswith('http'): yield Request( url=href,callback=self.parse,) except(IndexError,TypeError,NotSupported): continue def parse_list(self,response): """ 视频列表页 解析 :param response: :return: """ """ {"result":0,"data":{"result":{"mid":8124,"mpath": ["http://v1.mukewang.com/f9fd506a-bf14-4ede-96c4-1d69d2fd26e7/H.mp4","http://v1.mukewang.com/f9fd506a-bf14-4ede-96c4-1d69d2fd26e7/M.mp4","http://v1.mukewang.com/f9fd506a-bf14-4ede-96c4-1d69d2fd26e7/L.mp4"],"cpid":"2095","name":"NSMutableDictionary","time":"34","practise":[]}},"msg":"u6210u529f"} """ # 查看js文件可以得到 视频地址获取接口如下 GET -> json pre_href = 'http://www.imooc.com/course/ajaxmediainfo/?mid={}' title = response.meta.get('title') sel = Selector(response) nodes = sel.xpath('//div[@class="course_chapter_list"]//a[@class="studyvideo"]') videos = [] for node in nodes: try: section = node.xpath('./text()').extract()[0] section = section.strip().replace('r','').replace('n','') video_id = node.xpath('./@href').extract()[0] video_id = video_id.split('/')[-1] if section and video_id: href = pre_href.format(video_id) response = urllib.urlopen(href) data = json.loads(response.read()) # 此处存在三种模式 0:超清 1:高清 2:普清 url = data['data']['result']['mpath'][0] videos.append({section: url}) except(IndexError,ValueError): continue item = ImoocItem() item['title'] = title item['videos'] = videos yield item def parse_details(self,response): pass 以上内容由PHP站长网【52php.cn】收集整理供大家参考研究 如果以上内容对您有帮助,欢迎收藏、点赞、推荐、分享。 (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |