scrapy简单使用方法

发布时间：2020-12-20 10:11:53 所属栏目：Python 来源：网络整理

导读：scrapy简单使用方法 1.创建项目： scrapy startproject 项目名 例如： scrapy startproject baike Windows下，在cmd中进入项目路径，例如 d:\pythonCode\spiderProject>scrapy startproject baidubaike 将创建名为 baidubaike 的项目 2.使用命令创建一个爬虫： scrapy

scrapy简单使用方法

1.创建项目：
scrapy startproject 项目名
例如：
scrapy startproject baike

Windows 下，在 cmd 中进入存放项目的路径，例如：
d:\pythonCode\spiderProject>scrapy startproject baidubaike
将创建名为 baidubaike 的项目

2.使用命令创建一个爬虫：
scrapy genspider 爬虫名称 需要爬取的网址
scrapy genspider baike baike.baidu.com

注意：爬虫名称不能和项目名相同

d:\pythonCode\spiderProject\baidubaike>scrapy genspider baike baike.baidu.com

命令执行后将在 d:\pythonCode\spiderProject\baidubaike\baidubaike\spiders 下生成 baike.py

3.修改baike.py文件

import scrapy
from baidubaike.items import BaidubaikeItem
from scrapy.http.response.html import HtmlResponse
from scrapy.selector.unified import SelectorList

class BaikeSpider(scrapy.Spider):
    """Spider that collects title/time pairs from a Baidu Baike list page.

    Each ``<li>`` found under ``div.list-content > ul`` on the start page
    is turned into one ``BaidubaikeItem``.
    """

    # Spider name, used on the command line: ``scrapy crawl baike``.
    name = 'baike'
    # Domains this spider is allowed to crawl.
    allowed_domains = ['baike.baidu.com']
    # URLs the crawl starts from.
    start_urls = ['https://baike.baidu.com/art/%E6%8B%8D%E5%8D%96%E8%B5%84%E8%AE%AF']

    def parse(self, response):
        """Yield one ``BaidubaikeItem`` per list entry in *response*.

        ``title`` is the first text node of an ``<a>`` inside the entry and
        ``time`` is the first text node of a ``<span>`` inside it; either
        may be ``None`` when the entry has no such node.
        """
        uls = response.xpath("//div[@class='list-content']/ul")
        for ul in uls:
            for li in ul.xpath(".//li"):
                title = li.xpath(".//a/text()").get()
                pub_time = li.xpath(".//span/text()").get()
                yield BaidubaikeItem(title=title, time=pub_time)

4.items.py

import scrapy

class BaidubaikeItem(scrapy.Item):
    """Container for one scraped list entry."""

    # Text of the entry's link.
    title = scrapy.Field()
    # Text of the entry's <span>, stored as scraped (no parsing is applied).
    time = scrapy.Field()

5.修改settings.py文件
1)开启 DEFAULT_REQUEST_HEADERS
修改如下
# Default headers attached to every request; the User-Agent string
# imitates a desktop Chrome browser.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/76.0.3809.132 Safari/537.36',
}

2)将 ROBOTSTXT_OBEY = True 改为 ROBOTSTXT_OBEY = False
说明：
默认为True，就是要遵守robots.txt 的规则
将此配置项设置为 False，表示不遵守 robots.txt（Robots 协议）

3)开启 ITEM_PIPELINES
# Enabled item pipelines: key is the pipeline class path, value is its
# priority (pipelines with smaller values are called first).
ITEM_PIPELINES = {
    'baidubaike.pipelines.BaidubaikePipeline': 300,
}
其中，ITEM_PIPELINES 是一个字典，键为要启用的 Item Pipeline 类的路径，值为优先级（习惯上取 0～1000 之间的整数）；Item Pipeline 按优先级依次调用，值越小，优先级越高。

6.修改pipelines.py文件
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#第一种方式
#import json
#
#class BaidubaikePipeline(object):
#    def __init__(self):
#        #pass
#        self.fp = open('baike.json','w',encoding='utf-8')
#
#    def open_spider(self,spider):
#        print('爬虫开始了。。')
#
#    def process_item(self,item,spider):
#        item_json = json.dumps(dict(item),ensure_ascii=False)
#        self.fp.write(item_json + '\n')
#        return item
#
#    def close_spider(self,spider):
#        self.fp.close()
#        print('爬虫结束了。。')
#

#第二种方式
#from scrapy.exporters import JsonItemExporter
#
#class BaidubaikePipeline(object):
#    def __init__(self):
#        #pass
#        self.fp = open('baike.json','wb')
#        self.exporter = JsonItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
#        self.exporter.start_exporting()
#
#    def process_item(self,item,spider):
#        self.exporter.export_item(item)
#        return item
#
#    def close_spider(self,spider):
#        self.exporter.finish_exporting()
#        self.fp.close()
#        print('爬虫结束了。。')

#第三种方式
from scrapy.exporters import JsonLinesItemExporter

class BaidubaikePipeline(object):
    """Item pipeline that exports every scraped item to ``baike.json``.

    Items are written through a ``JsonLinesItemExporter`` with UTF-8
    encoding. The output file is opened when the pipeline is constructed
    and closed when the spider closes.
    """

    def __init__(self):
        # Opened in binary mode because the exporter is given the encoding
        # and handles the text-to-bytes conversion itself.
        self.fp = open('baike.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, encoding='utf-8')

    def open_spider(self, spider):
        """Announce that the crawl has started."""
        print('爬虫开始了。。')

    def process_item(self, item, spider):
        """Export *item* and return it unchanged.

        Scrapy calls this hook as ``process_item(item, spider)``, so the
        signature must accept both arguments.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Close the output file and announce that the crawl has ended."""
        self.fp.close()
        print('爬虫结束了。。')

7.运行爬虫
scrapy crawl 爬虫名

d:\pythonCode\spiderProject\baidubaike\baidubaike>scrapy crawl baike

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！