- 分析网站:数据静态加载
1. 创建项目
scrapy startproject my_scrapy # 项目名
2. 创建Spider
cd my_scrapy
scrapy genspider spider baidu.com # 爬虫文件名 域名
3. 创建Item
Item是保存数据的容器 定义爬取数据结构
4. Spider
- 定义初始请求
- 定义解析数据
5.保存数据
scrapy crawl spider -o demo.csv
class MyScrapyPipeline:
    """Item pipeline that appends each scraped quote to a plain-text file."""

    def process_item(self, item, spider):
        # Build the full record first, then append it in one write.
        record = item['author'] + '\n' + item['text'] + '\n\n\n'
        with open('demo2.txt', 'a', encoding='utf8') as out:
            out.write(record)
        # Return the item so any later pipelines can keep processing it.
        return item
完整代码
目录结构
spider.py
import scrapy
from my_scrapy.items import MyScrapyItem
class SpiderSpider(scrapy.Spider):
    """Spider that scrapes quote text, author and tags from quotes.toscrape.com."""

    # Unique spider name, used by `scrapy crawl spider`.
    name = 'spider'
    # Domain restriction: entries must be bare domains, not full URLs.
    # (Left disabled as in the original tutorial.)
    # allowed_domains = ['quotes.toscrape.com']
    # First page to request. Fixed: the original had a doubled trailing
    # slash ('https://quotes.toscrape.com//').
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        """Parse one listing page and yield a MyScrapyItem per quote.

        Each item carries the keys 'text', 'author' and 'Tag'
        (a list of tag strings).
        """
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            item = MyScrapyItem()
            # .get()/.getall() are the modern replacements for
            # extract_first()/extract().
            item['text'] = quote.xpath('./span[@class = "text"]/text()').get()
            item['author'] = quote.xpath('.//small[@class="author"]/text()').get()
            item['Tag'] = quote.xpath('.//a[@class="tag"]/text()').getall()
            # Yield the item into the pipeline chain.
            yield item
items.py
import scrapy
class MyScrapyItem(scrapy.Item):
    """Container describing one scraped quote: its text, author and tags."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # The quote text itself.
    text = scrapy.Field()
    # The person the quote is attributed to.
    author = scrapy.Field()
    # List of tag strings attached to the quote.
    Tag = scrapy.Field()
pipelines.py
class MyScrapyPipeline:
    """Item pipeline that appends each quote to 'demo2.txt'.

    Fix: the original reopened the output file for every single item.
    Scrapy's open_spider/close_spider lifecycle hooks let us open the
    file once per crawl and close it cleanly when the spider finishes.
    """

    def open_spider(self, spider):
        # Called once when the crawl starts: open the output file once.
        self.file = open('demo2.txt', 'a', encoding='utf8')

    def close_spider(self, spider):
        # Called once when the crawl ends: release the file handle.
        self.file.close()

    def process_item(self, item, spider):
        # Append author and quote text, separated by blank lines.
        self.file.write(item['author'] + '\n' + item['text'] + '\n\n\n')
        # Return the item so any later pipelines can keep processing it.
        return item
settings.py
# Enable the project's pipeline; 300 is its order value
# (lower values run earlier in the pipeline chain).
ITEM_PIPELINES = {
    'my_scrapy.pipelines.MyScrapyPipeline': 300,
}
run.py
from scrapy import cmdline

# Programmatic equivalent of running `scrapy crawl spider` in a shell.
# Fixes: the crawl no longer starts as a side effect of importing this
# module, and the argv list is explicit instead of a fragile
# 'scrapy crawl spider '.split() with a trailing space.
if __name__ == '__main__':
    cmdline.execute(['scrapy', 'crawl', 'spider'])
Original: https://blog.csdn.net/qq_51179608/article/details/125492782
Author: 依恋、阳光
Title: Scrapy案例(一)
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/789824/
转载文章受原作者版权保护。转载请注明原作者出处!