settings.py
添加如下代码指定输出字段顺序
FEED_EXPORT_FIELDS = ['code', 'name', 'new', 'rise_fall', 'price_limit', 'harvest', 'opening', 'high', 'low', 'volume', 'turnover', 'ratio', 'rate', 'capital', 'currency', 'company', 'trade', 'time', 'capitals', 'A_shares']
item.py
Define here the models for your scraped items
#
See documentation in:
https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
代码 名称 最新 涨跌 涨跌幅 前收 开盘 最高 最低 成交量 成交额 市盈率 换手率 总股本 流通股本
公司名称 所属行业 成立日期 总股本(亿) 流通A股(亿)
code, name, new, rise_fall, price_limit, harvest, opening, high, low, volume, turnover, ratio, rate, capital, currency
company, trade, time, capitals, A_shares
class ZhongcaiItem(scrapy.Item):
    # Container for one stock scraped from quote.cfi.cn.
    #
    # Quote-table columns (filled in parse):
    code = scrapy.Field()         # stock code
    name = scrapy.Field()         # stock name
    new = scrapy.Field()          # latest price
    rise_fall = scrapy.Field()    # price change
    price_limit = scrapy.Field()  # price change percentage
    harvest = scrapy.Field()      # previous close
    opening = scrapy.Field()      # opening price
    high = scrapy.Field()         # day high
    low = scrapy.Field()          # day low
    volume = scrapy.Field()       # trade volume
    turnover = scrapy.Field()     # turnover amount
    ratio = scrapy.Field()        # price/earnings ratio
    rate = scrapy.Field()         # turnover rate
    capital = scrapy.Field()      # total share capital
    currency = scrapy.Field()     # circulating share capital
    # Company-detail-page columns (filled in new_parse):
    company = scrapy.Field()      # company name
    trade = scrapy.Field()        # industry sector
    time = scrapy.Field()         # founding date
    capitals = scrapy.Field()     # total shares (in 100 millions)
    A_shares = scrapy.Field()     # circulating A shares (in 100 millions)
spider文件
-*- coding: utf-8 -*-
import scrapy
from ..items import ZhongcaiItem
code, name, new, rise_fall, price_limit,harvest, opening, high, low, volume, turnover, ratio, rate, capital, currency
company, trade, time, capitals, A_shares
class ZcspiderSpider(scrapy.Spider):
    """Crawl stock quotes from quote.cfi.cn.

    ``parse`` scrapes one item per row of the quote table, then follows
    each stock's detail link; ``new_parse`` adds the company-profile
    fields and yields the finished item.
    """
    name = 'ZCSpider'
    # NOTE(review): requests actually go to quote.cfi.cn, which is why
    # dont_filter=True is needed below (see "问题2" in the post); adding
    # 'quote.cfi.cn' here would be the cleaner fix — confirm.
    allowed_domains = ['data.cfi.cn']
    max_page = 2
    start_urls = ['https://quote.cfi.cn/quotelist.aspx?sectypeid=1&cfidata=1']
    # Fixed: the original URL contained '§ypeid=1' — the '&sect' of
    # '&sectypeid' had been mangled into the '§' character.
    base_url = ('https://quote.cfi.cn/quotelist.aspx'
                '?sortcol=stockcode&sortway=asc&pageindex={}&sectypeid=1')

    # Order of the <nobr> cells in each quote-table row.
    NOBR_FIELDS = ('new', 'rise_fall', 'price_limit', 'harvest', 'opening',
                   'high', 'low', 'volume', 'turnover', 'ratio', 'rate',
                   'capital', 'currency')
    # Order of the td[2] cells read from the detail page's vertical table.
    DETAIL_FIELDS = ('company', 'trade', 'time', 'capitals', 'A_shares')

    def start_requests(self):
        """Generate one listing-page request per page (1..max_page)."""
        for page in range(1, self.max_page + 1):
            yield scrapy.Request(self.base_url.format(page), callback=self.parse)

    def parse(self, response):
        """Extract quote-table rows and follow each stock's detail link."""
        for row in response.css('.table_data tr'):
            links = row.xpath('.//a/text()').extract()
            # Skip the header row (its first cell is the literal "代码")
            # and any malformed row lacking the code/name link cells —
            # the original code crashed with IndexError on such rows.
            if len(links) < 2 or '代码' in links[0]:
                continue
            item = ZhongcaiItem()
            item['code'], item['name'] = links[0], links[1]
            # Extract all <nobr> cells once instead of re-running the
            # same XPath 13 times per row.
            cells = row.xpath('.//nobr/text()').extract()
            for field, value in zip(self.NOBR_FIELDS, cells):
                item[field] = value
            href = row.xpath('.//a/@href').extract_first()
            # dont_filter=True: the detail pages live on quote.cfi.cn,
            # which the offsite middleware would otherwise filter out.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.new_parse,
                                 meta={'item': item},
                                 dont_filter=True)

    def new_parse(self, response):
        """Fill in the company-profile fields and yield the item."""
        item = response.meta['item']
        # One extraction for all rows; zip tolerates tables shorter than
        # expected instead of raising IndexError.
        values = (response.css('.vertical_table')
                          .xpath('.//tr/td[2]/text()').extract())
        for field, value in zip(self.DETAIL_FIELDS, values):
            item[field] = value
        yield item
代码编写完成之后在cmd或pycharm终端使用命令,scrapy crawl ZCSpider -o data.csv
就可以将爬取下来的数据按照指定的格式顺序保存为csv文件
写在最后
爬取中财网的过程中遇到的问题
问题1:
在爬取中财网的过程中,解析 robots.txt 时出现了编码错误(UnicodeDecodeError),日志如下
2022-01-07 11:47:22 [scrapy.robotstxt] WARNING: Failure while parsing robots.txt. File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file. Traceback (most recent call last): File “C:\Users\admin\AppData\Roaming\Python\Python38\site-packages\twisted\inter net\defer.py”, line 1661, in _inlineCallbacks result = current_context.run(gen.send, result) StopIteration:
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File “C:\Users\admin\AppData\Roaming\Python\Python38\site-packages\scrapy\robots txt.py”, line 16, in decode_robotstxt robotstxt_body = robotstxt_body.decode(‘utf-8’) UnicodeDecodeError: ‘utf-8’ codec can’t decode byte 0xd5 in position 248: invalid continuation byte
解决:
在settings.py将robots协议改成False,如下
ROBOTSTXT_OBEY = False
问题2:
2022-01-07 10:17:52 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite req uest to ‘quote.cfi.cn’:
解决:
在yield scrapy.Request() 里添加 dont_filter=True即可
Original: https://blog.csdn.net/weixin_45971950/article/details/122361122
Author: bug智造
Title: scrapy爬虫练习-中财网股票数据爬取
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/789732/
转载文章受原作者版权保护。转载请注明原作者出处!