Scrapy Web Crawler in Practice
– Creating the Project
– Project Requirements
– Analyzing the Website
– The Spider File (spiders)
– items.py
– pipelines.py
– settings.py
– Summary
Creating the Project
In a terminal, create the project: scrapy startproject xiaomai
Change into the project directory: cd xiaomai
Generate the spider file (the placeholder domain is replaced by the real start_urls inside the spider later):
scrapy genspider xiaomai_pro www.xxx.com
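These commands produce Scrapy's standard project skeleton; the files edited in the rest of this post live here:

xiaomai/
├── scrapy.cfg
└── xiaomai/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── xiaomai_pro.py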
Project Requirements
Target site: https://www.chinaseed114.com/
Goal: collect the information of every wheat seed variety listed under
https://www.chinaseed114.com/seed/xiaomai/
Analyzing the Website
Confirm which data to scrape and which parsing method to use for it.
Compare the rendered page with the raw page source (Scrapy parses the raw source, so differences between the two are a common cause of XPath errors later).
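A quick way to confirm that an XPath expression matches the raw HTML Scrapy actually receives (not the browser-rendered DOM) is scrapy shell; the URL is the wheat listing page from the requirements:

scrapy shell 'https://www.chinaseed114.com/seed/xiaomai/'
>>> response.xpath('//a/@href').extract()[:10]   # try selectors interactively against the raw source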
The Spider File (spiders)
Main functionality:
- Collect the URLs of all wheat seed varieties
- Fetch the full data for each variety
- Clean and classify the data
While analyzing the pages we found that they are static, but the layout differs from page to page, which makes classifying the data difficult.
For classification, frequently occurring label words are used as anchor points: the text is split by string slicing around each label, and the resulting pieces are then post-processed to strip out text unrelated to the data (a worked example of the slicing follows below).
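A minimal sketch of the slicing idea (the sample string is invented for illustration): each label is located with find(), the slice starts right after the label and its full-width colon, and ends at the next full-width colon; any following label name caught inside the slice is stripped afterwards.

# hypothetical flattened page text
text = '审定编号:国审麦20210001品种名称:示例麦1号申请者:某某种业'
label = '审定编号'
start = text.find(label) + len(label) + 1   # skip the label and the ':' behind it
end = text[start:].find(':') + start       # up to the next ':'
target = text[start:end]                     # '国审麦20210001品种名称'
target = target.replace('品种名称', '')      # strip the trailing label -> '国审麦20210001'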
import math
import time
import random

import scrapy

from xiaomai.items import XiaomaiItem


class XiaomaiProSpider(scrapy.Spider):
    name = 'xiaomai_pro'
    start_urls = ['https://www.chinaseed114.com/seed/pzdq/']

    def parse(self, response):
        # Variety index page: grab the wheat category's URL, name and item count
        xiaomai_url = response.xpath('/html/body/div[5]/div[3]/div[2]/table/tr[1]/td[2]/a/@href').extract_first()
        name = response.xpath('/html/body/div[5]/div[3]/div[2]/table/tr[1]/td[2]/a/text()').extract_first()
        num = response.xpath('/html/body/div[5]/div[3]/div[2]/table/tr[1]/td[2]/span/text()').extract_first()
        # The count comes wrapped in brackets, e.g. "(1234)"; each listing page holds 80 entries
        num = math.ceil(int(num[1:-1]) / 80)
        print(name + xiaomai_url)
        for i in range(1, num + 1):
            print(xiaomai_url + f"{i}.html")
            # Random delay (<1 s) to keep the request rate down
            time.sleep(random.random())
            yield scrapy.Request(url=xiaomai_url + f"{i}.html", callback=self.parse_pinzhong)

    def parse_pinzhong(self, response):
        """
        Collect the URL of every seed variety on one listing page.
        :param response: response object for one listing page
        :return:
        """
        items_url = []
        tr = response.xpath('/html/body/div[5]/div[1]/div/div[4]/table[2]/tr')
        for i in range(len(tr)):
            items_url.extend(tr[i].xpath('./td/ul/li/a/@href').extract())
        for item_url in items_url:
            time.sleep(random.random())
            yield scrapy.Request(url=item_url, callback=self.parse_item_data)

    def parse_item_data(self, response):
        """
        Extract the detail fields of one variety.
        :param response: response object for one detail page
        :return:
        """
        text_list = response.xpath('//*[@id="article"]//text()').extract()
        # Strip whitespace artifacts and commas from each text fragment
        s = ['\r', '\n', '\xa0', ',']
        text_list2 = []
        for i in text_list:
            for j in s:
                i = i.strip().replace(j, '')
            text_list2.append(i)
        text = ','.join(text_list2)
        text = text.replace(',', '')
        # Label words that mark the start of a field on the detail pages
        # (several spellings of the same field occur across pages)
        label = ['审定编号', '品种名称', '申 请 者', '申请人', '申请者', '申请单位', '申报单位',
                 '育种人', '育 种 者', '育种者', '选育单位', '品种来源',
                 '特征特性', '品质分析结果', '抗性鉴定结果', '引 种 者',
                 '产量表现', '栽培技术要点', '审定意见', '抗性鉴定', '亲本组合', '品质分析',
                 '适宜地区', '抗病鉴定', '产量结果', '引种备案号']
        label_dict = dict(zip(label, ['' for i in range(len(label))]))
        for i in label:
            target = ''
            if i in text:
                start_num = text.find(i)
                if start_num != -1:
                    start_num += len(i) + 1  # skip the label itself plus the ':' after it
                    end_num = text[start_num:].find(':')  # slice ends at the next ':'
                    if end_num != -1:
                        end_num = end_num + start_num
                        target = text[start_num:end_num]
                        # The next label name is still attached to the slice; drop it
                        for j in label:
                            if j in target:
                                target = target.replace(j, '')
                                break
            label_dict[i] = target
        # Keep the approval opinion only from '适宜' (suitable regions) onwards
        if '适宜' in label_dict['审定意见']:
            ind = label_dict['审定意见'].find('适宜')
            label_dict['审定意见'] = label_dict['审定意见'][ind:]
        # Merge the synonymous breeder fields into '育种者'
        if label_dict['育种者'] == '':
            if label_dict['选育单位'] != '':
                label_dict['育种者'] = label_dict['选育单位']
            elif label_dict['育 种 者'] != '':
                label_dict['育种者'] = label_dict['育 种 者']
            elif label_dict['育种人'] != '':
                label_dict['育种者'] = label_dict['育种人']
        # Merge the synonymous applicant fields into '申请者'
        if label_dict['申请者'] == '':
            if label_dict['申 请 者'] != '':
                label_dict['申请者'] = label_dict['申 请 者']
            elif label_dict['申请单位'] != '':
                label_dict['申请者'] = label_dict['申请单位']
            elif label_dict['申请人'] != '':
                label_dict['申请者'] = label_dict['申请人']
        # Cut trailing noise off the approval number field
        nams = ['号', '二', ':', '作物名称', '育种单位', '引种备案号', '主要性状',
                ')试验名称', '〈1〉品种来源', '引 种 者', '作物名称']
        for i in nams:
            if i in label_dict['审定编号']:
                ind = label_dict['审定编号'].find(i)
                label_dict['审定编号'] = label_dict['审定编号'][:ind]
        item = XiaomaiItem()
        item['serial_number'] = label_dict.get('审定编号')
        item['url'] = response.url
        item['name'] = label_dict.get('品种名称')
        item['applicant'] = label_dict.get('申请者')
        item['breeder'] = label_dict.get('育种者')
        item['source'] = label_dict.get('品种来源')
        item['basics_data'] = label_dict.get('特征特性')
        item['output_data'] = label_dict.get('产量表现')
        item['plant_method'] = label_dict.get('栽培技术要点')
        item['area'] = label_dict.get('审定意见')
        yield item
items.py
Define the item class:
import scrapy


class XiaomaiItem(scrapy.Item):
    serial_number = scrapy.Field()
    name = scrapy.Field()
    url = scrapy.Field()
    applicant = scrapy.Field()
    breeder = scrapy.Field()
    source = scrapy.Field()
    basics_data = scrapy.Field()
    output_data = scrapy.Field()
    plant_method = scrapy.Field()
    area = scrapy.Field()
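An Item behaves like a dict, which is how the spider above fills it; the value below is made up for illustration:

item = XiaomaiItem()
item['name'] = '示例麦1号'   # hypothetical value
print(dict(item))            # {'name': '示例麦1号'}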
pipelines.py
Persistent storage; the output file format is .csv.
import pandas as pd


class XiaomaiPipeline:
    def open_spider(self, spider):
        print('Spider started')

    def process_item(self, item, spider):
        # One row per item, appended to the CSV without a header
        df = [item['serial_number'],
              item['name'],
              item['url'],
              item['applicant'],
              item['breeder'],
              item['source'],
              item['basics_data'],
              item['output_data'],
              item['plant_method'],
              item['area']]
        df = pd.DataFrame(df).T
        df.to_csv('./xiaomai_data.csv', mode='a', index=False, header=False, encoding='utf-8')
        print('Row appended')
        return item

    def close_spider(self, spider):
        print('Spider finished')
settings.py
Project configuration:
USER_AGENT = ''   # UA spoofing: put a real browser User-Agent string here
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
DOWNLOADER_MIDDLEWARES = {
    'xiaomai.middlewares.XiaomaiDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'xiaomai.pipelines.XiaomaiPipeline': 300,
}
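With the configuration in place, run the spider from the project root (the name matches the spider's name attribute):

scrapy crawl xiaomai_pro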
Summary
- When parsing with XPath, we hit cases where the HTML shown in the browser's inspector differed from the raw page source.
- The project is still rough: because the scraped site's page formats are inconsistent, the data cleaning and classification steps cannot sort every record correctly.
- The crawler's disguise is weak: only UA spoofing and randomized request intervals, with no IP rotation.
- On the storage side, every wheat seed record triggers a separate file operation, which is inefficient (a buffered alternative is sketched below).
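A minimal sketch of that buffered alternative, not part of the original project: open the file once in open_spider and write rows through Python's csv module, so process_item no longer re-opens the file for every item.

import csv


class XiaomaiBufferedPipeline:
    """Hypothetical variant: one open file handle for the whole crawl."""

    def open_spider(self, spider):
        self.fp = open('./xiaomai_data.csv', 'a', newline='', encoding='utf-8')
        self.writer = csv.writer(self.fp)

    def process_item(self, item, spider):
        self.writer.writerow([item['serial_number'], item['name'], item['url'],
                              item['applicant'], item['breeder'], item['source'],
                              item['basics_data'], item['output_data'],
                              item['plant_method'], item['area']])
        return item

    def close_spider(self, spider):
        self.fp.close()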
Original: https://blog.csdn.net/qq_52007481/article/details/124480329
Author: 小鱼干儿♛
Title: Python Scrapy Framework Project in Practice (pythonScarpy框架项目实战)