I think this lesson explains it better!
1. Python 3: use PyMySQL
PyMySQL is the library used to connect to a MySQL server from Python 3.x; under Python 2 the MySQLdb library was used instead.
PyMySQL follows the Python Database API v2.0 specification and is a pure-Python MySQL client library.
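As a minimal sketch of what that means in practice (the credentials and database name below are placeholders, not values from this article), opening a connection with PyMySQL looks like this:

import pymysql

# Placeholder credentials -- replace with your own.
conn = pymysql.connect(host='localhost', user='root', password='***',
                       database='baidutieba', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute('SELECT VERSION()')
        print(cursor.fetchone())
finally:
    conn.close()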
2. Basic MySQL operations
See the runoob tutorial "Python3 MySQL database connection – the PyMySQL driver" (www.runoob.com).
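The pipeline further down inserts into a baidu_tieba table, so that table has to exist first. The article never shows its DDL; a plausible schema matching the insert statement, created through PyMySQL, could look like this (the column types are assumptions):

import pymysql

# Assumed schema -- the original article does not show the CREATE TABLE statement.
create_sql = '''
CREATE TABLE IF NOT EXISTS baidu_tieba (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    content TEXT,
    device VARCHAR(100),
    floor VARCHAR(50),
    posttime VARCHAR(50)
) DEFAULT CHARSET = utf8
'''

conn = pymysql.connect(host='localhost', user='root', password='***',
                       database='baidutieba', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(create_sql)
    conn.commit()
finally:
    conn.close()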
3. Code
main.py
from scrapy.cmdline import execute
import os
import sys

# Put the project directory on sys.path so Scrapy can locate the project.
project_dir = os.path.dirname(os.path.abspath(__file__))
print(project_dir)
sys.path.append(project_dir)

# Equivalent to running "scrapy crawl baidu" on the command line.
execute(["scrapy", "crawl", "baidu"])
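main.py only exists so the crawl can be started from the IDE: it puts the project directory on sys.path and then calls execute(), which is equivalent to running "scrapy crawl baidu" on the command line.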
The spider (e.g. spiders/baidu.py):

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from baidu_tieba.items import TiebaItem


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/f?ie=utf-8&kw=%E9%98%B2%E8%AF%88%E9%AA%97']

    def parse(self, response):
        # Collect the link of every thread on the list page.
        url_list = response.xpath('//a[@class="j_th_tit "]/@href').extract()
        print(url_list)
        for url in url_list:
            yield scrapy.Request(url=parse.urljoin(response.url, url), callback=self.parse_detail)

    def parse_detail(self, response):
        # The thread title; the original snippet used an undefined "title"
        # variable, so a minimal extraction step is assumed here.
        title = response.xpath('//title/text()').extract()

        post_content_main = response.xpath('//div[contains(@class,"d_post_content_main")]')
        content_list = []
        device_list = []
        floor_list = []
        posttime_list = []
        # This part extracts the reply content, device, floor and post time in order.
        for r in post_content_main:
            content = r.xpath('.//div[contains(@class,"d_post_content")]/text()').extract()
            content = "".join(content).strip()
            content_list.append(content)
            print(content)
            post_tail = r.xpath('.//div[@class="post-tail-wrap"]')
            tail_info = r.xpath('.//div[@class="post-tail-wrap"]/span[@class="tail-info"]')
            for pt in post_tail:
                if len(tail_info) == 3:
                    # Three tail-info spans: device, floor and post time.
                    device = pt.xpath('./span[@class="tail-info"][1]//text()').extract()
                    device = "".join(device)
                    floor = pt.xpath('./span[@class="tail-info"][2]/text()').extract()
                    floor = floor[0]
                    post_time = pt.xpath('./span[@class="tail-info"][3]/text()').extract()
                    post_time = post_time[0]
                else:
                    # No device information recorded for this post.
                    device = "无"
                    floor = pt.xpath('./span[@class="tail-info"][1]/text()').extract()
                    floor = floor[0]
                    post_time = pt.xpath('./span[@class="tail-info"][2]/text()').extract()
                    post_time = post_time[0]
                device_list.append(device)
                floor_list.append(floor)
                posttime_list.append(post_time)

        for i in range(len(content_list)):
            tieba_item = TiebaItem()
            tieba_item['title'] = title[0]
            tieba_item['content'] = content_list[i]
            tieba_item['device'] = device_list[i]
            tieba_item['floor'] = floor_list[i]
            tieba_item['posttime'] = posttime_list[i]
            yield tieba_item
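In short: parse() collects every thread link (the a.j_th_tit anchors) from the list page and hands each one to parse_detail(), which walks the d_post_content_main blocks of the thread page and yields one TiebaItem per post with its title, content, device, floor and post time.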
items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BaiduTiebaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


# Custom item class.
class TiebaItem(scrapy.Item):
    # Fields for one Tieba post.
    title = scrapy.Field()
    content = scrapy.Field()
    device = scrapy.Field()
    floor = scrapy.Field()
    posttime = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = '''
            insert into baidu_tieba(title, content, device, floor, posttime)
            values (%s, %s, %s, %s, %s)
        '''
        params = (self['title'], self['content'], self['device'], self['floor'], self['posttime'])
        return insert_sql, params
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors


class BaiduTiebaPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    '''
    MYSQL_HOST='localhost'
    MYSQL_DBNAME='baidutieba'
    MYSQL_USER='root'
    MYSQL_PASSWORD='Mysql86mysql.'
    '''

    # Load the database settings from settings.py.
    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # Connection pool backed by pymysql.
        dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction runs do_insert asynchronously in a thread pool.
        query = self.dbpool.runInteraction(self.do_insert, item)
        return item

    def do_insert(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
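One caveat: runInteraction() returns a Twisted Deferred, and the process_item() above ignores it, so a failed insert disappears silently. An optional refinement (not in the original code; handle_error is a name chosen here for illustration) is to attach an errback:

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Report the failure instead of swallowing it.
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # failure is a twisted.python.failure.Failure wrapping the exception
        print(failure)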
settings.py
ITEM_PIPELINES = {
    'baidu_tieba.pipelines.BaiduTiebaPipeline': 300,
    'baidu_tieba.pipelines.MysqlTwistedPipeline': 1,
}

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'baidutieba'
MYSQL_USER = '***'
MYSQL_PASSWORD = '***'
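The numbers in ITEM_PIPELINES are priorities: items pass through pipelines with lower numbers first, so MysqlTwistedPipeline (1) receives each item before the default BaiduTiebaPipeline (300).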
Tested and working!
Original: https://blog.csdn.net/weixin_39541189/article/details/113202026
Author: weixin_39541189
Title: scrapy and mysql – 贪心学院 (Greedy Academy) assignment: how to use the Scrapy framework with a MySQL database