这是被百度反爬了,可以参考以下代码
baidu.py
import scrapy
from scrapy.http import Request
from urllib.parse import urlencode
import json
import time
from urllib.parse import quote
from fake_useragent import UserAgent
import re
from baiduimg.items import BaiduimgItem
class BaiduSpider(scrapy.Spider):
name = 'baidu'
# allowed_domains = ['www.xxx.com']
# start_urls = ['http://www.xxx.com/']
ua = UserAgent()
header={
'User-Agent':ua.random
}
def start_requests(self):
# base_url = "https://image.baidu.com/search/acjson?"
# words = quote(input("请输入关键词:"))
# parm = {'word': words}
# for page in range(1, self.settings.get("MAX_PAGE") + 1):
# parm['pn'] = 30 * page
# parm['tn'] ='resultjson_com'
# parm['ipn'] ='rj'
# parm['gsm'] = hex(30 * page)
# parm['queryWord'] = words
# timestamp=int(time.time())*1000
# parm[timestamp] = ''
# url = base_url +urlencode(parm)
# # print(timestamp)
# yield Request(url=url,headers=self.header,callback=self.parse)
keyworlds=input("输入关键字:")
for page in range(1, self.settings.get("MAX_PAGE") + 1):
url = "http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=" + keyworlds + "&pn="+ str(page * 20) + "&gsm=140&ct=&ic=0&lm=-1&width=0&height=0"
yield Request(url=url, headers=self.header, callback=self.parse)
def parse(self, response):
# print(response.text)
data=response.text
# data=str(data).replace("")
er = '\"objURL\":\"(.*?)\"\,'
er1='\"fromPageTitle\":\"(.*?)\"\,'
objURL = re.compile(er, re.S).findall(data)
fromPageTitle=re.compile(er1, re.S).findall(data)
# image_list = re.findall(er, data, re.S)
# print(d.encode())
for i in range(1,len(objURL)):
item=BaiduimgItem()
item['fromPageTitle']=fromPageTitle[i].replace("",'').replace("",'')
item['objURL'] = objURL[i]
yield item
# json_data=json.loads(response.text)
# print(json_data['data'])
# for data in json_data['data']:
# print(data['thumbURL'])
item.py
Define here the models for your scraped items
#
See documentation in:
https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BaiduimgItem(scrapy.Item):
# define the fields for your item here like:
fromPageTitle = scrapy.Field()
objURL = scrapy.Field()
middlewares.py
Define here the models for your spider middleware
#
See documentation in:
https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class BaiduimgSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class BaiduimgDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
pipelines.py
Define your item pipelines here
#
Don't forget to add your pipeline to the ITEM_PIPELINES setting
See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
useful for handling different item types with a single interface
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymongo
class BaiduimgmongoPipeline(object):
def __init__(self,mongo_url,mongo_db):
self.mongo_url=mongo_url
self.mongo_db=mongo_db
@classmethod
def from_crawler(cls,crawler):
return cls(
mongo_url=crawler.settings.get("MONGO_URL"),
mongo_db=crawler.settings.get("MONGO_DB")
)
def open_spider(self,spider):
self.client=pymongo.MongoClient(self.mongo_url)
self.db=self.client[self.mongo_db]
def process_item(self,item,spider):
self.db[spider.name].insert(dict(item))
print("数据库中的集合名字:"+spider.name)
return item
def close_spider(self,spider):
self.client.close()
class BaiduimgPipeline(ImagesPipeline):
def file_path(self, request, response=None, info=None):
url = request.url
file_name = url.split('/')[-1]
return file_name
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
print(str(image_paths))
if not image_paths:
raise DropItem("下载失败")
return item
def get_media_requests(self, item, info):
yield Request(url=item['objURL'], dont_filter=False)
settings.py
Scrapy settings for baiduimg project
#
For simplicity, this file contains only settings considered important or
commonly used. You can find more settings consulting the documentation:
#
https://docs.scrapy.org/en/latest/topics/settings.html
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
project_dir=os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE=os.path.join(project_dir,'images')
BOT_NAME = 'baiduimg'
#页码
MAX_PAGE=2
#配置mongodb参数
MONGO_URL='localhost'
MONGO_DB='test_image'
SPIDER_MODULES = ['baiduimg.spiders']
NEWSPIDER_MODULE = 'baiduimg.spiders'
Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
Obey robots.txt rules
ROBOTSTXT_OBEY = False
Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
Configure a delay for requests for the same website (default: 0)
See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
Disable cookies (enabled by default)
#COOKIES_ENABLED = False
Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
# 'referer':'https://image.baidu.com/search/acjson?word=%25E7%258B%2597&pn=30&tn=resultjson_com&ipn=rj&gsm=0x1e&queryWor',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
}
#显示指定类型的日志信息
LOG_LEVEL='ERROR'
Enable or disable spider middlewares
See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
'baiduimg.middlewares.BaiduimgSpiderMiddleware': 543,
#}
Enable or disable downloader middlewares
See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
'baiduimg.middlewares.BaiduimgDownloaderMiddleware': 543,
#}
Enable or disable extensions
See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
'scrapy.extensions.telnet.TelnetConsole': None,
#}
Configure item pipelines
See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'baiduimg.pipelines.BaiduimgPipeline': 300,
'baiduimg.pipelines.BaiduimgmongoPipeline': 301,
}
Enable and configure the AutoThrottle extension (disabled by default)
See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
The average number of requests Scrapy should be sending in parallel to
each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
Enable and configure HTTP caching (disabled by default)
See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
start.py
from scrapy import cmdline
#注入执行语句
cmdline.execute("scrapy crawl baidu".split())
Original: https://blog.csdn.net/weixin_46026136/article/details/118035735
Author: 小施没烦恼
Title: scrapy爬取百度图片时出现Forbid spider access
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/789160/
转载文章受原作者版权保护。转载请注明原作者出处!