The newest and most complete Python framework guide: Scrapy (systematic study, crawling campus-beauty images site-wide). Noticeably improve your scraping skills (source code included) and break through all kinds of anti-scraping measures

1. Boss Zhipin job spider (bossPro)

2. Douban spider (db)

3. fbsPro (scrapy-redis distributed spider)

4. Baidu spider (firstblood)

5. Site-wide campus-beauty image spider (imgsPro)

6. Proxy-IP configuration spider (middlePro)

7. Site-wide video spider (moviePro)

8. Qiushibaike video and image downloads (qiubaiPro)

9. Sunshine Hotline inquiry platform: site-wide spider for the latest inquiries and related policies (sunPro)

10. NetEase News site-wide spider (wangyiPro)

11. xiaohuar.com (China campus-beauty site): site-wide image crawl (xiaohuaPro)

12. Database example

1. Boss Zhipin job spider (bossPro)

boss.py

import scrapy
from bossPro.items import BossproItem

class BossSpider(scrapy.Spider):
    name = 'boss'

    start_urls = ['https://www.zhipin.com/c101010100/?query=python&ka=sel-city-101010100']

    url = 'https://www.zhipin.com/job_detail/bd1e60815ee5d3741nV92Nu1GVZY.html?ka=search_list_jname_1_blank&lid=8iXPDpH593w.search.%d'
    page_num = 2

    def parse_detail(self,response):
        # Retrieve the item passed along from parse() via request meta
        item = response.meta['item']

        job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        job_desc = ''.join(job_desc)
        item['job_desc'] = job_desc
        print(job_desc)

        yield item

    def parse(self, response):

        li_list = response.xpath('//*[@id="main"]/div/div[2]/ul/li')
        print(li_list)
        for li in li_list:
            item = BossproItem()

            job_name = li.xpath('.//span[@class="job-name"]/a/text()').extract_first()
            item['job_name'] = job_name
            print(job_name)
            detail_url = 'https://www.zhipin.com/' + li.xpath('.//span[@class="job-name"]/a/@href').extract_first()

            yield scrapy.Request(detail_url,callback=self.parse_detail,meta={'item':item})

        # Paginate: crawl through page 3, then stop
        if self.page_num <= 3:
            new_url = self.url % self.page_num
            self.page_num += 1

            yield scrapy.Request(new_url, callback=self.parse)

items.py


import scrapy

class BossproItem(scrapy.Item):

    job_name = scrapy.Field()
    job_desc = scrapy.Field()

middlewares.py


from scrapy import signals

class BossproSpiderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):

        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):

        return None

    def process_spider_output(self, response, result, spider):

        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):

        pass

    def process_start_requests(self, start_requests, spider):

        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class BossproDownloaderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):

        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):

        return None

    def process_response(self, request, response, spider):

        return response

    def process_exception(self, request, exception, spider):

        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

pipelines.py


class BossproPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
Personal WeChat official account: yk 坤帝
Reply "scrapy" in the account's backend to get the full source code

settings.py


BOT_NAME = 'bossPro'

SPIDER_MODULES = ['bossPro.spiders']
NEWSPIDER_MODULE = 'bossPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'

ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

ITEM_PIPELINES = {
   'bossPro.pipelines.BossproPipeline': 300,
}

2. Douban spider (db)

db.py

import scrapy

class DbSpider(scrapy.Spider):
    name = 'db'

    start_urls = ['https://www.douban.com/group/topic/157797102/']

    def parse(self, response):
        li_list = response.xpath('//*[@id="comments"]/li')

        for li in li_list:

            comments = li.xpath('./div[2]/p/text()').extract_first()
            print(comments)

3. fbsPro (scrapy-redis distributed spider)

fbs.py


from scrapy_redis.spiders import RedisCrawlSpider
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from fbsPro.items import FbsproItem

class FbsSpider(RedisCrawlSpider):
    name = 'fbs'

    redis_key = 'sun'

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = FbsproItem()

        yield item
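
The listing above only shows the spider. For a RedisCrawlSpider to actually run distributed, settings.py must point the scheduler and dupefilter at a shared Redis instance. A minimal sketch, assuming a local Redis on the default port (host, port, and pipeline priority are assumptions, not from the original post):

# settings.py (sketch) -- wire fbsPro to scrapy-redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # shared dedup
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # shared request queue
SCHEDULER_PERSIST = True  # keep the queue and dupefilter between runs

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,  # store items in Redis
}

REDIS_HOST = '127.0.0.1'  # assumed local Redis instance
REDIS_PORT = 6379

The crawl is then seeded by pushing a start URL into the key named by redis_key, e.g. lpush sun <start_url> in redis-cli; every worker running the same spider pulls requests from that shared queue.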

4. Baidu spider (firstblood)

first.py


import scrapy

class FirstSpider(scrapy.Spider):
    name = 'first'

    start_urls = ['https://www.baidu.com/','https://www.sogou.com']

    def parse(self, response):
        print(response)

5. Site-wide campus-beauty image spider (imgsPro)

img.py

import scrapy
from imgsPro.items import ImgsproItem

class ImgSpider(scrapy.Spider):
    name = 'img'

    start_urls = ['https://sc.chinaz.com/tupian/']

    def parse(self, response):
        div_list = response.xpath('//div[@id="container"]/div')
        for div in div_list:
            # The site lazy-loads images: the real URL sits in the pseudo
            # attribute "src2", not "src"
            src = 'https:' + div.xpath('./div/a/img/@src2').extract_first()
            print(src)

            item = ImgsproItem()
            item['src'] = src

            yield item
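
The post does not include imgsPro's pipeline. Since the spider only collects image URLs, the natural companion is Scrapy's built-in ImagesPipeline; a minimal sketch (class name, storage folder, and file-naming scheme are assumptions):

pipelines.py (sketch)

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class ImgsproPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Request the image URL the spider stored in item['src']
        yield scrapy.Request(item['src'])

    def file_path(self, request, response=None, info=None):
        # Name the file after the last URL segment, e.g. "xxx.jpg"
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        # Hand the item on to any later pipeline
        return item

For this to work, settings.py also needs an IMAGES_STORE = './imgs' entry (any target folder) and the pipeline registered in ITEM_PIPELINES.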

6. Proxy-IP configuration spider (middlePro)

middle.py

import scrapy

class MiddleSpider(scrapy.Spider):
    name = 'middle'

    start_urls = ['http://www.baidu.com/s?wd=ip']

    def parse(self, response):
        page_text = response.text

        with open('ip.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)
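
The middleware that gives this project its name is not shown above. A minimal sketch of a downloader middleware that rotates the User-Agent and retries failed requests through a proxy (the UA list and proxy address are placeholders, and the class must be registered in DOWNLOADER_MIDDLEWARES):

middlewares.py (sketch)

import random

class MiddleproDownloaderMiddleware:

    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36',
        # ...more UA strings
    ]

    def process_request(self, request, spider):
        # Rotate the User-Agent on every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    def process_exception(self, request, exception, spider):
        # On failure, retry the request through a proxy (placeholder address)
        request.meta['proxy'] = 'http://ip:port'
        return request  # reschedule the corrected request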

7. Site-wide video spider (moviePro)


movie.py


import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from moviePro.items import MovieproItem
from redis import Redis

class MovieSpider(CrawlSpider):
    name = 'movie'

    start_urls = ['http://www.4567kan.com/frim/index1.html']

    rules = (
        Rule(LinkExtractor(allow=r'/frim/index1-\d+\.html'), callback='parse_item', follow=True),
    )
    conn = Redis(host = '127.0.0.1', port = 6379)

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:

            detail_url = 'http://www.4567kan.com/' + li.xpath('./div/a/@href').extract_first()

            # sadd returns 1 only when the URL was not already in the Redis set,
            # i.e. it has not been crawled before (incremental crawling)
            ex = self.conn.sadd('urls', detail_url)

            if ex == 1:

                print('URL not crawled yet; scraping its data')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail)

            else:

                print('No update yet; no new data to crawl!')

    def parse_detail(self, response):

        item = MovieproItem()
        item['name'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        item['desc'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()').extract()
        item['desc'] = ''.join(item['desc'])
        yield item
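
moviePro's items.py and pipeline are not shown. The fields follow directly from parse_detail above, and persisting new movies next to the 'urls' dedup set is one natural choice (the 'movieData' key name is an assumption):

items.py (sketch)

import scrapy

class MovieproItem(scrapy.Item):
    name = scrapy.Field()
    desc = scrapy.Field()

pipelines.py (sketch)

import json

class MovieproPipeline:
    def process_item(self, item, spider):
        # Reuse the spider's Redis connection; serialize the item as JSON
        spider.conn.lpush('movieData', json.dumps(dict(item), ensure_ascii=False))
        return item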

8. Qiushibaike video and image downloads (qiubaiPro)

qiubai.py


import scrapy
from qiubaiPro.items import QiubaiproItem

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'

    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath('//div[@id="content"]/div[1]/div[2]/div')

        for div in div_list:
            auther = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()

            content = div.xpath('./a[1]/div/span//text()').extract()
            content = ','.join(content)

            item = QiubaiproItem()
            item['auther'] = auther
            item['content'] = content

            yield item
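
No pipeline is shown for qiubaiPro either. A minimal file-persistence sketch matching the 'auther' and 'content' fields used above (the output path is an assumption):

pipelines.py (sketch)

class QiubaiproPipeline:
    fp = None

    def open_spider(self, spider):
        # Open the output file once when the crawl starts
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(item['auther'] + ':' + item['content'] + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()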

9. Sunshine Hotline inquiry platform: site-wide spider for the latest inquiries and related policies (sunPro)

sun.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class SunSpider(CrawlSpider):
    name = 'sun'

    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    link = LinkExtractor(allow=r'id=1&page=\d+')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)

10. NetEase News site-wide spider (wangyiPro)

wangyi.py

import scrapy

from wangyiPro.items import WangyiproItem
from selenium import webdriver
class WangyiSpider(scrapy.Spider):
    name = 'wangyi'

    start_urls = ['https://news.163.com/']

    models_urls = []

    def __init__(self):
        # One shared Selenium browser for the whole crawl (used by the downloader middleware)
        self.bro = webdriver.Chrome()

    def parse(self, response):
        li_list = response.xpath('//*[@id="js_festival_wrap"]/div[3]/div[2]/div[2]/div[2]/div/ul/li')
        alist = [3, 4, 6, 7, 8]  # indexes of the target news sections in the nav list
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)

        for url in self.models_urls:
            yield scrapy.Request(url,callback= self.parse_model)

    def parse_model(self,response):

        div_list = response.xpath('/html/body/div[1]/div[3]/div[4]/div[1]/div/div/ul/li/div/div')

        for div in div_list:
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()

            item = WangyiproItem()
            item['title'] = title

            # Pass the half-filled item to the detail callback via request meta
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        content = response.xpath('//*[@id="content"]//text()').extract()
        content = ''.join(content)

        item = response.meta['item']

        item['content'] = content

        yield item

    def closed(self, spider):
        # Shut down the shared Selenium browser when the spider finishes
        self.bro.quit()
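
The spider above creates a Selenium browser, but the middleware that uses it is not shown. The usual pattern is a downloader middleware that re-renders the dynamically loaded module pages in the browser and swaps in the rendered HTML; a sketch (the fixed sleep is a crude wait and an assumption):

middlewares.py (sketch)

from time import sleep
from scrapy.http import HtmlResponse

class WangyiproDownloaderMiddleware:

    def process_response(self, request, response, spider):
        # Only the module list pages need JS rendering
        if request.url in spider.models_urls:
            bro = spider.bro
            bro.get(request.url)   # let the browser execute the page's JS
            sleep(2)               # wait for the news list to load
            # Replace the original response with the rendered page
            return HtmlResponse(url=request.url, body=bro.page_source,
                                encoding='utf-8', request=request)
        return response

DOWNLOADER_MIDDLEWARES in settings.py must register this class for it to take effect.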

11. xiaohuar.com (China campus-beauty site): site-wide image crawl (xiaohuaPro)

xiaohua.py

# -*- coding: utf-8 -*-
# Sophomore year
# Monday, February 22, 2021
# Winter break; classes resume March 7
# Personal WeChat official account: yk 坤帝
# Reply "scrapy" in the account's backend to get the full source code
import scrapy

class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'

    allowed_domains = ['www.xiaohuar.com']  # must match the target site or requests are filtered

    start_urls = ['http://www.xiaohuar.com/daxue/']
    url = 'http://www.xiaohuar.com/daxue/index_%d.html'
    page_num = 2

    def parse(self, response):
        li_list = response.xpath('//*[@id="wrap"]/div/div/div')
        for li in li_list:
            img_name = li.xpath('./div/div[1]/a/text()').extract_first()
            print(img_name)
            #li.xpath('./div/a/img/@src')

        # Paginate: crawl through page 11, then stop
        if self.page_num <= 11:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)

middlewares.py


from scrapy import signals

class XiaohuaproSpiderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):

        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):

        return None

    def process_spider_output(self, response, result, spider):

        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):

        pass

    def process_start_requests(self, start_requests, spider):

        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class XiaohuaproDownloaderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):

        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):

        return None

    def process_response(self, request, response, spider):

        return response

    def process_exception(self, request, exception, spider):

        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

settings.py


BOT_NAME = 'xiaohuaPro'

SPIDER_MODULES = ['xiaohuaPro.spiders']
NEWSPIDER_MODULE = 'xiaohuaPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'

LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False

12. Database example


import pymysql

db = pymysql.connect(host='192.168.31.19', user='root', password='200829', database='wj')

cursor = db.cursor()

sql = "select version()"

cursor.execute(sql)

data = cursor.fetchone()
print(data)

cursor.close()
db.close()
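
The same connection pattern can back a Scrapy pipeline so items land in MySQL instead of the console. A sketch reusing the credentials above (the table name and two-column schema are assumptions):

import pymysql

class MysqlPipeline:

    def open_spider(self, spider):
        self.db = pymysql.connect(host='192.168.31.19', user='root',
                                  password='200829', database='wj')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        try:
            # Parameterized insert; 'qiubai(auther, content)' is an assumed table
            self.cursor.execute('insert into qiubai values (%s, %s)',
                                (item['auther'], item['content']))
            self.db.commit()
        except Exception:
            self.db.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()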

Original: https://blog.csdn.net/qq_45803923/article/details/116987776
Author: yk 坤帝
Title: The newest and most complete Python framework guide: Scrapy (systematic study, crawling campus-beauty images site-wide). Noticeably improve your scraping skills (source code included) and break through all kinds of anti-scraping measures

