Scraping a site's product information with the Scrapy framework

Project scenario:

Crawl Suning.com's computer product listings (desktops, laptops, and tablets), plus the reviews for each product.

Framework: Scrapy

Storage: MongoDB. The most important part of the whole crawl is learning to analyze the request URLs; only once the URL structure is clear can you get at the information you want.

URL analysis

The product-list URL:

https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp=0&il=0&st=0&iy=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&paging=0&sub=1&jzq=40634

Two parameters here matter (see the sketch after this list):

  1. cp is the page number: cp=0,1,2,3,…
  2. paging is the scroll-load request. Each page loads 30 products at a time and loads more as you scroll down; paging runs up to 3, and the four chunks together make one full page. (During analysis, paging actually accepted values from 0 to 198, all returning non-duplicate products.)
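
To make the two parameters concrete, here is a minimal sketch (not from the original post) that assembles a list URL from cp and paging, holding the remaining query parameters fixed at the captured values:

BASE_URL = ("https://list.suning.com/emall/searchV1Product.do"
            "?ci=258003&pg=03&yjhx=&cp={cp}&il=0&st=0&iy=0&isNoResult=0"
            "&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&paging={paging}&sub=1&jzq=40634")

def list_url(cp, paging):
    """Build the product-list URL for page `cp` and scroll chunk `paging` (0-3)."""
    return BASE_URL.format(cp=cp, paging=paging)

# e.g. the second scroll chunk of the first page:
print(list_url(0, 1))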

settings.py

DOWNLOAD_DELAY = 3  # wait 3 seconds between requests

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
}
ITEM_PIPELINES = {
    'datasetSpider.pipelines.SuComputerPipeline': 300,
}

MONGODB_HOST = "localhost"
MONGODB_PORT = 27017
MONGODB_DBNAME = 'suning'
MONGODB_COMPUTER_LIST = 'su_computer_list1'
MONGODB_COMPUTER_REVIEW = 'su_computer_review1'
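
The pipeline below reads these values by importing the settings module directly. A more idiomatic alternative (a sketch, not the post's code) is to pull them from crawler.settings via from_crawler, which also respects per-spider setting overrides:

import pymongo

class SuComputerPipelineAlt:
    def __init__(self, host, port, dbname):
        self.host, self.port, self.dbname = host, port, dbname

    @classmethod
    def from_crawler(cls, crawler):
        # read the custom settings through Scrapy's settings object
        s = crawler.settings
        return cls(host=s.get("MONGODB_HOST", "localhost"),
                   port=s.getint("MONGODB_PORT", 27017),
                   dbname=s.get("MONGODB_DBNAME", "suning"))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.db = self.client[self.dbname]

    def close_spider(self, spider):
        self.client.close()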

items.py


class SuNingComputer(scrapy.Item):
    ProductId = scrapy.Field()
    ProductName = scrapy.Field()
    ProductDescription = scrapy.Field()
    ProductUrl = scrapy.Field()
    ProductCategories = scrapy.Field()
    ProductPrice = scrapy.Field()
    StoreName = scrapy.Field()
    ProductParameter = scrapy.Field()

    ShopId = scrapy.Field()

class SuNingComuputerReview(scrapy.Item):
    ProductId = scrapy.Field()
    ReviewId = scrapy.Field()
    ReviewEr = scrapy.Field()
    ReviewContent = scrapy.Field()
    ReviewRating = scrapy.Field()
    ReviewHelful = scrapy.Field()
    ReviewTime = scrapy.Field()

The items file defines two classes: SuNingComputer is the item class for the computer product list, and SuNingComuputerReview is the item class for each computer product's reviews.
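
As a quick illustration (assumed usage, not from the post), a scrapy.Item behaves like a dict restricted to its declared fields, which is what lets the pipeline call dict(item) later:

from datasetSpider.items import SuNingComputer

item = SuNingComputer()
item["ProductId"] = "10710777386"   # hypothetical id
item["ProductPrice"] = "4999.00"
print(dict(item))   # {'ProductId': '10710777386', 'ProductPrice': '4999.00'}
# item["Color"] = "silver"  # would raise KeyError: Color is not a declared field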

pipelines.py

import pymongo

from datasetSpider import settings
from datasetSpider.items import SuNingComputer, SuNingComuputerReview

class SuComputerPipeline(object):

    def open_spider(self, spider):
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        dbname = settings.MONGODB_DBNAME
        computer_list = settings.MONGODB_COMPUTER_LIST
        computer_review = settings.MONGODB_COMPUTER_REVIEW

        self.mongodb_client = pymongo.MongoClient(host=host, port=port)
        self.db = self.mongodb_client[dbname]
        self.db_su_computer_list = self.db[computer_list]
        self.db_su_computer_review = self.db[computer_review]

    def process_item(self, item, spider):
        insert_mongo = dict(item)

        # Collection.insert() was removed in PyMongo 4; use insert_one()
        if isinstance(item, SuNingComputer):
            self.db_su_computer_list.insert_one(insert_mongo)
        elif isinstance(item, SuNingComuputerReview):
            self.db_su_computer_review.insert_one(insert_mongo)

        return item

    def close_spider(self, spider):

        self.mongodb_client.close()
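
After a crawl, a quick sanity check (separate from the pipeline; collection names assumed to match the settings above) confirms that documents actually landed in MongoDB:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
db = client["suning"]
print(db["su_computer_list1"].count_documents({}))
print(db["su_computer_list1"].find_one({}, {"ProductName": 1, "ProductPrice": 1}))
print(db["su_computer_review1"].count_documents({}))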

computer.py — the spider logic

import ast
import json
import re

import scrapy
from datasetSpider.items import SuNingComputer, SuNingComuputerReview

class ComputerSpider(scrapy.Spider):

    name = 'computer'
    # allowed_domains takes bare domain names, not URLs; 'suning.com' also
    # covers the list.suning.com, review.suning.com and icps.suning.com hosts
    allowed_domains = ['suning.com']

    start_urls = ['https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp=0&il=0&st=0&iy=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&sub=0&jzq=40609']

    # paging (scroll-chunk) counter within one page: 0-3
    page_num = 0
    same_page_url = "https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp="
    same_page_url_center = "&il=0&st=0&iy=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&paging="
    same_page_url_tail = "&sub=1&jzq=40634"

    # page (cp) counter; the crawl runs it from 15 up to (but not including) 50
    next_page_num = 15
    next_page_url = "https://list.suning.com/emall/searchV1Product.do?ci=258003&pg=03&yjhx=&cp="
    next_page_url_tail = "&il=0&st=0&iy=0&adNumber=0&isNoResult=0&n=1&sesab=ACBAABC&id=IDENTIFYING&cc=773&sub=1&jzq=40610"

    def parse(self, response):
        product_list = response.xpath("//div[@class='title-selling-point']")

        for each in product_list:
            product_url = each.xpath("./a/@href").extract()[0]
            nextUrl = 'https:' + product_url
            # sa-data holds a dict-like literal with the product and shop ids;
            # ast.literal_eval parses it without eval()'s code-execution risk
            product_data = ast.literal_eval(each.xpath("./a/@sa-data").extract()[0])

            ProductId = product_data["prdid"]
            ShopId = product_data["shopid"]

            yield scrapy.Request(url=nextUrl, meta={'ProductId': ProductId, 'ShopId': ShopId, 'ProductUrl': nextUrl}, callback=self.productDetail)

        print("page_num:")
        print(self.page_num)
        if self.page_num < 4:
            yield scrapy.Request(self.same_page_url + str(self.next_page_num) + self.same_page_url_center + str(self.page_num) + self.same_page_url_tail, callback=self.parse)
            self.page_num += 1

        if self.page_num >=4:
            self.page_num = 0;
            if self.next_page_num  50:
                yield scrapy.Request(self.next_page_url + str(self.next_page_num) + self.next_page_url_tail,callback=self.parse)
                self.next_page_num += 1

    def productDetail(self, response):
        get_meta = response.meta

        product_id = get_meta['ProductId']

        product_url = get_meta['ProductUrl']

        shop_id = get_meta['ShopId']

        product_description = response.xpath("//meta[@name='description']/@content").extract()[0]

        product_parameter = response.xpath("//table[@id='itemParameter']/tbody/tr")
        parameter_dict = {}
        for i in product_parameter:
            # td[1] holds the parameter name; td[2] holds the value, either
            # as plain text or inside an <a> link
            text1 = i.xpath("./td[1]/div/span/text()").extract()
            text2 = i.xpath("./td[2]/text()").extract()
            text3 = i.xpath("./td[2]/a/text()").extract()

            punctuation = '!,;:?".\''
            if len(text1) != 0 and len(text2) != 0:
                # sanitize the key: characters like '.' cause problems in MongoDB field names
                key1 = re.sub(r'[{}]+'.format(punctuation), "_", text1[0])
                parameter_dict[key1] = text2[0]
            if len(text1) != 0 and len(text3) != 0:
                key2 = text1[0]
                parameter_dict[key2] = text3[0]

        # Product metadata sits as JSON inside an inline <script>; pull out
        # individual "key":"value" pairs with a regex and re-parse each one
        script = response.xpath("//script[@type='text/javascript']/text()").extract()
        cluster = re.findall(r'\"clusterId\":\".*?\"', script[0])
        cluster_id = json.loads("{" + cluster[0] + "}")['clusterId']

        product_display = re.findall(r'\"itemDisplayName\":\".*?\"', script[0])
        product_name = json.loads("{" + product_display[0] + "}")['itemDisplayName']

        flagshipName = re.findall(r'\"flagshipName\":\".*?\"', script[0])
        product_store_name = json.loads("{" + flagshipName[0] + "}")['flagshipName']

        product_category = {}
        category = re.findall(r'\"categoryName\d\":\".*?\"', script[0])
        categoryName = {}
        for it in category:
            categoryName.update(json.loads("{"+it+"}"))
        product_category['0'] = categoryName['categoryName1']
        product_category['1'] = categoryName['categoryName2']
        product_category['2'] = categoryName['categoryName3']

        brandName = re.findall(r'\"brandName\":\".*?\"', script[0])
        itemDisplayName = re.findall(r'\"itemDisplayName\":\".*?\"', script[0])

        brandName = json.loads("{"+brandName[0]+"}")
        itemDisplayName = json.loads("{"+itemDisplayName[0]+"}")

        product_category['3'] = brandName['brandName']
        product_category['4'] = itemDisplayName['itemDisplayName']

        price_url = "https://icps.suning.com/icps-web/getVarnishAllPriceNoCache/0000000" + str(product_id) + "_773_7730199_" + str(shop_id) + "_1_getClusterPrice.jsonp"

        meta = {"ProductId":product_id,"ShopId":shop_id,"ProductUrl":product_url,
                "ProductName":product_name,"ProductDescription":product_description,
                "ProductCategories":product_category,"StoreName":product_store_name,
                "ProductParameter":parameter_dict,"cluster_id":cluster_id}
        yield scrapy.Request(url=price_url,meta=meta,callback=self.Price)

    def Price(self, response):
        get_meta = response.meta
        # the price endpoint returns JSONP; strip the callback(...) wrapper first
        price_dict = re.findall(r'[(](.*)[)]', response.text)[0]
        price_dict = json.loads(price_dict)[0]

        product_price = price_dict['price']

        product_id = get_meta["ProductId"]
        shop_id = get_meta["ShopId"]
        product_url = get_meta["ProductUrl"]
        product_name = get_meta["ProductName"]
        product_description = get_meta["ProductDescription"]
        product_category = get_meta["ProductCategories"]
        product_store_name = get_meta["StoreName"]
        parameter_dict = get_meta["ProductParameter"]
        cluster_id = get_meta["cluster_id"]

        su_item = SuNingComputer()
        su_item["ProductId"] = product_id
        su_item["ShopId"] = shop_id
        su_item['ProductUrl'] = product_url
        su_item["ProductName"] = product_name
        su_item["ProductDescription"] = product_description
        su_item["ProductCategories"] = product_category
        su_item["ProductPrice"] = product_price
        su_item["StoreName"] = product_store_name
        su_item["ProductParameter"] = parameter_dict
        yield su_item

        total_review_url = "https://review.suning.com/ajax/cluster_review_satisfy/cluster-" + str(cluster_id) + "-0000000" + str(product_id) + "-" + str(shop_id) + "-----satisfy.htm"
        yield scrapy.Request(url=total_review_url,meta={"product_id": product_id, "shop_id": shop_id, "cluster_id": cluster_id},callback=self.Review)

    def Review(self, response):
        get_meta = response.meta
        cluster_id = get_meta["cluster_id"]
        product_id = get_meta["product_id"]
        shop_id = get_meta["shop_id"]

        # fetch up to 50 pages of reviews, 10 reviews per page
        for i in range(1, 51):
            review_url = "https://review.suning.com/ajax/cluster_review_lists/cluster-" + str(cluster_id) + "-0000000" + str(product_id) + "-" + str(shop_id) + "-total-" + str(i) + "-default-10-----reviewList.htm"
            yield scrapy.Request(url=review_url, meta={"product_id": product_id}, callback=self.ReviewList)

    def ReviewList(self, response):
        get_meta = response.meta
        product_id = get_meta["product_id"]

        # JSONP again: keep only the JSON between the callback parentheses
        review_list = re.findall(r'[(](.*)[)]', response.text)
        review_text = ''.join(review_list)
        review_dict = json.loads(review_text)

        if "commodityReviews" in review_dict:
            commodityReviews = review_dict['commodityReviews']
            for item in commodityReviews:

                commodityReviewId = item['commodityReviewId']
                content = item['content']
                qualityStar = item['qualityStar']
                publishTime = item['publishTime']
                userInfo = item['userInfo']
                nickName = userInfo['nickName']

                usefulCnt_url = "https://review.suning.com/ajax/useful_count/" + str(commodityReviewId) + "-usefulCnt.htm"
                meta = {"product_id": product_id,"commodityReviewId":commodityReviewId,"content":content,"qualityStar":qualityStar,"publishTime":publishTime,"nickName":nickName}
                yield scrapy.Request(url=usefulCnt_url,meta=meta,callback=self.ReviewUsefulCnt)

    def ReviewUsefulCnt(self, response):
        get_meta = response.meta
        ProductId = get_meta["product_id"]
        ReviewId = get_meta["commodityReviewId"]
        ReviewEr = get_meta["nickName"]
        # the original substitution pattern was garbled; stripping embedded
        # HTML tags from the review text is the assumed intent
        ReviewContent = re.sub(r'<[^>]+>', "", get_meta["content"])
        ReviewRating = get_meta["qualityStar"]
        ReviewTime = get_meta["publishTime"]

        reviewUsefuAndReplylList = re.findall(r'[(](.*?)[)]', response.text)[0]
        reviewUsefuAndReplylList = json.loads(reviewUsefuAndReplylList)
        ReviewHelful = reviewUsefuAndReplylList["reviewUsefuAndReplylList"][0]["usefulCount"]

        reviewItem = SuNingComuputerReview()
        reviewItem["ProductId"] = ProductId
        reviewItem["ReviewId"] = ReviewId
        reviewItem["ReviewEr"] = ReviewEr
        reviewItem["ReviewContent"] = ReviewContent
        reviewItem["ReviewRating"] = ReviewRating
        reviewItem["ReviewTime"] = ReviewTime
        reviewItem["ReviewHelful"] = ReviewHelful
        yield reviewItem
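
For testing the review endpoints outside Scrapy, a standalone sketch like the following works (the URL format is the one captured above; the three ids below are placeholders, not real products). The response is JSONP, so the JSON payload has to be cut out of the callback(...) wrapper before parsing:

import json
import re
import requests

cluster_id, product_id, shop_id = "0000000000", "000000000", "0000000000"  # placeholder ids
url = ("https://review.suning.com/ajax/cluster_review_lists/cluster-" + cluster_id
       + "-0000000" + product_id + "-" + shop_id + "-total-1-default-10-----reviewList.htm")

resp = requests.get(url, timeout=10)
match = re.search(r'\((.*)\)', resp.text, re.S)  # cut the JSON out of the JSONP wrapper
if match:
    data = json.loads(match.group(1))
    for review in data.get("commodityReviews", []):
        print(review["content"][:50])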

Original: https://blog.csdn.net/weixin_54663210/article/details/120690060
Author: weixin_54663210
Title: scrapy框架爬取网站商品信息
