码迷,mamicode.com
首页 > 其他好文 > 详细

京东进口牛奶的爬取

时间:2019-07-02 22:58:37      阅读:302      评论:0      收藏:0      [点我收藏+]

标签:you   col   body   clu   scrapy   from   elf   callback   append   

# -*- coding: utf-8 -*-
import scrapy
import json
import csv
from milk.items import MilkItem

class MilkspiderSpider(scrapy.Spider):
    """Crawl JD.com search results for imported milk and dump them to CSV.

    Flow: ``parse`` reads the product cards on the search page,
    ``parse_detail`` builds the comment-summary request for each product, and
    ``parse_comment`` merges the comment counters into the item and rewrites
    the CSV file.
    """

    name = "milkspider"
    start_urls = [
        "https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6"
        "&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0"
        "&wq=%E8%BF%9B%E5%8F%A3"
        "&pvid=96ab0296e9ce494fb251b716911d93ec"
    ]
    # Class-level list: every finished item is accumulated here and the CSV
    # file is regenerated from it.
    data_list = []

    # Endpoint returning the comment counters; the product id is appended.
    COMMENT_SUMMARY_URL = (
        "https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
    )
    CSV_PATH = "./京东进口牛奶.csv"
    CSV_FIELDS = [
        "good_id", "good_name", "shop_name", "good_url", "total_comment",
        "good_comment", "video_count", "general_count", "poor_count",
        "good_price",
    ]
    # (item field, key inside the first "CommentsCount" entry of the response)
    COMMENT_FIELDS = (
        ("total_comment", "CommentCountStr"),
        ("good_comment", "GoodCountStr"),
        ("video_count", "VideoCountStr"),
        ("general_count", "GeneralCountStr"),
        ("poor_count", "PoorCountStr"),
    )

    def parse(self, response):
        """Extract one item per product card and request its detail page.

        Yields a ``scrapy.Request`` per product carrying the partially filled
        ``MilkItem`` in ``meta["item"]``. Cards without a SKU or a link are
        skipped, because both are required by the follow-up requests.
        """
        for li in response.xpath('//li[@class="gl-item"]'):
            # Paths starting with "." are evaluated relative to the card.
            good_id = li.xpath("./@data-sku").get()
            good_url = li.xpath(
                './/div[@class="p-name p-name-type-2"]/a/@href'
            ).get()
            if not good_id or not good_url:
                self.logger.warning(
                    "Skipping product card without sku/url (sku=%r, url=%r)",
                    good_id, good_url,
                )
                continue

            shop_name = li.xpath('.//a[@class="curr-shop"]/text()').get()

            name_parts = li.xpath(
                './/div[@class="p-name p-name-type-2"]/a/em/text()'
            ).getall()
            # The name is split across several text nodes; glue them together
            # and drop commas and "\n\t" runs.
            good_name = (
                ",".join(name_parts).strip().replace(",", "").replace("\n\t", "")
            )

            # Links that do not already carry the scheme get "https:" prefixed.
            if not good_url.startswith("https:"):
                good_url = "https:" + good_url

            price_parts = li.xpath(
                './/div[@class="p-price"]/strong//text()'
            ).getall()
            good_price = "".join(price_parts).replace(",", "")

            # The comment counters are not in the search page source; they are
            # filled in by the follow-up callbacks.
            item = MilkItem()
            item["shop_name"] = shop_name
            item["good_name"] = good_name
            item["good_price"] = good_price
            item["good_id"] = good_id
            item["good_url"] = good_url
            yield scrapy.Request(
                url=good_url, meta={"item": item}, callback=self.parse_detail
            )

    def parse_detail(self, response):
        """Request the comment summary for the item carried in ``meta``.

        The detail page body itself is not read here; only the item's
        ``good_id`` is used to build the comment-summary URL.
        """
        item = response.meta["item"]
        comment_info_url = self.COMMENT_SUMMARY_URL + item["good_id"]
        yield scrapy.Request(
            url=comment_info_url,
            meta={"item": item},
            callback=self.parse_comment,
        )

    def parse_comment(self, response):
        """Merge the comment counters into the item and refresh the CSV file.

        Returns a one-element list holding the completed item, so each item
        is handed to Scrapy once rather than once per later response.

        Raises whatever ``json.loads`` or the key/index lookups raise when the
        response does not have the expected structure.
        """
        item = response.meta["item"]

        # response.body is bytes; decode leniently so undecodable bytes become
        # U+FFFD instead of raising.
        text = response.body.decode("utf-8", "replace")
        # NOTE(review): the literal removed here was lost in the pasted source
        # (it appeared as `??`); presumably it was the U+FFFD replacement
        # characters produced by the lenient decode above - confirm.
        json_str = text.replace("\ufffd", "")
        payload = json.loads(json_str)

        summary = payload["CommentsCount"][0]
        for field, key in self.COMMENT_FIELDS:
            item[field] = summary[key]

        self.data_list.append(item)
        self._write_csv()
        return [item]

    def _write_csv(self):
        """Rewrite the CSV file from scratch with every item collected so far."""
        with open(self.CSV_PATH, "w", encoding="utf-8", errors="ignore",
                  newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.CSV_FIELDS)
            writer.writeheader()
            writer.writerows(self.data_list)

 

items.py(定义 MilkItem 的模块)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MilkItem(scrapy.Item):
    """Record for one product: listing data plus comment counters."""

    # Listing data.
    good_id = scrapy.Field()
    good_name = scrapy.Field()
    good_url = scrapy.Field()
    good_price = scrapy.Field()
    shop_name = scrapy.Field()

    # Comment counters.
    total_comment = scrapy.Field()
    good_comment = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()
    video_count = scrapy.Field()

 

start(启动爬虫的脚本)

from scrapy import cmdline

# Launch the "milkspider" spider with the same argv the
# `scrapy crawl milkspider` command line would supply.
cmdline.execute(["scrapy", "crawl", "milkspider"])

 

京东进口牛奶的爬取

标签:you   col   body   clu   scrapy   from   elf   callback   append   

原文地址:https://www.cnblogs.com/kenD/p/11123581.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!