码迷,mamicode.com
首页 > 其他好文 > 详细

scrapy案例:爬取翼蜂网络新闻列表和详情页面

时间:2018-05-26 16:46:25      阅读:527      评论:0      收藏:0      [点我收藏+]

标签:htm   last   item   base   col   sel   def   bsp   abc   

# -*- coding: utf-8 -*-
import scrapy
from Demo.items import DemoItem


class AbcSpider(scrapy.Spider):
    """Crawl the Yifeng Network (cnyifeng.net) news list pages and follow
    each entry to its detail page, yielding one DemoItem per article.

    Fields filled on the list page: title, zhaiyao (summary), times.
    Fields filled on the detail page: viewcount, content.
    """

    name = "abcd"
    allowed_domains = ["www.cnyifeng.net"]

    # List pages are paginated as /news/1/<page>.html; build page URLs from this.
    baseURL = "http://www.cnyifeng.net/news/1/{}.html"
    offset = 1
    start_urls = [baseURL.format(offset)]

    def parse(self, response):
        """Parse one news list page.

        Yields a Request per article detail link (forwarding the partially
        filled item via ``meta``), then follows pagination until the
        "next page" control is disabled.
        """
        node_list = response.xpath("//div[@class='news_con']/dl[@class='news_dl']")

        for node in node_list:
            item = DemoItem()

            # Each field may be absent on some entries; fall back to ''
            # instead of raising IndexError on extract()[0].
            title_texts = node.xpath(".//a[@class='dt_1']//text()").extract()
            item['title'] = title_texts[0] if title_texts else ''

            zhaiyao_texts = node.xpath("./dd//text()").extract()
            item['zhaiyao'] = zhaiyao_texts[0] if zhaiyao_texts else ''

            time_texts = node.xpath(".//span//text()").extract()
            item['times'] = time_texts[0] if time_texts else ''

            hrefs = node.xpath(".//a[@class='dt_1']/@href").extract()
            if not hrefs:
                # Entry without a detail link: nothing more to crawl for it.
                continue
            detail_url = "http://www.cnyifeng.net" + hrefs[0]

            # Hand the item to the detail-page callback through meta.
            yield scrapy.Request(detail_url, callback=self.parse_detail_info, meta=item)

        # Pagination: a <span class="disabled"> appears in the pager when the
        # corresponding control (e.g. "next page") cannot be followed.
        disabled = response.xpath("//div[@class='flickr']//span[@class='disabled']")
        if not disabled:
            next_hrefs = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()
            if next_hrefs:
                yield scrapy.Request("http://www.cnyifeng.net" + next_hrefs[0],
                                     callback=self.parse)
        else:
            # Original code compared str(ToNext != ...), which is always truthy —
            # pagination would never stop. Compare the text directly instead.
            # NOTE(review): the marker text looks mojibake-damaged in the source
            # ("下一页?"); confirm the exact disabled-label text on the live site.
            to_next = disabled.xpath(".//text()").extract()[0]
            if to_next != u'下一页?':
                next_hrefs = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()
                if next_hrefs:
                    yield scrapy.Request("http://www.cnyifeng.net" + next_hrefs[0],
                                         callback=self.parse)

    def parse_detail_info(self, response):
        """Fill detail-page fields on the item forwarded from the list page.

        Yields the completed item only when article content is present
        (preserves the original behavior of dropping content-less pages).
        """
        item = response.meta  # item built by parse() and passed via meta

        # Placeholder view count — the page exposes no real counter.
        item['viewcount'] = 90

        content_list = response.xpath(
            "//div[@id='left']/div[@class='content_arc']/span/text()").extract()
        if content_list:
            # join() instead of repeated '+' concatenation (avoids quadratic cost
            # and the py2-style str() call on unicode text).
            item['content'] = ''.join(part.strip() for part in content_list)
            yield item

 

scrapy案例:爬取翼蜂网络新闻列表和详情页面

标签:htm   last   item   base   col   sel   def   bsp   abc   

原文地址:https://www.cnblogs.com/zqrios/p/9093165.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!