
Advanced Web Scraping (Part 7): Scrapy Usage Examples



Let's go straight to the code.

Basic middleware usage:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import random


user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
class MidlleproDownloaderMiddleware(object):
    # Intercept normal requests
    def process_request(self, request, spider):
        # Spoof the User-Agent
        request.headers['User-Agent'] = random.choice(user_agent_list)
        # print(request.headers['User-Agent'])

        # Set a proxy
        request.meta['proxy'] = 'http://111.29.3.194:8080'
        print(request.meta['proxy'])
        return None

    # Intercept all responses
    def process_response(self, request, response, spider):
        return response

    # Intercept request objects that raised an exception
    def process_exception(self, request, exception, spider):
        print(request)
        # Resend the corrected request object
        return request

 

The settings also need to be updated:

DOWNLOADER_MIDDLEWARES = {
   'midllePro.middlewares.MidlleproDownloaderMiddleware': 543,
}
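
The middleware above hard-codes a single proxy address. If you want to rotate proxies (as hinted at near the end of this post), a minimal sketch might look like the following; the proxy addresses and the class name are placeholders, not verified endpoints:

import random

# Hypothetical pool of proxies; replace with addresses you actually control and have verified.
proxy_list = [
    'http://111.29.3.194:8080',
    'http://112.85.130.205:9999',
]

class RotatingProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Pick a different proxy for each outgoing request
        request.meta['proxy'] = random.choice(proxy_list)
        return None

    def process_exception(self, request, exception, spider):
        # On failure, switch to another proxy and resend the request
        request.meta['proxy'] = random.choice(proxy_list)
        return request

Like the original middleware, this only takes effect after it is registered in DOWNLOADER_MIDDLEWARES.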

Using Selenium inside Scrapy

Scraping NetEase (163) news

spiders:

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from wangyiPeo.items import WangyipeoItem
class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com']
    model_urls = []
    bro = webdriver.Chrome(executable_path=r'D:\study\chromedriver.exe')

    def parse(self, response):
        # Parse out the URLs of the news section pages
        li_list = response.xpath('/html/body/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/ul[1]/li')
        model_index = [1, 2]
        print(len(li_list))
        for index in model_index:
            # li is the <li> tag of one selected section
            li = li_list[index]
            # URL of that section
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # Manually send a request for each section URL
            yield scrapy.Request(model_url, callback=self.parse_model)

    def parse_model(self, response):
        # Parses the news titles and detail-page URLs on each section page.
        # The response received here does NOT contain the dynamically loaded
        # news data (it does not meet our needs yet; the middleware below fixes that).
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div[1]/div/ul/li/div')  # 1 + 5 + n
        for div in div_list:
            title = div.xpath('./div[1]/div/div[1]/h3/a/text()').extract_first()
            detail_url = div.xpath('./div[1]/a/@href').extract_first()
            item = WangyipeoItem()
            item['title'] = title
            yield scrapy.Request(detail_url, callback=self.parse_new_detail, meta={'item': item})

    def parse_new_detail(self, response):
        # Parse the news body text
        item = response.meta['item']
        content = response.xpath('/html/body/div[3]/div[1]/div[3]/div[2]//text()').extract()
        # Drop whitespace-only fragments and join the rest into one string
        content = ''.join([x.strip() for x in content if x.strip()])
        print(content)
        item['content'] = content
        yield item

    # This method runs only once, when the whole spider finishes
    def closed(self, spider):
        self.bro.quit()

items:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WangyipeoItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()

middlewares:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep


class WangyipeoDownloaderMiddleware(object):

    # Parameters:
    # request: the request object corresponding to the intercepted response
    # response: every intercepted response object (1 + 5 + n)
    # spider: the spider instance, which lets the spider and the middleware share data
    def process_response(self, request, response, spider):
        # Intercept the responses of the section pages and replace them with
        # new responses that do contain the dynamically loaded data.
        # 1. Pick out the section-page responses that do not meet our needs
        if request.url in spider.model_urls:
            # These are the section-page responses we need to replace.
            # url: the URL of the request that produced this response
            # body: the response data, taken from Selenium's page_source
            bro = spider.bro
            bro.get(request.url)
            sleep(5)
            page_text = bro.page_source  # contains the dynamically loaded news data
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            return response

The pipeline, written in three steps (open_spider / process_item / close_spider):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql

class WangyipeoPipeline(object):
    conn = None
    curse = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='Spider', charset='utf8')
        # print(self.conn)

    def process_item(self, item, spider):
        sql = 'insert into wangyi values ("%s","%s")' % (item['title'], item['content'])
        self.curse = self.conn.cursor()
        try:
            self.curse.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # self.curse.close()
        self.conn.close()
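
The pipeline assumes a table named wangyi with two string columns already exists in the Spider database. A one-off setup sketch with pymysql, where the column names and types are my assumption (the original post does not show the schema):

import pymysql

# One-off setup script; credentials match the pipeline above.
conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='Spider', charset='utf8')
cursor = conn.cursor()
# Assumed schema: one column for the title, one for the article body.
cursor.execute('create table if not exists wangyi (title varchar(255), content text)')
conn.commit()
cursor.close()
conn.close()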

settings configuration:

DOWNLOADER_MIDDLEWARES = {
   'wangyiPeo.middlewares.WangyipeoDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'wangyiPeo.pipelines.WangyipeoPipeline': 300,
}

A small example: downloading images with ImagesPipeline

# -*- coding: utf-8 -*-
import scrapy
from imgPro.items import ImgproItem

class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/meinvtupian.html']

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # src2 is queried because the site lazy-loads images
            img_src = div.xpath('./div/a/img/@src2').extract_first()
            img_src = 'https:' + img_src
            item = ImgproItem()
            item['img_src'] = img_src

            yield item



pipelines:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# class ImgproPipeline(object):
#     def process_item(self, item, spider):
#         return item
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class ImgproPipeline(ImagesPipeline):
    # Issues the request for the media resource (downloads the data);
    # item is the item object submitted by the spider
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_src'])

    # Specifies the storage path (here: the file name taken from the URL)
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]

    # Passes the item on to the next pipeline class to be executed
    def item_completed(self, results, item, info):
        return item

settings:

# Name/path of the folder where images are stored
IMAGES_STORE = './imgLibs'
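
For the custom ImagesPipeline subclass to actually run, it also needs to be registered in ITEM_PIPELINES; a sketch, assuming the project module is imgPro as implied by the import in the spider:

ITEM_PIPELINES = {
   'imgPro.pipelines.ImgproPipeline': 300,
}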

The most important topics: distributed crawling, depth crawling, and incremental crawling

Here I will only cover depth crawling.

Distributed crawling relies on Redis; incremental crawling means monitoring a site and fetching new data as soon as it appears (to put it simply).
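
As a rough illustration of the incremental idea (not code from the original post): a Redis set can remember which detail URLs have already been scraped, assuming a local Redis instance; the key name seen_urls is made up for illustration:

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

def is_new_url(detail_url):
    # sadd returns 1 if the URL was not in the set yet, 0 if it was already seen
    return conn.sadd('seen_urls', detail_url) == 1

# Inside a spider's parse method, only follow URLs we have not scraped before:
# if is_new_url(detail_url):
#     yield scrapy.Request(detail_url, callback=self.parse_detail)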

Feel free to contact me if you are interested.

Depth crawling with CrawlSpider and Rule:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunCrawlPro.items import SuncrawlproItem,Detail_item

class SunSpider(CrawlSpider):

    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']
    # Instantiate a link extractor object.
    # Purpose: extract links that match the given rule (allow='regular expression')
    link = LinkExtractor(allow=r'id=1&page=\d+')  # extracts the pagination links
    # Extracts the links of the detail pages
    link_detail = LinkExtractor(allow=r'politics/index\?id=\d+')

    rules = (
        # link is passed as the first argument of the Rule constructor.
        # Purpose: send requests for the extracted links and parse the responses
        # with the given callback.
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages reached
        # through the links it extracted
        Rule(link_detail, callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        # tbody tags must not appear in XPath expressions
        tr_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for tr in tr_list:
            link_detail = tr.xpath('./span[3]/a/@href').extract_first()
            link_detail = 'http://wz.sun0769.com/' + link_detail
            title = tr.xpath('./span[3]/a/text()').extract_first()
            num = tr.xpath('./span[1]/text()').extract_first()
            item = SuncrawlproItem()
            item['title'] = title
            item['num'] = num
            yield scrapy.Request(link_detail, callback=self.parse_detail, meta={'item': item})
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        # Drop blank lines from the complaint text
        content = ''.join([s for s in content.splitlines(True) if s.strip()])
        print(type(content), content)
        num = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        num = num.split(':')[-1]
        item = Detail_item()
        item['content'] = content
        item['num'] = num
        # print(item)
        yield item


items:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SuncrawlproItem(scrapy.Item):
    title = scrapy.Field()
    num = scrapy.Field()

class Detail_item(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()


pipelines:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class SuncrawlproPipeline(object):
    # def process_item(self, item, spider):
    #     if item.__class__.__name__ == 'Detail_item':
    #         content = item['content']
    #         num = item['num']
    #         print(item)
    #     else:
    #         title = item['title']
    #         num = item['num']
    #         print(item)
    #     return item

    fp1 = open('content.txt', 'w+', encoding='utf-8')
    fp2 = open('title.txt', 'w+', encoding='utf-8')
    # def open_spider(self, item):
    #     if item.__class__.__name__ == 'Detail_item':
    #         self.fp = open('content.txt', 'w', encoding='utf-8')
    #     else:
    #         self.fp = open('title.txt', 'w', encoding='utf-8')
    #     return item

    def process_item(self, item, spider):
        # item is the item object submitted by the spider;
        # its class name tells us which kind of item we received
        if item.__class__.__name__ == 'Detail_item':
            self.fp1.write(item['content'] + ':' + item['num'] + '\n')
        else:
            self.fp2.write(item['title'] + ':' + item['num'] + '\n')
        return item

    def close_spider(self, spider):
        self.fp1.close()
        self.fp2.close()


settings:

DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 10
#CONCURRENT_REQUESTS_PER_IP = 16
ITEM_PIPELINES = {
   'sunCrawlPro.pipelines.SuncrawlproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60

These are all settings meant to keep the crawl from going too fast; you can also rotate IPs in the middleware to crawl at scale. The scraped content turned out to be really funny...

 

Original post: https://www.cnblogs.com/zzj666/p/14747617.html