标签:scrapy ide 爬虫 awl www zoom r文件 ring models
如果要文件管道保存为原有的文件名 需要重写文件管道的方法
pipelines.py 文件
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.files import FilesPipeline
class OveridePipeline(FilesPipeline):
    """Files pipeline that stores each download under its original file name.

    The default ``FilesPipeline`` names files by a content hash; overriding
    ``file_path`` keeps the basename from the request URL instead.

    NOTE(review): the class name is misspelled ("Overide"), but it is
    referenced by dotted path in ITEM_PIPELINES, so it must stay unchanged.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path for *request*.

        Uses the last URL path segment as the file name and falls back to a
        ``.png`` suffix when that segment carries no extension.  The
        keyword-only ``item`` parameter matches the signature Scrapy passes
        in current releases and is ignored here.
        """
        file_name = request.url.split('/')[-1]
        # Pexels zoom URLs may have no extension; assume PNG in that case.
        if "." not in file_name:
            file_name = file_name + '.png'
        return "pexels/" + file_name
class ImagesPipeline(object):
    """Item pipeline that strips query strings from file URLs.

    Runs before the download pipeline (priority 1 in ITEM_PIPELINES) so
    that signed/cache-busting ``?...`` suffixes never reach the file
    name logic.
    """

    def process_item(self, item, spider):
        """Normalize ``item['file_urls']`` in place and pass the item on.

        :param item: scraped item; must expose a ``file_urls`` list.
        :param spider: the spider that produced the item (unused).
        :returns: the same item, with each URL truncated at the first ``?``.
        """
        # str.split("?")[0] is a no-op when there is no query string,
        # so one comprehension covers both branches of the original loop.
        item['file_urls'] = [url.split("?")[0] for url in item['file_urls']]
        print("下载图片:", item['file_urls'])
        return item
settings.py 配置
# Pipeline registration: lower number runs earlier, so the URL-cleaning
# ImagesPipeline (1) runs before the file-saving OveridePipeline (3).
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 2,
    # 'scrapy.pipelines.files.FilesPipeline': 3,
    'images.pipelines.OveridePipeline': 3,
    'images.pipelines.ImagesPipeline': 1,
}
# Root directory under which FilesPipeline stores downloaded files.
FILES_STORE = 'd:/crawl'
spider文件
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import ImagesItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.media import MediaPipeline
class PexSpider(CrawlSpider):
    """Crawl pexels.com photo pages and collect full-size image URLs."""

    name = 'pex'
    allowed_domains = ['www.pexels.com']
    start_urls = ['https://www.pexels.com/photo/vehicle-on-road-along-green-grass-during-night-714023/']

    # Follow every /photo/ link found on a page and parse it with parse_item.
    rules = (
        Rule(LinkExtractor(allow=r'/photo/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the zoomed photo URL(s) into a files-pipeline item.

        ``file_urls`` (rather than ``image_urls``) is populated because the
        project uses FilesPipeline subclasses — see ITEM_PIPELINES.
        """
        i = ImagesItem()
        i['file_urls'] = response.xpath(
            "//img[@class='image-section__image js-photo-zoom']/@src"
        ).extract()
        return i
items.py 文件
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ImagesItem(scrapy.Item):
    """Item holding file URLs for Scrapy's files pipeline."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # image_urls = scrapy.Field()
    # images = scrapy.Field()
    # URLs to download; consumed by FilesPipeline (standard field name).
    file_urls = scrapy.Field()
    # Download results populated by the pipeline after fetching file_urls.
    files = scrapy.Field()
标签:scrapy ide 爬虫 awl www zoom r文件 ring models
原文地址:https://www.cnblogs.com/php-linux/p/9695422.html