标签:tin member text 页面 highlight pipeline 回调函数 ngx www
爬取图片资源
spider文件
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
import time
from ..items import ZhuangxiuItem
class ZhuangxiuspiderSpider(CrawlSpider):
    """Crawl zhuangyi.com gallery pages and collect the image URLs they embed.

    Crawling starts at the first ``/xiaoguotu/keting/`` listing page. Every
    extracted link that contains a numbered ``.html`` page name is followed
    and its response is passed to :meth:`parse_item`.
    """

    name = 'zhuangxiuSpider'
    allowed_domains = ['www.zhuangyi.com']
    start_urls = ['http://www.zhuangyi.com/xiaoguotu/keting/p1/']

    # Picture URLs as they appear in the raw page source. The literal dots of
    # the host name and the file extension are escaped so they no longer act
    # as "any character" wildcards; the single-character path segment
    # (``/./``) is intentionally left as a wildcard.
    IMAGE_URL_RE = re.compile(
        r'http://pic\.zhuangyi\.com/Member/\d/\d+/./\d+\.jpg'
    )

    rules = (
        # ``callback`` names the method that receives each matching response.
        # Step 2 (disabled): follow the "next page" links of the listing.
        # Rule(LinkExtractor(allow=r'(.*?)/p\d+'), follow=True),
        # Step 3: detail pages. The pattern is searched within the URL, so no
        # leading ``(.*?)`` group is needed.
        Rule(
            LinkExtractor(allow=r'\d+\.html'),
            follow=True,
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Yield one item holding every image URL found in ``response``.

        Args:
            response: The downloaded detail page; only ``response.text`` is
                read.

        Yields:
            ZhuangxiuItem: ``image_urls`` is the list of matched picture URLs
            (possibly empty); ``title`` is the current Unix timestamp as a
            float, used here in place of a real page title.
        """
        img_url_list = self.IMAGE_URL_RE.findall(response.text)
        item = ZhuangxiuItem()
        item['image_urls'] = img_url_list
        # The page title is not scraped; the timestamp only tells items apart.
        item['title'] = time.time()
        yield item
items.py 中
import scrapy
class ZhuangxiuItem(scrapy.Item):
    """Container for the data scraped from one page."""

    # Label for the item (the spider in this file stores a timestamp here).
    title = scrapy.Field()
    # List of image URLs to download; ``image_urls`` is the field name that
    # Scrapy's ImagesPipeline reads by default.
    image_urls = scrapy.Field()
settings.py 中
# Headers attached to every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # NOTE(review): presumably required so the site serves its images to the
    # crawler (hotlink protection) - confirm against the target site.
    'Referer': 'http://www.zhuangyi.com/',
}

# Directory (relative to where the crawl is started) for downloaded images.
IMAGES_STORE = 'img'

# Enable Scrapy's built-in image pipeline; 300 is its order among pipelines.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
标签:tin member text 页面 highlight pipeline 回调函数 ngx www
原文地址:https://www.cnblogs.com/wangyue0925/p/11248709.html