Day 25 of class, checking in:
Stay passionate about the technology and take it seriously.
s1617day3

Review:
    Scrapy
        - create a project
        - create a spider
        - write the spider
            - class
                - start_urls = ['http://www.xxx.com']
                - def parse(self, response):

                      yield Item object
                      yield Request object

        - pipeline
            - process_item
            - from_crawler (a @classmethod)
            - open_spider
            - close_spider
            - configuration

        - Request object("url", callback)
        - run

    High-performance basics:
        - multithreading [IO-bound] and multiprocessing [CPU-bound]
        - squeeze as much as possible out of a single thread:
            one thread (Gevent), based on coroutines:
                - coroutines, greenlet
                - switch whenever IO is hit
            one thread (Twisted, Tornado), based on an event loop:
                - IO multiplexing
                - Socket, setblocking(False)


Today:
    - Scrapy
        - cookie handling
        - pipeline
        - middleware
        - extensions
        - custom commands
        - misc
        - scrapy-redis
    - Tornado and Flask
        - basic workflow


Details:
1. Scrapy

    - start_requests
        - iterable
        - generator

        internally the result is wrapped with iter(); see:
        from scrapy.crawler import Crawler
        Crawler.crawl

        def start_requests(self):
            for url in self.start_urls:
                yield Request(url=url, callback=self.parse)
                # return [Request(url=url, callback=self.parse), ]
    - cookie
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

    - pipeline
        - 5 methods
        - process_item
            - return item
            - raise DropItem()

    - dedup rules
        DUPEFILTER_CLASS = 'sp2.my_filter.MyDupeFilter'
        from scrapy.utils.request import request_fingerprint

        class MyDupeFilter(object):
            def __init__(self):
                self.visited = set()

            @classmethod
            def from_settings(cls, settings):
                return cls()

            def request_seen(self, request):
                fp = request_fingerprint(request)
                if fp in self.visited:
                    return True
                self.visited.add(fp)

            def open(self):  # can return deferred
                pass

            def close(self, reason):  # can return a deferred
                pass

            def log(self, request, spider):  # log that a request has been filtered
                pass

        from scrapy.utils.request import request_fingerprint
        from scrapy.http import Request


        obj1 = Request(url='http://www.baidu.com?a=1&b=2', headers={'Content-Type': 'application/text'}, callback=lambda x: x)
        obj2 = Request(url='http://www.baidu.com?b=2&a=1', headers={'Content-Type': 'application/json'}, callback=lambda x: x)

        v1 = request_fingerprint(obj1, include_headers=['Content-Type'])
        print(v1)

        v2 = request_fingerprint(obj2, include_headers=['Content-Type'])
        print(v2)

    - custom commands
        - a package directory containing
            xx.py
                class Foo(ScrapyCommand)
                    run method

        - settings
            COMMANDS_MODULE = "sp2.<directory>"

        - scrapy xx

    - download middleware
        - __init__
        - from_crawler
        - process_request
            - None
            - response
            - request
        - process_response
        - process_exception

        Use cases:
            - customizing request headers (proxies)
            - HTTPS

        Note:
            default proxy handling: from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
            two ways to set a proxy:
            - environment variables
                os.environ['xxxxxxxxxxx_proxy']
                set them before the program starts:
                    import os
                    os.environ['xxxxxxxxxxx_proxy'] = "sdfsdfsdfsdfsdf"
            - middleware
                ...

    - spider middleware
        class SpiderMiddleware(object):

            def __init__(self):
                pass

            @classmethod
            def from_crawler(cls, crawler):
                return cls()

            def process_spider_input(self, response, spider):
                """
                called once the download has finished, before the response is handed to parse
                :param response:
                :param spider:
                :return:
                """
                pass

            def process_spider_output(self, response, result, spider):
                """
                called when the spider has finished processing and returns its results
                :param response:
                :param result:
                :param spider:
                :return: must return an iterable of Request and/or Item objects
                """
                return result

            def process_spider_exception(self, response, exception, spider):
                """
                called on exceptions
                :param response:
                :param exception:
                :param spider:
                :return: None to let the following middlewares keep handling the exception;
                         an iterable of Response/Item objects to hand them to the scheduler or the pipelines
                """
                return None

            def process_start_requests(self, start_requests, spider):
                """
                called when the spider starts
                :param start_requests:
                :param spider:
                :return: an iterable of Request objects
                """
                return start_requests
                # return [Request(url='http://www.baidu.com'), ]

    - custom extensions
        from scrapy import signals


        class MyExtension(object):
            def __init__(self):
                pass

            @classmethod
            def from_crawler(cls, crawler):
                obj = cls()

                crawler.signals.connect(obj.xxxxxx, signal=signals.engine_started)
                crawler.signals.connect(obj.rrrrr, signal=signals.spider_closed)

                return obj

            def xxxxxx(self, spider):
                print('open')

            def rrrrr(self, spider):
                print('close')


        EXTENSIONS = {
            'sp2.extend.MyExtension': 500,
        }


    - HTTPS certificates, custom certificates
        defaults:
            DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
            DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"

        custom:
            DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
            DOWNLOADER_CLIENTCONTEXTFACTORY = "sp2.https.MySSLFactory"


            from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
            from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)


            class MySSLFactory(ScrapyClientContextFactory):
                def getCertificateOptions(self):
                    from OpenSSL import crypto
                    v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
                    v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
                    return CertificateOptions(
                        privateKey=v1,   # pKey object
                        certificate=v2,  # X509 object
                        verify=False,
                        method=getattr(self, 'method', getattr(self, '_ssl_method', None))
                    )


    - other settings

        reference: http://www.cnblogs.com/wupeiqi/articles/6229292.html

2. scrapy-redis
    pip3 install scrapy-redis
    Scenario: 10 spiders sharing one crawl.
    Component: scrapy-redis moves the dedup rules and the scheduler into redis.
    Flow: connect to redis; when the scheduler enqueues a request, it calls the dedup filter's request_seen method.

    # redis connection
    # REDIS_HOST = 'localhost'                              # host
    # REDIS_PORT = 6379                                     # port
    REDIS_URL = 'redis://user:pass@hostname:9001'           # connection URL (takes precedence over the settings above)
    # REDIS_PARAMS = {}                                     # redis connection params; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
    # REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'   # redis client class; default: redis.StrictRedis
    # REDIS_ENCODING = "utf-8"                              # redis encoding; default: 'utf-8'

    # dedup rules (a redis set)
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

    # scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"

    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # default queue; alternatives: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
    SCHEDULER_QUEUE_KEY = '%(spider)s:requests'                 # redis key holding the scheduled requests
    SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"          # serializer for the data stored in redis; pickle by default
    SCHEDULER_PERSIST = True                                    # keep the scheduler queue and dedup records on close (True = keep, False = flush)
    SCHEDULER_FLUSH_ON_START = True                             # flush the scheduler queue and dedup records on start (True = flush, False = keep)
    SCHEDULER_IDLE_BEFORE_CLOSE = 10                            # max seconds to wait when fetching from an empty scheduler queue
    SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'          # redis key holding the dedup fingerprints

    REDIS_START_URLS_AS_SET = False
    REDIS_START_URLS_KEY = '%(name)s:start_urls'


    Option 1: keep the connection / dedup / scheduler settings above and use a plain scrapy.Spider:

        class ChoutiSpider(scrapy.Spider):
            name = 'chouti'
            allowed_domains = ['chouti.com']
            cookies = None
            cookie_dict = {}
            start_urls = ['http://dig.chouti.com/', ]

            def index(self, response):
                print('spider result', response, response.url)

    Option 2: read the start URLs from redis:

        REDIS_START_URLS_AS_SET = False
        REDIS_START_URLS_KEY = '%(name)s:start_urls'


        from scrapy_redis.spiders import RedisSpider

        class ChoutiSpider(RedisSpider):
            name = 'chouti'
            allowed_domains = ['chouti.com']

            def index(self, response):
                print('spider result', response, response.url)


    ********************* basic usage *********************
    write a spider class that inherits from the scrapy_redis spider


    reference: http://www.cnblogs.com/wupeiqi/articles/6912807.html


3. Flask web framework
    - pip3 install flask
    - a web framework provides:
        - routing
        - views
        - template rendering

    - flask has no socket layer of its own; it relies on a module that implements the WSGI protocol: werkzeug
    - two ways to register a URL:
        option 1:
            @app.route('/xxxxxxx')
            def hello_world():
                return 'Hello World!'
        option 2:
            def index():
                return "Index"

            app.add_url_rule('/index', view_func=index)
    - routing:
        - static
            @app.route('/x1/')
            def hello_world():
                return 'Hello World!'

        - dynamic
            @app.route('/user/<username>')
            @app.route('/post/<int:post_id>')
            @app.route('/post/<float:post_id>')
            @app.route('/post/<path:path>')
            @app.route('/login', methods=['GET', 'POST'])

            @app.route('/xx/<int:nid>')
            def hello_world(nid):
                return 'Hello World!' + str(nid)

        - custom regex converters
            @app.route('/index/<regex("\d+"):nid>')
            def index(nid):
                return 'Index'

    - views

    - templates

    - messages (flash)

    - middleware

    - session
        - default: signed cookie
        - third party: Flask-Session
            redis:      RedisSessionInterface
            memcached:  MemcachedSessionInterface
            filesystem: FileSystemSessionInterface
            mongodb:    MongoDBSessionInterface
            sqlalchemy: SqlAlchemySessionInterface

    - blueprints (a way of organising the project into folders)

    - third-party components:
        - session: Flask-Session
        - form validation: WTForms
        - ORM: SQLAlchemy
    reference: http://www.cnblogs.com/wupeiqi/articles/7552008.html

4. Tornado
    - pip3 install tornado

    reference: http://www.cnblogs.com/wupeiqi/articles/5702910.html
################
# 2017-10-29 - class notes
################
day25
Review:
- scrapy
    - create a project
    - create a spider
    - write the spider:
        - class
            - start_urls = ['xx']
            - def parse(self, response):
                  yield Item object    ---> goes to the Pipeline for persistence
                  yield Request object ---> goes back to the scheduler, recursion...
        # object-oriented encapsulation
        - at startup the engine calls the parent class's start_requests() method; you can override it:
              def start_requests(self):
                  for url in self.start_urls:
                      yield Request(url=url, callback=self.parse)
              or:
                  return [Request(url=url, callback=self.parse), ]
              """it must return either an iterable or a generator!!!"""
    - Pipeline
        - process_item
        - open_spider
        - close_spider
        ...
    - Request object("url", callback)
    - run
- high-performance basics
    - multithreading [for IO-bound work] and multiprocessing [for CPU-bound work]
    - the GIL is per process: only one thread of a process executes Python code at a time, even on a multi-core CPU
    - squeeze as much as possible out of a single thread:
        - one thread, based on coroutines:
            - coroutines, greenlet
            - switch whenever IO is hit
            - gevent is the classic example (see the sketch below)
        - one thread, based on an event loop:
            - IO multiplexing
            - Socket, setblocking(False)
            - examples: Twisted, Tornado, ...
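A minimal sketch of the coroutine idea above, using gevent's monkey patching so that blocking socket IO switches between greenlets (the URLs are just placeholders):
"""
from gevent import monkey
monkey.patch_all()          # patch socket & friends so IO yields to other greenlets

import gevent
import requests

def fetch(url):
    # each greenlet runs this; while waiting on the network it yields to the others
    r = requests.get(url)
    print(url, len(r.text))

urls = ['http://www.baidu.com', 'http://dig.chouti.com/']
gevent.joinall([gevent.spawn(fetch, u) for u in urls])
"""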
Today:
- scrapy
    - cookie handling
    - Pipeline
    - middleware
    - extensions (signals ...)
    - custom commands
    - other settings
    - extension: scrapy-redis
- Tornado and Flask basics (lightweight frameworks)
    - learn the basic workflow and conventions
Details:
- Scrapy
    - create a project:
        D:\soft\work\Python_17\day25>scrapy startproject day25spider
        New Scrapy project 'day25spider', using template directory 'd:\\soft\\work\\python35\\lib\\site-packages\\scrapy\\templates\\project', created in:
            D:\soft\work\Python_17\day25\day25spider
        You can start your first spider with:
            cd day25spider
            scrapy genspider example example.com
        D:\soft\work\Python_17\day25>cd day25spider
        D:\soft\work\Python_17\day25\day25spider>scrapy genspider chouti chouti.com
        Created spider 'chouti' using template 'basic' in module:
            day25spider.spiders.chouti
        D:\soft\work\Python_17\day25\day25spider>
    - run it
        - scrapy crawl chouti
        - scrapy crawl chouti --nolog
    # object-oriented encapsulation
    - at startup the engine calls the parent class's start_requests() method; you can override it:
          def start_requests(self):
              for url in self.start_urls:
                  yield Request(url=url, callback=self.parse)
          or:
              return [Request(url=url, callback=self.parse), ]
          """it must return either an iterable or a generator!!!"""
    - that works because the result is wrapped with iter() internally; see:
        - from scrapy.crawler import Crawler
        - its crawl() method:
              def crawl(self, *args, **kwargs):
                  assert not self.crawling, "Crawling already taking place"
                  self.crawling = True
                  try:
                      self.spider = self._create_spider(*args, **kwargs)
                      self.engine = self._create_engine()
                      start_requests = iter(self.spider.start_requests())
                      yield self.engine.open_spider(self.spider, start_requests)
                      ...
    - the engine puts the URLs into the scheduler
    - the downloader takes URLs from the scheduler, downloads them, and then runs the callback()
    - selectors
          hxs = HtmlXPathSelector(response)
          hxs.xpath('//div[@id="i1"]')
          hxs.xpath('//div[@id="i1"]/text()')
          hxs.xpath('//div[@id="i1"]/@href')
          hxs.xpath('//div[@id="i1"]/@href').extract()
          hxs.xpath('//div[@id="i1"]/@href').extract_first()
          for url in hxs.xpath('//div[@id="i1"]/@href').extract():
              yield Item(name=url)
    - cookie
        - log in to chouti and upvote automatically (see the sketch below)
          from scrapy.http.cookies import CookieJar
          cookie_jar = CookieJar()
          cookie_jar.extract_cookies(response, response.request)
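A minimal sketch of pulling the cookies out of a response and reusing them on later requests; the login URL, form body and callback names are placeholders, not the exact chouti API, and _cookies is an internal CookieJar attribute:
"""
import scrapy
from scrapy.http import Request
from scrapy.http.cookies import CookieJar


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    start_urls = ['http://dig.chouti.com/']
    cookie_dict = {}

    def parse(self, response):
        # copy the cookies Scrapy received into a plain dict
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for domain, paths in cookie_jar._cookies.items():
            for path, names in paths.items():
                for name, cookie in names.items():
                    self.cookie_dict[name] = cookie.value

        # reuse them on the next request (login URL/body are placeholders)
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            body='phone=86131xxxxxxxx&password=xxx&oneMonth=1',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            cookies=self.cookie_dict,
            callback=self.check_login,
        )

    def check_login(self, response):
        print(response.text)
"""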
    - Pipeline
        - up to 5 methods can be written (a fuller sketch follows below)
        - process_item()
            - return item
            - raise DropItem()
            - with multiple pipelines, skip the remaining pipelines for an item:
                  from scrapy.exceptions import DropItem
                  if ENV == 'debug':
                      raise DropItem()
                  else:
                      return item
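A minimal sketch of a pipeline using the five commonly used methods; the class name, file path and the DAY25_FILE_PATH setting are assumptions:
"""
from scrapy.exceptions import DropItem


class Day25Pipeline(object):

    def __init__(self, path):
        self.path = path
        self.f = None

    @classmethod
    def from_crawler(cls, crawler):
        # called once when the crawler is built; read config from settings
        path = crawler.settings.get('DAY25_FILE_PATH', 'items.txt')   # hypothetical setting
        return cls(path)

    def open_spider(self, spider):
        # called when the spider is opened
        self.f = open(self.path, 'a', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(str(dict(item)) + '\n')
        return item            # hand the item on to the next pipeline
        # raise DropItem()     # or: stop the remaining pipelines for this item

    def close_spider(self, spider):
        # called when the spider is closed
        self.f.close()


# settings.py
# ITEM_PIPELINES = {'day25spider.pipelines.Day25Pipeline': 300}
"""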
    - URL dedup rules (backed by a set)
        """
        dedup is on by default:
        def start_requests(self):
            for url in self.start_urls:
                yield Request(url=url, callback=self.parse, dont_filter=False)
        """
        - register the filter class in settings.py:
              DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'   # the default dedup rule
              DUPEFILTER_CLASS = 'xxx.yyy.'                           # a custom filter class
        - write your own, modelled on RFPDupeFilter
            - the URL is reduced to an md5-like fingerprint (request_fingerprint...)
"""
class RepeatUrl:
def __init__(self):
self.visited_url = set()
@classmethod
def from_settings(cls, settings):
"""
初始化时,调用
:param settings:
:return:
"""
return cls()
def request_seen(self, request):
"""
检测当前请求是否已经被访问过
:param request:
:return: True表示已经访问过;False表示未访问过
"""
if request.url in self.visited_url:
return True
self.visited_url.add(request.url)
return False
def open(self):
"""
开始爬去请求时,调用
:return:
"""
print(‘open replication‘)
def close(self, reason):
"""
结束爬虫爬取时,调用
:param reason:
:return:
"""
print(‘close replication‘)
def log(self, request, spider):
"""
记录日志
:param request:
:param spider:
:return:
"""
print(‘repeat‘, request.url)
"""
    - custom commands
        - create a commands package next to the spiders directory
        - add crawlall.py in it:
              from scrapy.commands import ScrapyCommand

              class Foo(ScrapyCommand):
                  ...
                  def run(self, args, opts):
                      ...
        - register the commands package in settings.py (see the sketch below)
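A minimal sketch of a crawlall command that runs every spider in the project; the module path and command name are assumptions:
"""
# day25spider/commands/crawlall.py   (plus an empty __init__.py in the commands package)
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):
    requires_project = True

    def short_desc(self):
        return 'Run all spiders in the project'

    def run(self, args, opts):
        # schedule every registered spider, then start the reactor once
        for name in self.crawler_process.spider_loader.list():
            self.crawler_process.crawl(name)
        self.crawler_process.start()


# settings.py
# COMMANDS_MODULE = 'day25spider.commands'
# then run:  scrapy crawlall
"""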
    - an iterable has an __iter__() method that returns an iterator
    - an iterator has a __next__() method and yields values one by one
    - a generator is a function containing the yield keyword
        - its __iter__() returns the generator itself
        - it also has __next__()
    - download middleware
        - methods:
            - process_request()
            - process_response()
            - process_exception()
        - process_request() use cases
            - plugging in a custom download module
            - setting shared request headers / cookies, so the same code is not repeated on every request
            - setting proxies
                - os.environ
                - the default proxy handling:
                      from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
                      from urllib.request import getproxies
            - HTTPS
        - things to note about proxies:
            - the default middleware reads proxies from environment variables
                  os.environ['xxxxxx_proxy']
                  set os.environ['xxx_proxy'] = "xxxxxxxx" before the program starts (see the sketch below)
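A minimal sketch of the environment-variable approach; HttpProxyMiddleware picks proxies up via urllib's getproxies(), which reads variables such as http_proxy / https_proxy (the addresses below are placeholders):
"""
import os

# must run before the crawl starts, e.g. at the top of the spider module or in a start script
os.environ['http_proxy'] = 'http://user:pass@192.168.1.10:8888/'    # placeholder proxy
os.environ['https_proxy'] = 'http://192.168.1.10:8888/'             # placeholder proxy
"""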
    - a custom proxy middleware
          """
          import base64
          import random
          import six

          def to_bytes(text, encoding=None, errors='strict'):
              if isinstance(text, bytes):
                  return text
              if not isinstance(text, six.string_types):
                  raise TypeError('to_bytes must receive a unicode, str or bytes '
                                  'object, got %s' % type(text).__name__)
              if encoding is None:
                  encoding = 'utf-8'
              return text.encode(encoding, errors)

          class ProxyMiddleware(object):
              def process_request(self, request, spider):
                  PROXIES = [
                      {'ip_port': '111.11.228.75:80', 'user_pass': ''},
                      {'ip_port': '120.198.243.22:80', 'user_pass': ''},
                      {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
                      {'ip_port': '101.71.27.120:80', 'user_pass': ''},
                      {'ip_port': '122.96.59.104:80', 'user_pass': ''},
                      {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
                  ]
                  proxy = random.choice(PROXIES)
                  if proxy['user_pass']:
                      # proxy that needs authentication
                      request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
                      encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
                      request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
                      print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
                  else:
                      print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
                      request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
          """
    - a custom download middleware
          """
          class DownMiddleware1(object):
              def process_request(self, request, spider):
                  """
                  called for every request that needs to be downloaded, through all download middlewares
                  :param request:
                  :param spider:
                  :return:
                      None: continue to the following middlewares and the downloader
                      Response object: stop calling process_request and start calling process_response
                      Request object: stop the middleware chain and put the Request back into the scheduler
                      raise IgnoreRequest: stop calling process_request and start calling process_exception
                  """
                  pass

              def process_response(self, request, response, spider):
                  """
                  called on the way back, once the response has been downloaded
                  :param request:
                  :param response:
                  :param spider:
                  :return:
                      Response object: handed on to the other middlewares' process_response
                      Request object: stop the middleware chain, the request is rescheduled for download
                      raise IgnoreRequest: Request.errback is called
                  """
                  print('response1')
                  return response

              def process_exception(self, request, exception, spider):
                  """
                  called when the download handler or a download middleware's process_request() raises an exception
                  :param request:
                  :param exception:
                  :param spider:
                  :return:
                      None: let the following middlewares keep handling the exception
                      Response object: stop calling the remaining process_exception methods
                      Request object: stop the middleware chain, the request is rescheduled for download
                  """
                  return None
          """
    - spider middleware
    - extensions
        - scrapy already has the hooks (signals) in place
        - from scrapy import signals
              """
              engine_started = object()
              engine_stopped = object()
              spider_opened = object()
              spider_idle = object()
              spider_closed = object()
              spider_error = object()
              request_scheduled = object()
              request_dropped = object()
              response_received = object()
              response_downloaded = object()
              item_scraped = object()
              item_dropped = object()
              """
              """
              from scrapy import signals

              class MyExtension(object):
                  def __init__(self, value):
                      self.value = value

                  @classmethod
                  def from_crawler(cls, crawler):
                      val = crawler.settings.getint('MMMM')
                      ext = cls(val)
                      crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
                      crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
                      return ext

                  def spider_opened(self, spider):
                      print('open')

                  def spider_closed(self, spider):
                      print('close')
              """
    - HTTPS certificates
        - defaults
              DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
              DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
        - a custom HTTPS client certificate
              DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
              DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
              """
              from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
              from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

              class MySSLFactory(ScrapyClientContextFactory):
                  def getCertificateOptions(self):
                      from OpenSSL import crypto
                      v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
                      v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
                      return CertificateOptions(
                          privateKey=v1,   # pKey object
                          certificate=v2,  # X509 object
                          verify=False,
                          method=getattr(self, 'method', getattr(self, '_ssl_method', None))
                      )
              """
    - ways to set request headers:
        - pass headers={...} on every yield Request(...)
        - set them per case in a custom download middleware
        - or simply define one shared set of headers in settings.py (see the sketch below)
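For the settings.py approach, Scrapy's USER_AGENT and DEFAULT_REQUEST_HEADERS settings apply to every request; the values below are placeholders:
      # settings.py
      USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'   # placeholder UA
      DEFAULT_REQUEST_HEADERS = {
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'en',
      }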
    - with multiple spiders in one project:
        - where do the dedup rules live?
        - where is the depth-first / breadth-first policy configured?
          in the scheduler (see the sketch below)
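A sketch of the scheduler-related knobs; these are standard Scrapy settings, the values are only examples:
      # settings.py
      DEPTH_LIMIT = 3        # max crawl depth (0 = unlimited)
      DEPTH_PRIORITY = 1     # > 0 leans towards breadth-first, < 0 towards depth-first
      # with DEPTH_PRIORITY > 0, FIFO queues are usually configured as well for true BFS:
      SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
      SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'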
    - other settings: http://www.cnblogs.com/wupeiqi/articles/6229292.html
    - distributed crawling: the scrapy-redis component
        - storage
        - scheduler
        - dedup rules
            # it is the scheduler that triggers the dedup check
        - flow
            - connect to redis
            - when the scheduler enqueues a request, it calls the dupefilter's request_seen() method
        - scrapy-redis reference: http://www.cnblogs.com/wupeiqi/articles/6912807.html
          (a sketch of seeding the start URLs follows below)
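When a spider inherits from RedisSpider, its start URLs are read from redis rather than from start_urls; a minimal sketch of seeding them, assuming the default REDIS_START_URLS_KEY of '%(name)s:start_urls' and a spider named chouti:
"""
import redis

r = redis.StrictRedis(host='localhost', port=6379)
# REDIS_START_URLS_AS_SET = False  ->  the key is a list, so use lpush
r.lpush('chouti:start_urls', 'http://dig.chouti.com/')

# equivalent from the shell:
#   redis-cli lpush chouti:start_urls http://dig.chouti.com/
"""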
    - Django does not implement its own socket layer either; it speaks the WSGI protocol (wsgiref is the reference implementation)
    - Flask web framework
      reference: http://www.cnblogs.com/wupeiqi/articles/7552008.html
        - a web framework provides
            - routing
            - views
            - template rendering
        - Flask has no socket layer of its own either; it relies on a module that implements WSGI: Werkzeug
              """
              from flask import Flask
              app = Flask(__name__)

              @app.route('/')
              def hello_world():
                  return 'Hello World!'

              if __name__ == '__main__':
                  app.run()
              """
        - @app.route('/') returns a decorator function func
        - which is then executed as func(hello_world)
        - two ways to register a URL:
            - option 1
                  @app.route('/')
                  def hello_world():
                      return 'Hello World!'
            - option 2
                  def index():
                      return 'Hello World!'

                  app.add_url_rule('/index', view_func=index)
        - routing
            - static
                  @app.route('/xxx/')
                  def hello_world():
                      return 'Hello World!'
            - dynamic
                  """
                  Flask only supports the following dynamic URL converters out of the box:
                  @app.route('/user/<username>')
                  @app.route('/post/<int:post_id>')
                  @app.route('/post/<float:post_id>')
                  @app.route('/post/<path:path>')
                  @app.route('/login', methods=['GET', 'POST'])
                  """
                  @app.route('/xxx/<int:uid>')
                  def hello_world(uid):
                      return 'Hello World!' + str(uid)
            - Flask has no built-in regex matching; you register a custom converter yourself (see the sketch below)
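A minimal sketch of such a custom regex converter, registered on the Werkzeug url_map; the converter name "regex" is a choice here, not a Flask built-in:
"""
from flask import Flask
from werkzeug.routing import BaseConverter


class RegexConverter(BaseConverter):
    # use the regex given in the route as the matching rule
    def __init__(self, url_map, regex):
        super(RegexConverter, self).__init__(url_map)
        self.regex = regex


app = Flask(__name__)
app.url_map.converters['regex'] = RegexConverter


@app.route('/index/<regex("\\d+"):nid>')
def index(nid):
    return 'Index ' + nid


if __name__ == '__main__':
    app.run()
"""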
        - reverse URL generation:
              url_for('index')

              @app.route('/index.htm')
              def index():
                  return "Home page"
        - session: by default the session data is serialized, signed and stored in the browser cookie; the server keeps nothing
        - third-party plugin: Flask-Session (see the sketch below)
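A minimal sketch of swapping the cookie session for a redis-backed one with Flask-Session; host/port and the stored value are placeholders:
"""
import redis
from flask import Flask, session
from flask_session import Session

app = Flask(__name__)
app.config['SESSION_TYPE'] = 'redis'
app.config['SESSION_REDIS'] = redis.Redis(host='localhost', port=6379)
Session(app)


@app.route('/set/')
def set_value():
    session['user'] = 'alex'    # stored in redis; only the session id goes into the cookie
    return 'ok'


if __name__ == '__main__':
    app.run()
"""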
        - calling obj() invokes the class's __call__() method
        - blueprints: a way of splitting the app across folders
""" important """
    - how many HTTP methods are there, and what are they?
      see the source of Django's CBV (listed in the sketch below)
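For reference, Django's class-based View lists them in http_method_names (eight methods, Django 1.11-era source):
"""
# django/views/generic/base.py
class View(object):
    http_method_names = ['get', 'post', 'put', 'patch', 'delete', 'head', 'options', 'trace']
"""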