The goal is to give every link its own independent cache: if a cached copy already exists, read it directly; if not, fetch the page, serialize it, and save it locally.
The functionality is fairly simple for now and may be improved later.
#!/usr/bin/env python
#coding:utf-8
#Created by Andy @ 2017/6/28
import os
import hashlib
import urllib.request
import random
import time
import gzip
import pickle
# A simple anti-anti-crawling measure: pick a random header for each request
headers = [{'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36'},
           {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'},
           {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'}]
def cache_html(url, header):
    # Hash the URL and use the digest as the file name, so each link gets its own cache file
    md = hashlib.md5()
    md.update(url.encode(encoding='utf8'))
    file_name = md.hexdigest()
    # If the cache file already exists, read it directly
    path = os.path.join(base_path, file_name)
    if os.path.exists(path) and os.path.getsize(path):
        print("Cache file already exists!")
        with open(path, 'rb') as read_f:
            html = pickle.load(read_f)
        try:
            html = gzip.decompress(html).decode('utf8')  # some sites return gzip-compressed data
        except OSError:
            html = html.decode('utf8')
    else:
        req = urllib.request.Request(url, headers=header)
        html = urllib.request.urlopen(req).read()
        if not html:
            print("Connection failed...")
        else:
            time.sleep(random.randint(1, 3))  # pause briefly between requests
            with open(path, 'wb') as write_f:  # write to the cache path, not the current directory
                pickle.dump(html, write_f)
            try:
                html = gzip.decompress(html).decode('utf8')
            except OSError:
                html = html.decode('utf8')
    return html
if __name__ == '__main__':
    header = random.choice(headers)
    base_path = os.path.dirname(os.path.abspath(__file__))
    url = 'http://www.python.org'
    html = cache_html(url, header)
    print(html)
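As a quick illustration of the per-URL cache-file naming used above, the following minimal standalone sketch (the sample URL is only an example) shows how a URL maps to its cache filename; the same URL always hashes to the same name, which is what lets a later call find the existing cache file:

import hashlib

url = 'http://www.python.org'
cache_name = hashlib.md5(url.encode('utf8')).hexdigest()
print(cache_name)  # a 32-character hex string; identical URLs always produce the same file name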
Original article: http://www.cnblogs.com/Andy963/p/7103352.html