requests
re
os
COMPILE = re.compile()
URL = ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
CASE_INSENSE = []
():
(urlcodereasoncontent):
(MavenException).()
.url = url
.code = code
.reason = reason
.content = content
(url=URL):
result = requests.get(url)
code = result.status_code
code != requests.codes.ok:
MavenException(urlcoderesult.reasonresult.content)
COMPILE.findall(result.content)
():
(url):
result = {}
disk_path = url.rsplit()[]
is_dir = disk_path.endswith()
result[] = disk_path.split()[].replace(os.sep)
result[] = is_dir
result
(disk_path):
os.mkdir(ROOT_DIR + disk_path)
(urldisk_path):
result = requests.get(url)
(ROOT_DIR + disk_path) f:
f.write(result.content)
(url):
resource = handle_resource_type(url)
urlresource
resource[]:
down(urlresource[])
:
:
make_dir(resource[])
e:
e.winerror == :
CASE_INSENSE.append(resource[])
make_dir(resource[].rstrip()++((CASE_INSENSE))+)
:
e
urls = get_urls(url)
urls
[u u urls handle_resource_type(u)[]]:
item urls:
parse_url(item)
:
item urls:
parse_url(item)
__name__ == :
url get_urls():
parse_url(url)

下一步目标:使用线程池,单线程太慢了。
从 17:00 到第二天 1:00 只爬了约 1/3,中间还崩溃过。
还需要加入日志和容错处理。
参考(使用 Python 的上层封装并发库 concurrent.futures 实现异步):http://xiaorui.cc/2014/11/15/%E4%BD%BF%E7%94%A8python%E7%9A%84%E4%B8%8A%E5%B1%82%E5%B0%81%E8%A3%85%E5%B9%B6%E5%8F%91%E5%BA%93concurrent-futures%E5%AE%9E%E7%8E%B0%E5%BC%82%E6%AD%A5/
原文地址:http://3732370.blog.51cto.com/3722370/1918704