Why requests
python的标准库urllib2提供了大部分需要的HTTP功能,但是API太逆天了,一个简单的功能就需要一大堆代码。
Requests 使用的是 urllib3,因此继承了它的所有特性。Requests 支持 HTTP 连接保持和连接池,支持使用 cookie 保持会话,支持文件上传,支持自动确定响应内容的编码,支持国际化的 URL 和 POST 数据自动编码。现代、国际化、人性化。。
官方文档:http://docs.python-requests.org/en/master/
中文文档:http://docs.python-requests.org/zh_CN/latest/user/quickstart.html
安装:
pip install requests
或者下载源码后安装
$ git clone git://github.com/kennethreitz/requests.git $ cd requests $ python setup.py install
也可以通过IDE安装比如pycharm(File-settings-Projecr Interpreter 点击右边的加号"+"然后搜索requests点击Install Package安装)
爬取校花网视频:
import requests
import re
import os
import hashlib
import time
DOWLOAD_PATH = r‘D:\Downloads‘
def get_page(url):
try:
response = requests.get(url, )
if response.status_code == 200:
return response.text
except Exception:
pass
def parse_index(index_contents):
detail_urls = re.findall(‘class="items".*?href="(.*?)"‘, index_contents, re.S)
for detail_url in detail_urls:
if not detail_url.startswith(‘http‘):
detail_url = ‘http://www.xiaohuar.com‘ + detail_url
yield detail_url
def parse_detail(detail_contents):
movie_urls = re.findall(‘id="media".*?src="(.*?)"‘, detail_contents, re.S)
if movie_urls:
movie_url = movie_urls[0]
if movie_url.endswith(‘mp4‘):
yield movie_url
def download(movie_url):
print(movie_url)
try:
response = requests.get(movie_url,
)
if response.status_code == 200:
data = response.content
m = hashlib.md5()
m.update(str(time.time()).encode(‘utf-8‘))
m.update(movie_url.encode(‘utf-8‘))
filepath = os.path.join(DOWLOAD_PATH, ‘%s.mp4‘ % m.hexdigest())
with open(filepath, ‘wb‘) as f:
f.write(data)
f.flush()
print(‘下载成功‘, movie_url)
except Exception:
pass
def main():
raw_url = ‘http://www.xiaohuar.com/list-3-{page_num}.html‘
for i in range(5):
index_url = raw_url.format(page_num=i)
index_contents = get_page(index_url)
detail_urls = parse_index(index_contents)
for detail_url in detail_urls:
detail_contents = get_page(detail_url)
movie_urls = parse_detail(detail_contents)
for movie_url in movie_urls:
download(movie_url)
if __name__ == ‘__main__‘:
main()
注:D盘要创建一个Downloads文件夹

线程池版
import requests
import re
import os
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor
pool = ThreadPoolExecutor(50)
DOWLOAD_PATH = r‘D:\Downloads‘
def get_page(url):
try:
response = requests.get(url, )
if response.status_code == 200:
return response.text
except Exception:
pass
def parse_index(index_contents):
index_contents = index_contents.result()
detail_urls = re.findall(‘class="items".*?href="(.*?)"‘, index_contents, re.S)
for detail_url in detail_urls:
if not detail_url.startswith(‘http‘):
detail_url = ‘http://www.xiaohuar.com‘ + detail_url
pool.submit(get_page, detail_url).add_done_callback(parse_detail)
def parse_detail(detail_contents):
detail_contents = detail_contents.result()
movie_urls = re.findall(‘id="media".*?src="(.*?)"‘, detail_contents, re.S)
if movie_urls:
movie_url = movie_urls[0]
if movie_url.endswith(‘mp4‘):
pool.submit(download, movie_url)
def download(movie_url):
try:
response = requests.get(movie_url,
)
if response.status_code == 200:
data = response.content
m = hashlib.md5()
m.update(str(time.time()).encode(‘utf-8‘))
m.update(movie_url.encode(‘utf-8‘))
filepath = os.path.join(DOWLOAD_PATH, ‘%s.mp4‘ % m.hexdigest())
with open(filepath, ‘wb‘) as f:
f.write(data)
f.flush()
print(‘下载成功‘, movie_url)
except Exception:
pass
def main():
raw_url = ‘http://www.xiaohuar.com/list-3-{page_num}.html‘
for i in range(5):
index_url = raw_url.format(page_num=i)
pool.submit(get_page, index_url).add_done_callback(parse_index)
if __name__ == ‘__main__‘:
main()

参考博客:
http://www.zhidaow.com/post/python-requests-install-and-brief-introduction
http://blog.csdn.net/shanzhizi/article/details/50903748