
python3 urllib module



Python 3.0 merged urllib2, urlparse, and robotparser into urllib and reworked the urllib module itself; it now contains five submodules, the same five names you see in help().

What was the urllib module in Python 2 became, in Python 3:

20.5. urllib.request — Extensible library for opening URLs
20.6. urllib.response — Response classes used by urllib
20.7. urllib.parse — Parse URLs into components
20.8. urllib.error — Exception classes raised by urllib.request
20.9. urllib.robotparser — Parser for robots.txt

Of these, the commonly used urllib.urlopen() is now urllib.request.urlopen(); for the other renamed functions, consult the Python 3 documentation.

Internet Protocols and Support section of the Python 3 documentation: http://docs.python.org/py3k/library/internet.html
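As a minimal sketch of the renaming (www.example.com is only a placeholder URL):

import urllib.request
import urllib.parse

# Python 2: urllib.urlopen(...)  ->  Python 3: urllib.request.urlopen(...)
response = urllib.request.urlopen('http://www.example.com/')
html = response.read().decode('utf-8')

# Python 2: urlparse.urlsplit(...)  ->  Python 3: urllib.parse.urlsplit(...)
parts = urllib.parse.urlsplit('http://www.example.com/a/b?q=1')
print(parts.netloc, parts.path, parts.query)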

Libraries used in Python 2:

urllib     http://docs.python.org/library/urllib.html (downloading)

urllib2    http://docs.python.org/library/urllib2.html (fetching pages)

urlparse   http://docs.python.org/library/urlparse.html (URL splitting)

sgmllib    http://docs.python.org/library/sgmllib.html (HTML parsing)

# Recursively crawl all images under a site with Python urllib2
#!/usr/bin/python
# -*- coding:utf-8 -*-
# author: wklken  2012-03-17  wklken@yeah.net
# 1. URL parsing  2. image downloading  3. refactoring  4. multithreading (not yet added)

import os, sys, urllib, urllib2, urlparse
from sgmllib import SGMLParser

img = []

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
        self.imgs = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == "href" and v.startswith("http")]
        if href:
            self.urls.extend(href)

    def start_img(self, attrs):
        src = [v for k, v in attrs if k == "src" and v.startswith("http")]
        if src:
            self.imgs.extend(src)

def get_url_of_page(url, if_img=False):
    urls = []
    try:
        f = urllib2.urlopen(url, timeout=1).read()
        url_listen = URLLister()
        url_listen.feed(f)
        if if_img:
            urls.extend(url_listen.imgs)
        else:
            urls.extend(url_listen.urls)
    except urllib2.URLError, e:
        print e.reason
    return urls

# Recursively walk the pages
def get_page_html(begin_url, depth, ignore_outer, main_site_domain):
    # If external sites are excluded, filter them out
    if ignore_outer:
        if not main_site_domain in begin_url:
            return
    if depth == 1:
        urls = get_url_of_page(begin_url, True)
        img.extend(urls)
    else:
        urls = get_url_of_page(begin_url)
        if urls:
            for url in urls:
                get_page_html(url, depth - 1, ignore_outer, main_site_domain)

# Download the collected images
def download_img(save_path, min_size):
    print "download begin..."
    for im in img:
        filename = im.split("/")[-1]
        dist = os.path.join(save_path, filename)
        # Checking the size by downloading the whole image is wasteful:
        #   if len(urllib2.urlopen(im).read()) < min_size: continue
        # Fetching just the headers first is much cheaper:
        connection = urllib2.build_opener().open(urllib2.Request(im))
        if int(connection.headers.dict['content-length']) < min_size:
            continue
        urllib.urlretrieve(im, dist, None)
        print "Done: ", filename
    print "download end..."

if __name__ == "__main__":
    # First page to crawl for images
    url = "http://www.baidu.com/"
    # Directory where images are saved
    save_path = os.path.abspath("./download")
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Minimum image size, in bytes
    min_size = 92
    # Crawl depth
    max_depth = 1
    # Whether to stay on the target site, i.e. ignore external links
    ignore_outer = True
    main_site_domain = urlparse.urlsplit(url).netloc

    get_page_html(url, max_depth, ignore_outer, main_site_domain)
    download_img(save_path, min_size)
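Note that sgmllib was removed in Python 3. A rough equivalent of the extractor above, using the standard library's html.parser instead (the class name here is illustrative), might look like:

from html.parser import HTMLParser

class LinkImageLister(HTMLParser):
    '''Collect absolute <a href> and <img src> URLs, like the SGMLParser version above.'''
    def __init__(self):
        super().__init__()
        self.urls = []
        self.imgs = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs; value may be None
        if tag == 'a':
            self.urls.extend(v for k, v in attrs if k == 'href' and v and v.startswith('http'))
        elif tag == 'img':
            self.imgs.extend(v for k, v in attrs if k == 'src' and v and v.startswith('http'))

# Usage sketch (URL is a placeholder):
# import urllib.request
# lister = LinkImageLister()
# lister.feed(urllib.request.urlopen('http://www.example.com/').read().decode('utf-8'))
# print(lister.imgs)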
#!/usr/bin/env python3

# -*- coding: utf-8 -*-

import time
import sys
import gzip
import socket
import urllib.request, urllib.parse, urllib.error
import http.cookiejar
class HttpTester:
    def __init__(self, timeout=10, addHeaders=True):
        socket.setdefaulttimeout(timeout)  # set the global socket timeout
        self.__opener = urllib.request.build_opener()
        urllib.request.install_opener(self.__opener)
        if addHeaders: self.__addHeaders()

    def __error(self, e):
        '''Error handling.'''
        print(e)

    def __addHeaders(self):
        '''Add the default headers.'''
        self.__opener.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'),
            ('Connection', 'keep-alive'),
            ('Cache-Control', 'no-cache'),
            ('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3'),
            ('Accept-Encoding', 'gzip, deflate'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]

    def __decode(self, webPage, charset):
        '''Decompress gzip if needed, then decode the page with the given charset.'''
        if webPage.startswith(b'\x1f\x8b'):  # gzip magic number
            return gzip.decompress(webPage).decode(charset)
        else:
            return webPage.decode(charset)

    def addCookiejar(self):
        '''Add a cookiejar handler to self.__opener.'''
        cj = http.cookiejar.CookieJar()
        self.__opener.add_handler(urllib.request.HTTPCookieProcessor(cj))

    def addProxy(self, host, type='http'):
        '''Set a proxy.'''
        proxy = urllib.request.ProxyHandler({type: host})
        self.__opener.add_handler(proxy)

    def addAuth(self, url, user, pwd):
        '''Add HTTP basic authentication.'''
        pwdMsg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        pwdMsg.add_password(None, url, user, pwd)
        auth = urllib.request.HTTPBasicAuthHandler(pwdMsg)
        self.__opener.add_handler(auth)

    def get(self, url, params={}, headers={}, charset='UTF-8'):
        '''HTTP GET.'''
        if params: url += '?' + urllib.parse.urlencode(params)
        request = urllib.request.Request(url)
        for k, v in headers.items(): request.add_header(k, v)  # add headers for this specific request
        try:
            response = urllib.request.urlopen(request)
        except urllib.error.HTTPError as e:
            self.__error(e)
        else:
            return self.__decode(response.read(), charset)


    def post(self, url, params={}, headers={}, charset='UTF-8'):
        '''HTTP POST.'''
        params = urllib.parse.urlencode(params)
        request = urllib.request.Request(url, data=params.encode(charset))  # a request with a data argument is sent as POST
        for k, v in headers.items(): request.add_header(k, v)
        try:
            response = urllib.request.urlopen(request)
        except urllib.error.HTTPError as e:
            self.__error(e)
        else:
            return self.__decode(response.read(), charset)

    def download(self, url, savefile):
        '''Download a file or page.'''
        header_gzip = None
        for header in self.__opener.addheaders:  # remove the header that advertises gzip support
            if 'Accept-Encoding' in header:
                header_gzip = header
                self.__opener.addheaders.remove(header)
        __perLen = 0

        def reporthook(a, b, c):
            # a: blocks downloaded so far; b: block size; c: total size of the remote file
            if c > 1000000:
                nonlocal __perLen
                per = (100.0 * a * b) / c
                if per > 100: per = 100
                per = '{:.2f}%'.format(per)
                print('\b' * __perLen, per, end='')  # overwrite the previous progress percentage
                sys.stdout.flush()
                __perLen = len(per) + 1

        print('--> {}\t'.format(url), end='')
        try:
            urllib.request.urlretrieve(url, savefile, reporthook)  # reporthook is a callback used to report download progress
        except urllib.error.HTTPError as e:
            self.__error(e)
        finally:
            if header_gzip:
                self.__opener.addheaders.append(header_gzip)  # restore the gzip header
            print()

II. Application examples

Post a tweet ("动弹") on OSChina:

ht = HttpTester()
ht.addCookiejar()
# Some keys are masked for privacy
ht.get('https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2F')
ht.post(url='https://www.oschina.net/action/user/hash_login',
        params={'email': '****@foxmail.com',
                'pwd': 'e4a1425583d37fcd33b9*************',  # password hash, captured with the Firefox developer tools
                'save_login': '1'})
ht.get('http://www.oschina.net/')
ht.post(url='http://www.oschina.net/action/tweet/pub',
        params={'user_code': '8VZTqhkJOqhnuugHvzBtME4***********',
                'user': '102*****',
                'msg': '大家在动弹什么? via:(python3, urllib) ->{t}'.format(t=time.time())})
Kuaipan (Kingsoft cloud drive) daily sign-in for bonus space:
ht = HttpTester()
ht.addCookiejar()
# Some keys are masked for privacy
ht.get('https://www.kuaipan.cn/account_login.htm')
ht.post(url='https://www.kuaipan.cn/index.php?ac=account&op=login',
        params={'username': '****@qq.com', 'userpwd': 'lyb********', 'isajax': 'yes'})
ht.get('http://www.kuaipan.cn/index.php?ac=zone&op=taskdetail')
ht.get('http://www.kuaipan.cn/index.php?ac=common&op=usersign')
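The addProxy(), addAuth(), and download() methods are not exercised in these examples; hypothetical calls (the host, credentials, URL, and filename are all placeholders) might look like:

ht = HttpTester()
ht.addProxy('127.0.0.1:8087')                         # route requests through a local HTTP proxy
ht.addAuth('http://www.example.com/', 'user', 'pwd')  # HTTP basic auth for this URL
ht.download('http://www.example.com/file.zip', 'file.zip')  # progress is printed by the reporthook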




Original: http://www.cnblogs.com/fly-xiang-zhao/p/4058983.html
