
Python web scraping (1)


Most websites define a robots.txt file to let crawlers know what restrictions apply when crawling the site. You can view it by appending /robots.txt to the site's address.
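Python's standard library can parse this file for us. Below is a minimal sketch using urllib.robotparser (the site and user-agent string are only placeholders):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.com/robots.txt')   # placeholder site
rp.read()
# check whether our crawler is allowed to fetch a given page
print(rp.can_fetch('MyCrawler', 'http://example.com/some/page.html'))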

Identify the technologies a website is built with: use the builtwith package.

Because this package was written for Python 2, it needs some modifications to work on Python 3. We also use the chardet package to detect the site's encoding. The complete modified module is below (it can directly replace the original file):

import sys
import os
import re
import json
import urllib.request
import urllib.error
import chardet


def builtwith(url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technology used to build a website

    >>> builtwith('http://wordpress.com')
    {u'blogs': [u'PHP', u'WordPress'], u'font-scripts': [u'Google Font API'], u'web-servers': [u'Nginx'], u'javascript-frameworks': [u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress']}
    >>> builtwith('http://webscraping.com')
    {u'javascript-frameworks': [u'jQuery', u'Modernizr'], u'web-frameworks': [u'Twitter Bootstrap'], u'web-servers': [u'Nginx']}
    >>> builtwith('http://microsoft.com')
    {u'javascript-frameworks': [u'jQuery'], u'mobile-frameworks': [u'jQuery Mobile'], u'operating-systems': [u'Windows Server'], u'web-servers': [u'IIS']}
    >>> builtwith('http://jquery.com')
    {u'cdn': [u'CloudFlare'], u'web-servers': [u'Nginx'], u'javascript-frameworks': [u'jQuery', u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress'], u'blogs': [u'PHP', u'WordPress']}
    >>> builtwith('http://joomla.org')
    {u'font-scripts': [u'Google Font API'], u'miscellaneous': [u'Gravatar'], u'web-servers': [u'LiteSpeed'], u'javascript-frameworks': [u'jQuery'], u'programming-languages': [u'PHP'], u'web-frameworks': [u'Twitter Bootstrap'], u'cms': [u'Joomla'], u'video-players': [u'YouTube']}
    """
    techs = {}

    # check URL
    for app_name, app_spec in data['apps'].items():
        if 'url' in app_spec:
            if contains(url, app_spec['url']):
                add_app(techs, app_name, app_spec)

    # download content
    if None in (headers, html):
        try:
            request = urllib.request.Request(url, None, {'User-Agent': user_agent})
            if html:
                # already have HTML so just need to make HEAD request for headers
                request.get_method = lambda: 'HEAD'
            response = urllib.request.urlopen(request)
            if headers is None:
                headers = response.headers
            if html is None:
                html = response.read()
                # decode the response using the detected encoding, falling back to GBK
                encode_type = chardet.detect(html)
                if encode_type['encoding'] == 'utf-8':
                    html = html.decode('utf-8')
                else:
                    html = html.decode('gbk')
        except Exception as e:
            print('Error:', e)
            request = None

    # check headers
    if headers:
        for app_name, app_spec in data['apps'].items():
            if 'headers' in app_spec:
                if contains_dict(headers, app_spec['headers']):
                    add_app(techs, app_name, app_spec)

    # check html
    if html:
        for app_name, app_spec in data['apps'].items():
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(techs, app_name, app_spec)
                        break

        # check meta
        # XXX add proper meta data parsing
        metas = dict(re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>',
                                re.IGNORECASE).findall(html))
        for app_name, app_spec in data['apps'].items():
            for name, content in app_spec.get('meta', {}).items():
                if name in metas:
                    if contains(metas[name], content):
                        add_app(techs, app_name, app_spec)
                        break

    return techs


parse = builtwith


def add_app(techs, app_name, app_spec):
    """Add this app to technology
    """
    for category in get_categories(app_spec):
        if category not in techs:
            techs[category] = []
        if app_name not in techs[category]:
            techs[category].append(app_name)
            implies = app_spec.get('implies', [])
            if not isinstance(implies, list):
                implies = [implies]
            for app_name in implies:
                add_app(techs, app_name, data['apps'][app_name])


def get_categories(app_spec):
    """Return category names for this app_spec
    """
    return [data['categories'][str(c_id)] for c_id in app_spec['cats']]


def contains(v, regex):
    """Removes meta data from regex then checks for a regex match
    """
    return re.compile(regex.split('\\;')[0], flags=re.IGNORECASE).search(v)


def contains_dict(d1, d2):
    """Takes 2 dictionaries

    Returns True if d1 contains all items in d2"""
    for k2, v2 in d2.items():
        v1 = d1.get(k2)
        if v1:
            if not contains(v1, v2):
                return False
        else:
            return False
    return True


def load_apps(filename='apps.json.py'):
    """Load apps from Wappalyzer JSON (https://github.com/ElbertF/Wappalyzer)
    """
    # get the path of this filename relative to the current script
    # XXX add support to download update
    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    return json.load(open(filename))


data = load_apps()

if __name__ == '__main__':
    urls = sys.argv[1:]
    if urls:
        for url in urls:
            results = builtwith(url)
            for result in sorted(results.items()):
                print('%s: %s' % result)
    else:
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])

  

After that, we can use builtwith to read the technology stack of a website:

import builtwith

builtwith.parse(url)  
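For example (the site and the exact output below are only illustrative; the real result depends on the site and on the apps.json data):

import builtwith

# the returned dict maps technology categories to lists of technology names
result = builtwith.parse('http://example.webscraping.com')
print(result)
# e.g. {'web-servers': ['Nginx'], 'web-frameworks': ['Twitter Bootstrap'], ...}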

There is a package named whois that can be used to look up the owner of a website:

import whois

print(whois.whois(url))
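The returned object exposes the registration fields. A small sketch, assuming the python-whois package and an illustrative domain:

import whois

w = whois.whois('example.com')   # illustrative domain
print(w.domain_name)             # registered domain name
print(w.creation_date)           # when the domain was registered
print(w.name_servers)            # authoritative name servers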

  

Now let's build a download function:

import urllib.request, urllib.error

def download(url):
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        # urllib.error is a module; the actual exception class is URLError
        print('Download error:', e.reason)
        html = None
    return html

When a download error occurs, this function catches the exception and returns None.
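So callers only need to check the return value (the URL here is just a placeholder):

html = download('http://example.com')
if html is None:
    print('Download failed')
else:
    print('Downloaded %d bytes' % len(html))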

Now consider the following problem: download errors come in several types. 5XX errors mean something is wrong on the server side, while 4XX errors mean something is wrong with the request. When the problem is on the server side, it is worth retrying the download, so here is an improved version:

import urllib.request, urllib.error

def download(url, num_retries=2):
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # only retry on 5XX (server-side) errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html

Here num_retries is the number of times we will retry the download.
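To see the retry logic in action we need a URL that returns a 5XX status; httpstat.us is one commonly used test service (assuming it is reachable):

# a 500 response triggers the retries, so 'Downloading:' prints 3 times in total
download('http://httpstat.us/500', num_retries=2)
# a 4XX error such as http://httpstat.us/404 would not be retried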

Next we need to consider the user agent. By default, urllib identifies itself as Python-urllib/x.y, where x.y is the Python version number. Some websites block this default agent, so we should set our own. The improved version is below:

import urllib.request, urllib.error

def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # only retry on 5XX (server-side) errors, keeping the same user agent
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html

Note that urlopen() accepts either a URL string or a Request object as its argument.
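In other words, these two calls fetch the same page (placeholder URL):

import urllib.request

# passing a URL string directly
html1 = urllib.request.urlopen('http://example.com').read()

# passing a Request object, which lets us attach headers first
request = urllib.request.Request('http://example.com', headers={'User-Agent': 'wswp'})
html2 = urllib.request.urlopen(request).read()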

With that, our download function is complete: it can catch exceptions, retry the download, and set a user agent.
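Typical usage, with the defaults or with our own values (the URL and user agent string are only examples):

# default user agent ('wswp') and 2 retries
html = download('http://example.webscraping.com')

# custom user agent and more retries
html = download('http://example.webscraping.com', user_agent='MyCrawler', num_retries=3)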

  

 

Original article: http://www.cnblogs.com/peter-sun/p/7778495.html
