First of all, here is the Douban movie crawler I wrote myself, shown below:
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

# queue of subject ids; an id is never appended twice
page = []

def check_num_exist(data):
    # linear scan: True if the id is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('25788662')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://movie.douban.com/subject/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title
        find = re.search(r'<title>\n(.*?)\(.*?\)\n</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        # get score
        find = re.search(r'<strong class=.*? property=.*?>(\d\.\d)', webdata)
        if find is None:
            continue
        score = find.group(1)
        # print info about the film
        print '%s %s %s' % (url, title, score)
        # queue every new subject id found on this page
        find = re.findall(r'http://movie.douban.com/subject/(\d{7,8})', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
(1) Downloading the web page itself is very simple; the urllib2 module takes care of everything. A minimal sketch of just the download step, pulled out of the loop above (the subject id is only an example):
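# minimal page fetch with urllib2 (Python 2)
import urllib2

req = urllib2.Request('http://movie.douban.com/subject/25788662/')
req.add_header('User-Agent', 'Mozilla/5.0')  # browser-like agent, as in the full script
try:
    webdata = urllib2.urlopen(req, timeout=10).read()  # raw HTML as a byte string
except urllib2.URLError:
    webdata = None  # URLError also covers HTTPError

The script above only crawls movies; swapping the subject URL and the two regexes turns it into a book crawler: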
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

page = []

def check_num_exist(data):
    # linear scan: True if the id is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('25843109')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://book.douban.com/subject/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title (book titles sit on a single line, unlike movie pages)
        find = re.search(r'<title>(.*?)\(.*?\)</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        # get score
        find = re.search(r'<strong class=.*? property=.*?>\n.*?(\d\.\d)', webdata)
        if find is None:
            continue
        score = find.group(1)
        # print info about the book
        print '%s %s %s' % (url, title, score)
        # queue every new subject id found on this page
        find = re.findall(r'http://book.douban.com/subject/(\d{7,8})', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
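One note on the bookkeeping before the next variant: check_num_exist walks the whole page list for every candidate id, so deduplication cost grows with the size of the queue. For long crawls a set gives constant-time membership tests; here is a sketch of the same frontier logic, with names of my own choosing:

# crawl frontier with O(1) dedup: the list preserves visit order,
# the set answers "has this id been queued already?"
queue = []
seen = set()

def enqueue(subject_id):
    if subject_id not in seen:
        seen.add(subject_id)
        queue.append(subject_id)

The music version is the same crawler again, with only the subject URL and the regexes adjusted: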
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

page = []

def check_num_exist(data):
    # linear scan: True if the id is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('25720661')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://music.douban.com/subject/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title
        find = re.search(r'<title>\n(.*?)\(.*?\)\n</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        # get score
        find = re.search(r'<strong class=.*? property=.*?>(\d\.\d)', webdata)
        if find is None:
            continue
        score = find.group(1)
        # print info about the album
        print '%s %s %s' % (url, title, score)
        # queue every new subject id found on this page
        find = re.findall(r'http://music.douban.com/subject/(\d{7,8})', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
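The same skeleton also crawls Douban groups. Two things change: groups are addressed by an alphanumeric slug (the seed below is 'angel') rather than a 7-8 digit id, and group pages carry no score, so only the name is printed. The slug can be matched with \w+?, since \w already covers letters, digits and the underscore. A quick sanity check of that pattern (the sample HTML line is made up):

import re
sample = '<a href="http://www.douban.com/group/angel/">angel</a>'
print re.findall(r'http://www.douban.com/group/(\w+?)/', sample)  # prints ['angel']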
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

page = []

def check_num_exist(data):
    # linear scan: True if the slug is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('angel')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://www.douban.com/group/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title (groups have no score, so only the name is printed)
        find = re.search(r'<title>\n(.*?)\n</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        print '%s %s' % (url, title)
        # queue every new group slug found on this page
        find = re.findall(r'http://www.douban.com/group/(\w+?)/', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
Finally, please be polite when crawling Douban: if your requests come too frequently, the server will start forbidding them.
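One possible pattern for this (not from the original post): keep the fixed delay between requests, and back off harder whenever the server answers with an error. The retry count and delays below are arbitrary.

# polite fetch: fixed delay plus exponential backoff on errors
import time
import urllib2

def polite_fetch(url, retries=3, delay=2):
    for attempt in range(retries):
        time.sleep(delay)
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent', 'Mozilla/5.0')
            return urllib2.urlopen(req).read()
        except urllib2.URLError:
            delay *= 2  # the server is unhappy; wait longer before retrying
    return None  # give up after the last retry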
Original article: http://blog.csdn.net/feixiaoxing/article/details/41809089