First of all, here is the Douban movie crawler I wrote myself, shown below:
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

# queue of subject ids; an id is never appended twice
page = []

def check_num_exist(data):
    # linear scan: True if the id is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('25788662')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://movie.douban.com/subject/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title
        find = re.search(r'<title>\n(.*?)\(.*?\)\n</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        # get score
        find = re.search(r'<strong class=.*? property=.*?>(\d\.\d)', webdata)
        if find is None:
            continue
        score = find.group(1)
        # print info about the film
        print '%s %s %s' % (url, title, score)
        # queue every new subject id found on this page
        find = re.findall(r'http://movie.douban.com/subject/(\d{7,8})', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
(1) Downloading the web page itself is very simple; the urllib2 module takes care of everything. A minimal sketch of just the download step, pulled out of the loop above (the subject id is only an example):
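# minimal page fetch with urllib2 (Python 2)
import urllib2

req = urllib2.Request('http://movie.douban.com/subject/25788662/')
req.add_header('User-Agent', 'Mozilla/5.0')  # browser-like agent, as in the full script
try:
    webdata = urllib2.urlopen(req, timeout=10).read()  # raw HTML as a byte string
except urllib2.URLError:
    webdata = None  # URLError also covers HTTPError

The script above only crawls movies; swapping the subject URL and the two regexes turns it into a book crawler: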
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

page = []

def check_num_exist(data):
    # linear scan: True if the id is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('25843109')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://book.douban.com/subject/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title (book titles sit on a single line, unlike movie pages)
        find = re.search(r'<title>(.*?)\(.*?\)</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        # get score
        find = re.search(r'<strong class=.*? property=.*?>\n.*?(\d\.\d)', webdata)
        if find is None:
            continue
        score = find.group(1)
        # print info about the book
        print '%s %s %s' % (url, title, score)
        # queue every new subject id found on this page
        find = re.findall(r'http://book.douban.com/subject/(\d{7,8})', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
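One note on the bookkeeping before the next variant: check_num_exist walks the whole page list for every candidate id, so deduplication cost grows with the size of the queue. For long crawls a set gives constant-time membership tests; here is a sketch of the same frontier logic, with names of my own choosing:

# crawl frontier with O(1) dedup: the list preserves visit order,
# the set answers "has this id been queued already?"
queue = []
seen = set()

def enqueue(subject_id):
    if subject_id not in seen:
        seen.add(subject_id)
        queue.append(subject_id)

The music version is the same crawler again, with only the subject URL and the regexes adjusted: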
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

page = []

def check_num_exist(data):
    # linear scan: True if the id is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('25720661')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://music.douban.com/subject/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title
        find = re.search(r'<title>\n(.*?)\(.*?\)\n</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        # get score
        find = re.search(r'<strong class=.*? property=.*?>(\d\.\d)', webdata)
        if find is None:
            continue
        score = find.group(1)
        # print info about the album
        print '%s %s %s' % (url, title, score)
        # queue every new subject id found on this page
        find = re.findall(r'http://music.douban.com/subject/(\d{7,8})', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
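The same skeleton also crawls Douban groups. Two things change: groups are addressed by an alphanumeric slug (the seed below is 'angel') rather than a 7-8 digit id, and group pages carry no score, so only the name is printed. The slug can be matched with \w+?, since \w already covers letters, digits and the underscore. A quick sanity check of that pattern (the sample HTML line is made up):

import re
sample = '<a href="http://www.douban.com/group/angel/">angel</a>'
print re.findall(r'http://www.douban.com/group/(\w+?)/', sample)  # prints ['angel']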
#!/usr/bin/python
# encoding=utf-8
import re
import time
import urllib2

page = []

def check_num_exist(data):
    # linear scan: True if the slug is already queued
    for i in range(0, len(page)):
        if data == page[i]:
            return True
    return False

if __name__ == '__main__':
    num = 0
    page.append('angel')
    while num < len(page):
        # sleep between requests
        time.sleep(2)
        # produce url address
        url = 'http://www.douban.com/group/' + page[num]
        num += 1
        # get web data; HTTPError must be caught before its parent URLError
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError:
            continue
        except urllib2.URLError:
            continue
        webdata = request.read()
        # get title (groups have no score, so only the name is printed)
        find = re.search(r'<title>\n(.*?)\n</title>', webdata)
        if find is None:
            continue
        title = find.group(1).strip().decode('utf-8')
        print '%s %s' % (url, title)
        # queue every new group slug found on this page
        find = re.findall(r'http://www.douban.com/group/(\w+?)/', webdata)
        if len(find) == 0:
            continue
        for i in range(0, len(find)):
            if not check_num_exist(find[i]):
                page.append(find[i])
Finally, please be polite when crawling Douban: if your requests come too frequently, the server will start forbidding them.
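One possible pattern for this (not from the original post): keep the fixed delay between requests, and back off harder whenever the server answers with an error. The retry count and delays below are arbitrary.

# polite fetch: fixed delay plus exponential backoff on errors
import time
import urllib2

def polite_fetch(url, retries=3, delay=2):
    for attempt in range(retries):
        time.sleep(delay)
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent', 'Mozilla/5.0')
            return urllib2.urlopen(req).read()
        except urllib2.URLError:
            delay *= 2  # the server is unhappy; wait longer before retrying
    return None  # give up after the last retry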
Original article: http://blog.csdn.net/feixiaoxing/article/details/41809089