#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Tags:
import urllib
import urllib2
import re
class Turtle(object):
    """Interactive console reader for the "hot" list of qiushibaike.com.

    Pages are downloaded one at a time, parsed with a regular expression
    into ``[author, text, like_count]`` entries and printed one entry per
    line of console input.  Entering ``q`` or ``Q`` stops the reader.

    The class relies on the module-level ``urllib2`` and ``re`` imports and
    on the ``raw_input`` builtin.
    """

    # One post on a listing page.  Capture groups: 1 = author (<h2> text),
    # 2 = post body, 3 = contents of the HTML comment that follows the body
    # (captured but not used), 4 = like counter.
    _STORY_PATTERN = re.compile(
        r'<div.*?author.*?<h2>(.*?)</h2>.*?'
        r'<div.*?content">(.*?)<!--(.*?)-->.*?</div>'
        r'.*?<div.*?class="stats.*?class="number">(.*?)</i>',
        re.S)

    def __init__(self):
        # Number of the next page to download.
        self.pageIndex = 1
        # Queue of parsed pages that have not been shown yet; each page is a
        # list of [author, text, like_count] entries.
        self.stories = []
        # Cleared when the user asks to quit; ends the loop in start().
        self.enable = True
        # Request headers sent with every page download.
        self.header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    def getPage(self, pageIndex):
        """Download one listing page and return its decoded text.

        Args:
            pageIndex: page number appended to the hot-list URL.

        Returns:
            The response body decoded as UTF-8, or None when the request
            raises ``urllib2.URLError`` (its code and/or reason are printed).
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        request = urllib2.Request(url, headers=self.header)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read()/decode() fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'code'):
                print(u'错误码: %s' % (e.code,))
            if hasattr(e, 'reason'):
                print(u'错误原因: %s' % (e.reason,))
            return None

    def getPageItem(self, pageIndex):
        """Download a page and extract its posts.

        Args:
            pageIndex: page number passed on to getPage().

        Returns:
            A list of ``[author, text, like_count]`` lists (all stripped
            strings, ``<br/>`` in the text replaced by newlines), or None
            when the page could not be loaded or was empty.
        """
        pageContent = self.getPage(pageIndex)
        if not pageContent:
            print(u'页面加载失败。。。')
            return None
        pageStories = []
        for author, body, _comment, likes in self._STORY_PATTERN.findall(pageContent):
            text = body.replace('<br/>', '\n')
            pageStories.append([author.strip(), text.strip(), likes.strip()])
        return pageStories

    def loadPage(self):
        """Queue the next page when fewer than two unread pages remain.

        A page that fails to load leaves ``pageIndex`` unchanged so the same
        page is requested again on the next call.  A page that loads but
        yields no posts is skipped: ``pageIndex`` advances and nothing is
        queued.
        """
        if len(self.stories) >= 2:
            return
        print(u'==============剩余未读小于两页,预加载下一页==============')
        pageStories = self.getPageItem(self.pageIndex)
        if pageStories is None:
            # Never queue None: getOneStory() iterates over queued pages.
            return
        self.pageIndex += 1
        if pageStories:
            self.stories.append(pageStories)

    def getOneStory(self):
        """Show the posts of the first queued page, one per line of input.

        Before each post a line is read from the console; ``q``/``Q`` (or
        end of input) clears ``self.enable`` and returns immediately.
        Otherwise the next page is preloaded if needed and the post text,
        author and like count are printed.  Does nothing when no page is
        queued.
        """
        if not self.stories:
            return
        for story in self.stories[0]:
            try:
                isQ = raw_input()
            except EOFError:
                # Input stream closed: treat it like an explicit quit.
                isQ = 'q'
            if isQ in ('q', 'Q'):
                self.enable = False
                return
            self.loadPage()
            print(story[1])
            # Unicode format string: the separator is non-ASCII and the
            # arguments are unicode, which a byte-string format cannot mix
            # on Python 2.
            print(u'-----%s,liked by %s' % (story[0], story[2]))

    def start(self):
        """Run the reader until the user quits or no page can be loaded."""
        print('start to read page 1')
        while self.enable:
            if not self.stories:
                self.loadPage()
            if not self.stories:
                # Nothing could be loaded; stop instead of spinning forever.
                break
            self.getOneStory()
            if not self.enable:
                # User quit in the middle of the page.
                break
            del self.stories[0]
            print(u'===========该页已读完,读取下一页===========')
if __name__ == '__main__':
    # Start the interactive reader only when executed as a script, so that
    # importing this module triggers no network or console I/O.
    turtle = Turtle()
    turtle.start()
# Tags:
# Original source: http://www.cnblogs.com/sixstones/p/5291642.html