码迷,mamicode.com
首页 > 编程语言 > 详细

无比强大!Python抓取cssmoban网站的模版并下载

时间:2014-05-15 09:17:50      阅读:423      评论:0      收藏:0      [点我收藏+]

标签:blog   class   code   c   tar   ext   

Python实现抓取http://www.cssmoban.com/cssthemes网站的模版并下载


实现代码

# -*- coding: utf-8 -*-
import urlparse
import urllib2
import re
import os  
import os.path

# Listing page where the crawl starts; also used as the base for resolving links.
URL = 'http://www.cssmoban.com/cssthemes'

# Set the global default socket timeout to 500 seconds
urllib2.socket.setdefaulttimeout(500)

# Fetch the content behind a URL
def getUrlContent(url):
    """Return the raw response body fetched from *url*.

    The response object is closed even when reading fails, so the
    underlying connection is not leaked across the many requests the
    crawler makes.

    Raises whatever ``urllib2.urlopen`` / ``read`` raise (e.g.
    ``urllib2.URLError`` or a socket timeout).
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

# Collect the <a href="/cssthemes/<digits>.shtml">...</a> tags found in the html
def getAllUrl(html):
    """Return every anchor tag in *html* that links to a template detail page.

    A detail-page link has the form ``<a href="/cssthemes/<digits>.shtml">``.
    Each list item is the complete anchor text, from ``<a`` up to and
    including the first following ``/a>``; an empty list means no such
    link was found.
    """
    # Raw string so that \s, \d and \. reach the regex engine unchanged.
    return re.findall(r'<a\s+href="/cssthemes/\d+\.shtml">.*?/a>', html)

# Get the title(s) of the file to download
def getDownTitle(html):
    """Return the inner text of every ``<h1>...</h1>`` element in *html*.

    The caller uses the first item as the file title. The list is empty
    when the page contains no ``<h1>`` element.
    """
    return re.findall(r'<h1>(.*?)</h1>', html)

# Get the anchor tag(s) that carry the file download URL
def getDownUrl(html):
    """Return every ``<a ... class="button btn-down" ...>...</a>`` tag in *html*.

    Each list item is the complete anchor text; the caller extracts the
    ``href`` attribute from the first one. The list is empty when no
    download button is present.
    """
    # [^>]* keeps the class attribute inside the SAME opening tag. A plain
    # ".*?" would let the match start at an earlier, unrelated <a> on the
    # same line and swallow everything up to the download button.
    return re.findall(r'<a\b[^>]*class="button btn-down"[^>]*>.*?</a>', html)

# Get the anchor tag(s) that link to the next page
def getNextUrl(html):
    """Return every anchor tag in *html* whose content ends with ``下一页``.

    Each list item is the complete anchor text; the caller extracts the
    ``href`` attribute from the first one. The list is empty when the
    page has no "next page" link.
    """
    # (?:(?!<a\b).)*? forbids another "<a" between the start of the match
    # and the link text, so a pager written on a single line yields only the
    # "next page" anchor rather than a span starting at the first pager link.
    return re.findall(r'<a\b(?:(?!<a\b).)*?下一页</a>', html)

# Download a file into the template/ directory
def download(title,url):
    """Fetch *url* and save it as ``template/<title><extension>``.

    title -- file name without extension; a UTF-8 byte string is decoded
             first. It originates from remote HTML, so path separators
             are replaced to keep the file inside ``template/``.
    url   -- address of the file; the extension is taken from the path
             component of the URL (e.g. ``.zip``), so a query string or
             a dot in the host name cannot end up in the file name.

    Raises whatever ``urllib2.urlopen`` or the file operations raise.
    """
    response = urllib2.urlopen(url)
    try:
        result = response.read()
    finally:
        response.close()

    if not os.path.isdir("template/"):
        os.makedirs("template/")

    if isinstance(title, bytes):
        title = title.decode('utf-8')
    # Untrusted input: never let the page title introduce sub-directories.
    safe_title = title.replace('/', '_').replace('\\', '_')

    extension = os.path.splitext(urlparse.urlparse(url).path)[1]
    newname = "template/" + safe_title + extension

    # "with" guarantees the file is flushed and closed even if write() fails.
    with open(newname, "wb") as target:
        target.write(result)

# Write an informational log entry
def i(msg):
    """Append *msg* plus a newline to ``info.log`` and echo it to stdout."""
    # "with" closes the log file even if the write fails.
    with open('info.log', 'a') as fileobj:
        fileobj.write(msg + '\n')
    # Parenthesised single-argument form is valid on Python 2 and Python 3.
    print(msg)
# Write an error log entry
def e(msg):
    """Append *msg* plus a newline to ``error.log`` and echo it to stdout."""
    # "with" closes the log file even if the write fails.
    with open('error.log', 'a') as fileobj:
        fileobj.write(msg + '\n')
    # Parenthesised single-argument form is valid on Python 2 and Python 3.
    print(msg)
# Extract the value of the first href="..." attribute of an anchor tag
def _extract_href(anchor):
    """Return the first ``href`` attribute value found in *anchor*.

    Raises ValueError when the text contains no ``href="..."`` attribute.
    """
    match = re.search(r'href="([^"]*)"', anchor)
    if match is None:
        raise ValueError('no href attribute found in: %s' % anchor)
    return match.group(1)


# Download the template offered on one detail page
def _download_template(detailUrl):
    """Fetch the detail page at *detailUrl* and download the file it offers.

    Raises ValueError when the page has no download button; network and
    file errors from the helpers propagate unchanged.
    """
    downHtml = getUrlContent(detailUrl)

    downTitleList = getDownTitle(downHtml)
    downTitle = downTitleList[0] if downTitleList else ''

    downUrlList = getDownUrl(downHtml)
    if not downUrlList:
        raise ValueError('download link not found')
    downUrl = _extract_href(downUrlList[0])

    i('开始下载:%s,文件名:%s' % (downUrl, downTitle))
    download(downTitle, downUrl)
    i('%s下载完成,保存文件名:%s' % (downUrl, downTitle))


# Crawl every listing page and download all templates
def main():
    """Walk the listing pages starting at URL and download every template.

    A failure on a single template is written to the error log and the
    crawl continues. The crawl stops after the first listing page that
    has no "next page" link; that page's templates are still processed.
    """
    pageUrl = URL
    html = getUrlContent(pageUrl)
    i('开始下载:%s' % (URL))

    while True:
        # Handle the current page BEFORE looking for the next one, so the
        # last page (which has no "next" link) is not skipped.
        for a in getAllUrl(html):
            downGotoUrl = ''
            try:
                downGotoUrl = urlparse.urljoin(URL, _extract_href(a))
                _download_template(downGotoUrl)
            except Exception as err:
                # The exception must not be bound to the name "e": that
                # would replace the e() logger and break this handler.
                e('地址:%s下载失败,失败信息:' % (downGotoUrl))
                e(str(err))

        nextPage = getNextUrl(html)
        if not nextPage:
            e('地址:%s,未找到下一页,程序退出' % (pageUrl))
            break

        # Pager links are relative to the listing directory.
        pageUrl = urlparse.urljoin(URL + '/', _extract_href(nextPage[0]))

        i('-----------------------------------------')
        i('执行下一页:%s' % (pageUrl))
        html = getUrlContent(pageUrl)


if __name__ == '__main__':
    main()
    
        


无比强大!Python抓取cssmoban网站的模版并下载,布布扣,bubuko.com

无比强大!Python抓取cssmoban网站的模版并下载

标签:blog   class   code   c   tar   ext   

原文地址:http://blog.csdn.net/wiker_yong/article/details/25844349

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!