标签:
闲来无事,爬取一下我喜欢的海贼王站点上的图片。
图片保存在 D 盘的 imgcache 目录下(d:/imgcache/),运行前需要先建立该目录。
# -*- coding: utf-8 -*-
import urllib
import urllib2
import json
from bs4 import BeautifulSoup
import threadpool 
import thread
class htmlpaser:
        """Fetch pages, extract article links/content and download their images.

        The HTTP helpers rely on the file-level ``urllib``/``urllib2`` imports
        and the parsing helpers on ``BeautifulSoup``.
        """

        # Directory DownImg writes to when the caller passes no ``savedir``.
        IMG_DIR = r"d:/imgcache/"

        def __init__(self):
                # Endpoint that Post() submits form data to.
                self.url='http://1.hzfans.sinaapp.com/process.php'

        def _read_all(self,resp):
                """Return the full body of ``resp`` and always close it."""
                try:
                        return resp.read()
                finally:
                        resp.close()

        def Post(self,postdata):
                """POST ``postdata`` (a mapping or sequence of pairs) to ``self.url``.

                The response body is printed, as before, and also returned so
                callers can use it.
                """
                data = urllib.urlencode(postdata)
                # The original referenced an undefined local ``url`` here.
                req = urllib2.Request(self.url, data)
                html = self._read_all(urllib2.urlopen(req))
                print(html)
                return html

        def GetHtml(self,url):
                """Return the raw body of ``url`` using a browser User-Agent (5 s timeout)."""
                headers = {
                    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                }
                req = urllib2.Request(url,None,headers)
                return self._read_all(urllib2.urlopen(req,None,5))

        def GetHtml2(self,url):
                """Return the raw body of ``url`` via plain ``urllib.urlopen``."""
                return self._read_all(urllib.urlopen(url))

        def GetHtml3(self,url,referer=None):
                """Return the body of ``url`` using a fuller set of browser headers.

                ``referer`` is sent as the ``Referer`` header only when given; set
                it to the target site's host if the site still refuses the request.
                Because the request advertises ``Accept-Encoding: gzip``, a gzip
                encoded response is decompressed before being returned.
                """
                import zlib
                req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Accept':'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding':'gzip',
                'Connection':'close',
                }
                if referer:
                        # A None value would be sent as the literal text "None".
                        req_header['Referer'] = referer
                req_timeout = 5
                req = urllib2.Request(url,None,req_header)
                resp = urllib2.urlopen(req,None,req_timeout)
                # Read the header before _read_all closes the response.
                encoding = resp.info().get('Content-Encoding')
                html = self._read_all(resp)
                if encoding and 'gzip' in encoding.lower():
                        # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer.
                        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
                return html

        def GetList(self,html):
                """Return the ``<a>`` tags inside ``<ul class="list">`` of ``html``.

                ``html`` may be a string or an iterable of string chunks. An empty
                list is returned when the page has no such ``<ul>``.
                """
                soup = BeautifulSoup(''.join(html))
                baseitem=soup.find('ul',{'class':'list'})
                if baseitem is None:
                        return []
                return baseitem.select('li a')

        def DownImg(self,imgurl,savedir=None):
                """Download ``imgurl`` into ``savedir`` (default ``IMG_DIR``).

                The directory is created when missing. Returns the result of
                ``urllib.urlretrieve``, or None when no file name can be derived
                from ``imgurl`` (None, empty, or ending in "/").
                """
                import os
                filename = self.gGetFileName(imgurl)
                if not filename:
                        return None
                if savedir is None:
                        savedir = self.IMG_DIR
                self.mkdir(savedir)
                return urllib.urlretrieve(imgurl, os.path.join(savedir, filename))

        def gGetFileName(self,url):
                """Return the text after the last "/" of ``url``.

                None and "" are returned unchanged.
                """
                if url is None:
                        return None
                if url == "":
                        return ""
                return url.rsplit("/", 1)[-1]

        @staticmethod
        def mkdir(path):
                """Create directory ``path`` (with parents) if it does not exist.

                Surrounding whitespace and trailing backslashes are stripped first.
                Returns True when the directory was created, False when it already
                existed.
                """
                import os
                path = path.strip().rstrip("\\")
                if os.path.exists(path):
                        return False
                try:
                        os.makedirs(path)
                except OSError:
                        # Another worker may have created it between the check and
                        # makedirs; only re-raise when the directory is really absent.
                        if not os.path.isdir(path):
                                raise
                        return False
                return True

        def ParseContent(self,html):
                """Download an article's images and return ``(title, content)``.

                ``title`` is the text of ``div.msg div.m_left``; ``content`` is the
                UTF-8 encoded text of ``div.showbox`` cut off at the "热点推荐"
                marker, or the whole text when the marker is absent.
                Raises AttributeError when either ``div`` is missing from the page.
                """
                soup = BeautifulSoup(''.join(html))
                baseitem=soup.find('div',{'class':'showbox'})
                title=soup.find('div',{'class':'msg'}).find('div',{'class':'m_left'}).get_text()
                for img in baseitem.find_all('img'):
                        imgurl=img.get('src')
                        # <img> tags without a usable src have nothing to download.
                        if imgurl:
                                self.DownImg(imgurl)
                content=baseitem.get_text().encode('utf8')
                position=content.find(u'热点推荐'.encode('utf8'))
                if position == -1:
                        # find() returned -1: slicing with it would drop the last byte.
                        return title,content
                return title,content[:position]

        def ParseItem(self,item):
                """Fetch and parse the article ``item`` links to; return its title.

                ``item`` is a tag-like object with ``get('href')``. Returns None
                when it carries no ``href``.
                """
                url=item.get('href')
                if url is None:
                        return None
                # Use this instance rather than the module-level ``obj`` global.
                html=self.GetHtml2(url)
                title,content=self.ParseContent(html)
                return title
def print_result(request, result):
        """Thread-pool callback: print ``<requestID>:<result>`` for a finished job.

        ``result`` is whatever the job returned and may be None; %-formatting
        is used so a None result no longer raises TypeError.
        """
        print("%s:%s" % (request.requestID, result))
        
        
# Walk list pages 1..39 and queue one thread-pool job per article link;
# print_result is invoked with each job's return value.
obj = htmlpaser()
pool = threadpool.ThreadPool(10)
for i in range(1, 40):
        url = "http://op.52pk.com/shtml/op_wz/list_2594_%d.shtml" % i
        links = obj.GetList(obj.GetHtml2(url))
        print('add job %d\r' % i)
        for job in threadpool.makeRequests(obj.ParseItem, links, print_result):
                pool.putRequest(job)
# Block until every queued job has finished.
pool.wait()
标签:
原文地址:http://www.cnblogs.com/bhlsheji/p/5152931.html