小爬虫，抓取某贴吧内所有帖子的图片

时间：2014-09-23 13:35:14 阅读：194 评论：0 收藏：0 [点我收藏+]

#!/usr/bin/env python
#-*- coding:utf8 -*-

import sys

# Python 2 only: force the implicit str <-> unicode conversion codec to GBK.
# site.py removes sys.setdefaultencoding at interpreter start-up, so the
# module has to be reloaded to get the function back before calling it.
# Neither the reload() builtin nor sys.setdefaultencoding exists on
# Python 3, hence the version guard.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('gbk')
import urllib,urllib2
import re
from bs4 import BeautifulSoup

class GetHtml(object):
    """Small crawler that downloads the .jpg images of Tieba threads.

    Constructing an instance fetches the forum front page and collects the
    relative paths of the threads linked from it; getImg() then visits each
    thread and saves its images into the current working directory.

    Attributes:
        req: the urllib2.Request used to fetch the forum front page.
        urls: list of relative thread paths (e.g. "/p/1234567890"), in the
            order first seen on the front page, without duplicates.
    """

    # Front page of the photography forum.
    START_URL = "http://tieba.baidu.com/f?kw=摄影&ie=utf-8&ie=utf-8&fr=wwwt"
    # Thread paths are relative, so they are appended to this root.
    SITE_ROOT = 'http://tieba.baidu.com'
    # Sent with every page request so the crawler looks like a browser.
    USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) '
                  'Gecko/20091201 Firefox/3.5.6')
    # Relative thread link, for example /p/1234567890.
    THREAD_RE = re.compile(r"/p/\d{10}")
    # Captures the src of an image tag whose src ends in .jpg and is
    # immediately followed by a pic_ext attribute.
    IMAGE_RE = re.compile(r'src="(.+?\.jpg)" pic_ext')

    def __init__(self):
        """Fetch the forum front page and collect the thread paths on it.

        Raises whatever urllib2.urlopen raises when the page cannot be
        fetched.
        """
        self.req = self._browser_request(self.START_URL)
        content = self._read(self.req)
        soup = BeautifulSoup(content)
        hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
        self.urls = self._thread_paths(hrefs)

    @classmethod
    def _browser_request(cls, url):
        """Return a urllib2.Request for url carrying the browser User-Agent."""
        req = urllib2.Request(url)
        # The header value must be the UA string itself, not a dict.
        req.add_header('User-Agent', cls.USER_AGENT)
        return req

    @staticmethod
    def _read(req):
        """Open req, return the whole response body, and close the response."""
        resp = urllib2.urlopen(req)
        try:
            return resp.read()
        finally:
            resp.close()

    @classmethod
    def _thread_paths(cls, hrefs):
        """Return the distinct thread paths found in hrefs, in first-seen order.

        hrefs is an iterable of href attribute values; None entries (anchors
        without an href) are skipped.
        """
        seen = set()
        paths = []
        for href in hrefs:
            if href is None:
                continue
            for path in cls.THREAD_RE.findall(href):
                # The same thread is often linked more than once per page;
                # keep one entry so it is not downloaded repeatedly.
                if path not in seen:
                    seen.add(path)
                    paths.append(path)
        return paths

    @classmethod
    def _image_urls(cls, html):
        """Return the .jpg image URLs found in a thread page's HTML."""
        return cls.IMAGE_RE.findall(html)

    def getImg(self):
        """Download every .jpg found in the collected threads.

        Files are written to the current working directory as 0.jpg, 1.jpg,
        ... The counter runs across all threads so that images from one
        thread do not overwrite those of another.
        """
        saved = 0
        for path in self.urls:
            html = self._read(self._browser_request(self.SITE_ROOT + path))
            for img in self._image_urls(html):
                urllib.urlretrieve(img, '%s.jpg' % saved)
                saved += 1
                        
# Script entry point: collect the thread links, then download their images.
if __name__ == "__main__":
    crawler = GetHtml()
    crawler.getImg()

小爬虫，抓取某贴吧内所有帖子的图片

标签：blog http for 文件数据 div on c log

原文地址：http://www.cnblogs.com/xiaoluosun/p/3988039.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行