码迷,mamicode.com
首页 > 其他好文 > 详细

爬站长之家表情包

时间:2017-04-24 23:03:53      阅读:365      评论:0      收藏:0      [点我收藏+]

标签:img   roo   cep   parser   包名   ==   orm   爬取   key   

 1 from bs4 import BeautifulSoup
 2 import os
 3 import requests
 4 
 5 #获取页面内容
 6 def getHtmlText(url, s=text):
 7     try:
 8         r = requests.get(url, timeout=30)
 9         r.raise_for_status()
10         r.encoding = r.apparent_encoding
11         if s == text:
12             return r.text
13         elif s == content:
14             return r.content
15         else:
16             return ‘‘
17     except:
18         return ""
19 
20 
21  #获取表情包名字与表情包套链接
def getEmotionInfo(html):
    """Find every sticker pack on an index page and process each one.

    For each ``<div class="up">`` in *html*, the link inside its
    ``<div class="num_1">`` child supplies the pack title (``title``
    attribute) and the pack page URL (``href`` attribute); both are passed
    to ``getEmotionImgInfo``.

    Args:
        html: Markup of an index page. An empty string produces no work.
    """
    soup = BeautifulSoup(html, "html.parser")
    for div in soup.find_all("div", attrs={"class": "up"}):
        num_div = div.find("div", attrs={"class": "num_1"})
        a = num_div.find("a") if num_div is not None else None
        if a is None:
            # Entry does not have the expected structure; skip it.
            continue
        title = a.attrs.get("title")
        href = a.attrs.get("href")
        if not title or not href:
            continue
        getEmotionImgInfo(title, href)
30 
31 #获取表情包中每一个图片的链接
def getEmotionImgInfo(title, href):
    """Collect the image URLs of one sticker pack and hand them to getImg.

    The images are read from the node two siblings after
    ``<div class="img_text">`` on the pack page.

    Args:
        title: Pack title, forwarded to ``getImg``.
        href: URL of the pack's detail page.
    """
    html = getHtmlText(href)
    soup = BeautifulSoup(html, "html.parser")
    anchor = soup.find("div", attrs={"class": "img_text"})
    if anchor is None:
        # Fetch failed (empty html) or the page layout differs.
        return
    sibling = anchor.next_sibling
    img_div = sibling.next_sibling if sibling is not None else None
    # Text nodes and None have no find_all; only a tag can hold <img>.
    find_all = getattr(img_div, "find_all", None)
    if find_all is None:
        return
    url_list = [img.attrs["src"] for img in find_all("img") if img.has_attr("src")]
    getImg(title, url_list)
42 
43 #获取表情包保存在本地
def getImg(title, url_list, base_dir="D://pics//"):
    """Download the images of one sticker pack into ``base_dir/title``.

    Files that already exist are left untouched, and downloads that fail
    are skipped. Progress is printed on a single line that is rewritten in
    place.

    Args:
        title: Pack title; used as the directory name and in the progress
            line.
        url_list: Image URLs; the last path segment of each URL is the
            file name.
        base_dir: Directory under which the pack directory is created.
    """
    root = os.path.join(base_dir, title)
    # makedirs also creates a missing base_dir, where mkdir would fail.
    os.makedirs(root, exist_ok=True)
    total = len(url_list)
    count_small = 0
    for key in url_list:
        path = os.path.join(root, key.split("/")[-1])
        if os.path.exists(path):
            continue
        img_content = getHtmlText(key, "content")
        if not img_content:
            # A failed download yields "", which cannot be written to a
            # binary file and would only leave an empty file behind.
            continue
        with open(path, "wb") as f:
            f.write(img_content)
        count_small += 1
        print("\r{}文件进度:{:.2f}%".format(title, count_small * 100 / total), end="")
57 
if __name__ == "__main__":
    # The first index page has its own name; later pages are
    # index_<n>.html.
    first_url = "http://sc.chinaz.com/biaoqing/index.html"
    root_url = "http://sc.chinaz.com/biaoqing/index_"

    pages = 20
    # Walk the index pages one by one. range(1, pages) stops at page
    # pages - 1, so 19 pages are crawled.
    for i in range(1, pages):
        if i == 1:
            html = getHtmlText(first_url)
        else:
            url = root_url + str(i) + ".html"
            html = getHtmlText(url)
        getEmotionInfo(html)

 

爬站长之家表情包

标签:img   roo   cep   parser   包名   ==   orm   爬取   key   

原文地址:http://www.cnblogs.com/jp-mao/p/6759005.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!