码迷,mamicode.com
首页 > 其他好文 > 详细

无法解决的问题

时间:2019-04-20 00:17:05      阅读:176      评论:0      收藏:0      [点我收藏+]

标签:一个   nbsp   main   beautiful   sage   hid   ges   asc   getpass   

学习python时做了一个爬虫爬取百度贴吧的内容,但是用BeautifulSoup得到的结果使用find_all函数却无法获取。

 

getCommentInfo.py:

技术图片
 1 from urllib import request
 2 import requests
 3 from bs4 import BeautifulSoup
 4 from mylog import MyLog as mylog
 5 import random
 6 
 7 class Item(object):
 8     title = None    #帖子标题
 9     firstAuthor = None  #创建者
10     firstTime = None    #创建时间
11     reNum = None    #总回复数
12     content = None  #最后回复内容
13     lastAuthor = None   #最后回复者
14     lastTime = None     #最后回复时间
15 
16 class GetTiebaInfo(object):
17     def __init__(self,url):
18         self.url = url
19         self.log = mylog()
20         self.pageSum = 5
21         self.urls = self.getUrls(self.pageSum)
22         self.items = self.spider(self.urls)
23         self.pipelines(self.items)
24 
25     def getUrls(self,pageSum):
26         urls = []
27         pns = [str(i*50) for i in range(pageSum)]
28         ul = self.url.split(=)
29         for pn in pns:
30             ul[-1] = pn
31             url = =.join(ul)
32             urls.append(url)
33         self.log.info(u"获取URLS成功 ")
34         return urls
35 
36     def spider(self,urls):
37         items = []
38         for url in urls:
39             htmlContent = self.getResponseContent(url)
40             with open("content.html","w",encoding=utf-8) as f:
41                 f.write(htmlContent)
42             soup = BeautifulSoup(htmlContent,lxml)
43             with open(soup.txt,w,encoding=utf-8) as fp:
44                 fp.write(soup.text)
45 
46             tagsli = soup.find_all(li,attrs={class:j_thread_list clearfix})
47             for tag in tagsli:
48                 item = Item()
49                 item.title = tag.find(a,attrs={class:j_th_tit }).get_text().strip()
50                 item.firstAuthor = tag.find(span,attrs={class:frs-author-name-wrap}).a.get_text().strip()
51                 item.firstTime = tag.find(span,attrs={title:u创建时间.encode(utf-8)}).get_text().strip()
52                 item.reNum = tag.find(span,attrs={title:u回复.encode(utf-8)}).get_text().strip()
53                 item.content = tag.find(div,attrs={class:threadlist_abs threadlist_abs_onlyline }).get_text().strip()
54                 item.lastAuthor = tag.find(span,attrs={class:tb_icon_author_rely j_replyer}).a.get_text().strip()
55                 item.lastTime = tag.find(span,attrs={title:u最后回复时间.encode(utf-8)}).get_text().strip()
56                 items.append(item)
57                 self.log.info(u获取标题为<<%s>>的项成功 ... %item.title)
58         return items
59 
60     def pipelines(self,items):
61         fileName = u百度贴吧_权力的游戏.txt.encode(utf-8)
62         with open(fileName,w) as fp:
63             for item in items:
64                 fp.write(title:%s \t author:%s \t firstTime:%s \n content:%s \n return:%s \n lastAuthor:%s \t lastTime:%s \n\n\n\n
65                          %(item.title.encode(utf-8),item.firstAuthor.encode(utf-8),item.firstTime.encode(utf-8),item.content.encode(utf-8),item.lastTime.encode(utf-8)))
66                 self.log.info(u标题为<<%s>>的项输入到"%s"成功 %(item.title,fileName.decode(utf-8)))
67 
68     def getResponseContent(self,url):
69         header = {
70             Accept: text/heml,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8,
71             Accept-Encoding: gzip,deflate,sdch, Accept-Language: zh_CN,zh;q=0.8,
72             Connect: keep-alive,
73             User-Agent: Mozilla/5.0(Windows NT 6.3;WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/43.0.235
74         }
75         timeout = random.choice(range(80, 180))
76         try:
77             response = requests.get(url,headers = header,timeout = timeout)
78             response.encoding = utf-8
79         except:
80             self.log.error(uPython 返回 URL:%s 数据失败 %url)
81         else:
82             self.log.info(uPython 返回URL:%s 数据成功 %url)
83             return response.content.decode(utf-8)
84 if __name__==__main__:
85     url = uhttp://tieba.baidu.com/f?kw=权力的游戏&ie=utf-8&pn=50
86     GTI = GetTiebaInfo(url)
View Code

 

mylog.py

技术图片
 1 import logging
 2 import getpass
 3 import sys
 4 
 5 #定义MyLog类
 6 class MyLog(object):
 7     def __init__(self):
 8         self.user = getpass.getuser()
 9         self.logger = logging.getLogger(self.user)
10         self.logger.setLevel(logging.DEBUG)
11 
12         #日志文件名
13         self.logFile = sys.argv[0][0:-3] + .log
14         self.formatter = logging.Formatter(%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n)
15 
16         #文件显示到屏幕并输出到日志文件
17         self.logHand = logging.FileHandler(self.logFile,encoding=utf-8)
18         self.logHand.setFormatter(self.formatter)
19         self.logHand.setLevel(logging.DEBUG)
20 
21         self.logHandSt = logging.StreamHandler()
22         self.logHand.setFormatter(self.formatter)
23         self.logHandSt.setLevel(logging.DEBUG)
24 
25         self.logger.addHandler(self.logHand)
26         self.logger.addHandler(self.logHandSt)
27 
28     def debug(self,msg):
29         self.logger.debug(msg)
30 
31     def info(self,msg):
32         self.logger.info(msg)
33 
34     def warn(self,msg):
35         self.logger.warning(msg)
36 
37     def error(self,msg):
38         self.logger.error(msg)
39 
40     def critical(self,msg):
41         self.logger.critical(msg)
42 
43 # if __name__==‘__main__‘:
44 # #     mylog = MyLog()
45 # #     mylog.debug(u"I‘m debug 测试中文")
46 # #     mylog.info("I‘m info")
47 # #     mylog.warn("I‘m warn")
48 # #     mylog.error(u"I‘m error 测试中文")
49 # #     mylog.critical("I‘m critical")
View Code

 

错误:

  在getCommentInfo.py中40行左右的htmlContent可得到原html的正确内容,但经BeautifulSoup后,返回的soup内容变化,导致无法爬取结果。可从两个调式文件content.html和soup.txt得知。

无法解决的问题

标签:一个   nbsp   main   beautiful   sage   hid   ges   asc   getpass   

原文地址:https://www.cnblogs.com/ShadowCharle/p/10739512.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!