码迷,mamicode.com
首页 > 数据库 > 详细

爬取豆瓣古典文学(数据库存储)

时间:2018-06-21 19:44:32      阅读:216      评论:0      收藏:0      [点我收藏+]

标签:html_   cursor   div   size   tin   print   with   indent   close   

 

 

代码如下:

  1 # coding:utf-8
  2 import cPickle
  3 import random
  4 import requests
  5 from lxml import etree
  6 import time
  7 import re
  8 import sys
  9 import codecs
 10 import sqlite3
 11 
 12 class Spider:
 13     def __init__(self):
 14         self.con = sqlite3.connect(rBookInformation.db)
 15         self.cur = self.con.cursor()
 16         self.home = https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6
 17         self.Referer = https://book.douban.com/
 18         self.user_agent_list = []
 19         self.books_list = []
 20         with open(user_agent.txt, rb) as f:
 21             self.user_agent_list = cPickle.load(f)
 22 
 23     def GetHeaders(self):
 24         UserAgent = random.choice(self.user_agent_list)
 25         headers = {Referer: self.Referer, User-Agent: UserAgent}
 26         return headers
 27 
 28     def SaveBook(self,info):
 29         sql = INSERT INTO BookInfo VALUES(?,?,?,?,?)
 30         info_list = (info["Name"],info["Author"],info["Rating"],info["ContentIntro"],info["AuthorIntro"])
 31         self.cur.execute(sql, info_list)
 32         self.con.commit()
 33 
 34     def Crawl(self):
 35         html = requests.get(self.home,headers=self.GetHeaders()).text
 36         html_tree = etree.HTML(html)
 37         booksList = html_tree.xpath(/html/body/div[3]/div[1]/div/div[1]/div/ul/li)
 38         num = 0
 39         for book in booksList:
 40             time.sleep(1)
 41             bookUrl = book.xpath(div[2]/h2/a)[0].get(href)
 42             pageHtml = requests.get(bookUrl,headers=self.GetHeaders()).text
 43             page_tree = etree.HTML(pageHtml)
 44             book_info = self.GetPage(page_tree)
 45             print book_info[Name]
 46             self.SaveBook(book_info)
 47             # self.books_list.append(book_info)
 48             # f = codecs.open(‘text.txt‘,‘a‘,encoding=‘utf-8‘)
 49             # f.write(book_info[‘AuthorIntro‘])
 50             # f.close()
 51             # print book_info[‘AuthorIntro‘]
 52             num = num+1
 53             if num==5:
 54                 break
 55 
 56 
 57     def GetPage(self, page_tree):
 58         book_info = {}
 59         try:
 60             Name = self.GetName(page_tree)
 61             book_info[Name] = Name
 62         except:
 63             book_info[Name] = ‘‘
 64         try:
 65             Author = self.GetAuthor(page_tree)
 66             book_info[Author] = Author
 67         except:
 68             book_info[Author] = ‘‘
 69         try:
 70             Rating = self.GetRating(page_tree)
 71             book_info[Rating] = Rating
 72         except:
 73             book_info[Rating] = ‘‘
 74         try:
 75             ContentIntro = self.GetContentIntro(page_tree)
 76             book_info[ContentIntro] = ContentIntro
 77         except:
 78             book_info[ContentIntro] = ‘‘
 79         try:
 80             AuthorIntro = self.GetAuthorIntro(page_tree)
 81             book_info[AuthorIntro] = AuthorIntro
 82         except:
 83             book_info[AuthorIntro] = ‘‘
 84 
 85 
 86         return book_info
 87 
 88     def GetName(self, page_tree):
 89         return page_tree.xpath(/html/body/div[3]/h1/span)[0].text
 90 
 91     def GetAuthor(self,page_tree):
 92         author_list = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a)
 93         result = ‘‘
 94         if len(author_list) is not 0:
 95             list = []
 96             for author in author_list:
 97                 list.append(author.text.strip())
 98             result = /.join(list)
 99         else:
100             result = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a)[0].text.strip()
101         return re.sub(r\s+, ,result)
102 
103 
104     def GetRating(self, page_tree):
105         return page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong)[0].text.strip()
106 
107     def GetContentIntro(self, page_tree):
108         para_div = page_tree.xpath(//*[@id="link-report"]//div[@class="intro"])
109         result = ‘‘
110         if len(para_div) is not 0:
111             para_para = para_div[len(para_div)-1].xpath(p)
112             for para in para_para:
113                 result = result+\t+para.text+\n
114         return result
115 
116     def GetAuthorIntro(self, page_tree):
117         para_div = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"])
118         result = ‘‘
119         if len(para_div) is not 0:
120             para_para = para_div[len(para_div) - 1].xpath(p)
121             for para in para_para:
122                 result = result + \t + para.text + \n
123         return result
124 
125     # def GetCatalogue(self, page_tree):
126     #     pass
127     #
128     # def GetTag(self, page_tree):
129     #     pass
130     #
131     # def GetShortCommentary(self, page_tree):
132     #     pass
133 
if __name__ == "__main__":
    # Script entry point: build the spider and run one crawl.
    s = Spider()
    try:
        s.Crawl()
    finally:
        # Release the SQLite connection opened in Spider.__init__ even when
        # the crawl fails part-way through.
        s.con.close()

 

爬取豆瓣古典文学(数据库存储)

标签:html_   cursor   div   size   tin   print   with   indent   close   

原文地址:https://www.cnblogs.com/DOLFAMINGO/p/9210568.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!