python 爬取豆瓣图书

时间：2018-03-28 14:08:19 阅读：143 评论：0 收藏：0 [点我收藏+]

标签：fst post art 数据 ascii comment value lis 爬虫

#!-*-coding:utf-8-*-
import requests
import xlwt
from bs4 import BeautifulSoup
from collections import OrderedDict


class DouBanBookSpider(object):
    """Scrape the Douban book listing for one tag and export it to Excel.

    Typical use is ``do_spider()`` followed by ``write_excel()``.

    Attributes:
        book_type: Tag name inserted into the listing URL; also used as the
            sheet name and the output file's base name.
        quantity: Number of books to cover; listing pages are requested
            until at least this many entries have been asked for.
        url_list: Listing-page URLs produced by ``get_url()``.
        book_dict: Scraped records keyed by their 1-based sequence number
            (as a string), in scrape order.
        count: Number of records scraped so far.
    """

    # Paging step for the ``start`` query parameter (20 books per page).
    PAGE_SIZE = 20
    # Seconds to wait on a request before giving up instead of hanging.
    REQUEST_TIMEOUT = 10
    # Record keys in the order their values are written as sheet columns.
    # Kept explicit so the column layout never depends on dict ordering.
    COLUMN_KEYS = ('序号', '书名', '评分', '评论数', '作者')

    def __init__(self, book_type, quantity):
        self.book_type = book_type
        self.quantity = quantity
        self.url_list = []
        self.book_dict = OrderedDict()
        self.count = 0

    def get_url(self):
        """Build and return the listing-page URLs covering ``quantity`` books.

        The list is rebuilt on every call, so calling this more than once
        does not accumulate duplicate URLs.

        Returns:
            ``self.url_list``, holding one URL per page with ``start``
            offsets 0, 20, 40, ... strictly below ``quantity``.
        """
        del self.url_list[:]
        start = 0
        # Strictly below quantity: the old ``quantity + 1`` bound requested
        # one extra page whenever quantity was a multiple of the page size.
        while start < self.quantity:
            url = 'https://book.douban.com/tag/%s?start=%d&type=S' % (self.book_type, start)
            self.url_list.append(url)
            start += self.PAGE_SIZE
        return self.url_list

    @staticmethod
    def _text_of(tag):
        """Return the stripped text of ``tag``, or None when ``tag`` is None."""
        if tag is None:
            return None
        return tag.get_text(strip=True)

    def _parse_item(self, item, index):
        """Turn one ``li.subject-item`` element into a record dict.

        Fields whose element is missing from the markup are stored as None
        rather than raising, so one odd entry does not abort the whole run.
        """
        heading = item.h2
        title_link = heading.a if heading is not None else None
        rating_box = item.find('div', class_='star clearfix')
        if rating_box is not None:
            grade = self._text_of(rating_box.find('span', class_='rating_nums'))
            comment_count = self._text_of(rating_box.find('span', class_='pl'))
        else:
            grade = None
            comment_count = None
        return {
            '序号': index,
            '书名': self._text_of(title_link),
            '评分': grade,
            '评论数': comment_count,
            '作者': self._text_of(item.find('div', class_='pub')),
        }

    def main_spider(self, url):
        """Fetch one listing page and add each book on it to ``book_dict``.

        A page without any ``li.subject-item`` element adds nothing.

        Args:
            url: Listing-page URL, as produced by ``get_url()``.
        """
        rsp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        page = BeautifulSoup(rsp.text, 'lxml')
        # Work on the parsed elements directly; re-serialising and
        # re-parsing each item is unnecessary.
        for item in page.find_all('li', class_='subject-item'):
            self.count += 1
            self.book_dict[str(self.count)] = self._parse_item(item, self.count)

    def do_spider(self):
        """Scrape every page returned by ``get_url()`` in order."""
        for url in self.get_url():
            self.main_spider(url)

    def write_excel(self):
        """Write the scraped records to ``<book_type>.xls``.

        Row 0 holds a bold header; each record is written to the row equal
        to its sequence number. xlwt produces the legacy ``.xls`` format,
        so the file is saved with that extension rather than ``.xlsx``.
        """
        # utf-8 rather than ascii: the header and the data are Chinese text.
        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet(self.book_type)
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        style.font = font
        row0 = ['序号', '书名', '评分', '评论数', '出版信息']
        for col, title in enumerate(row0):
            ws.write(0, col, title, style)
        for key, record in self.book_dict.items():
            row = int(key)
            for col, field in enumerate(self.COLUMN_KEYS):
                ws.write(row, col, record.get(field))
        wb.save('%s.xls' % self.book_type)


if __name__ == "__main__":
    # Scrape about 2000 books under the "中国历史" (Chinese history) tag,
    # then export the collected records to a spreadsheet.
    ds = DouBanBookSpider('中国历史', 2000)
    ds.do_spider()
    ds.write_excel()

python 爬取豆瓣图书

标签：fst post art 数据 ascii comment value lis 爬虫

原文地址：https://www.cnblogs.com/LouisZJ/p/8663166.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行