Node.js——简单小说爬虫实现

时间：2017-04-24 00:13:23 阅读：268 评论：0 收藏：0 [点我收藏+]

标签：blog 模块 exists nbsp 添加 com 文件 end set

 1 //引入模块
 2 const http = require(‘http‘)
 3 const fs = require(‘fs‘)
 4 const cheerio = require(‘cheerio‘)
 5 const iconv = require(‘iconv-lite‘)
 6 //第一章url
 7 const url = ‘http://www.81zw.com/book/8634/745331.html‘
 8 //开始章节数
 9 let i = 1
10 //最大获取章节数
11 let num = 100
12 
/**
 * Entry point for fetching a single chapter page; delegates to startRequest.
 *
 * @param {string} url - Address of the chapter page to fetch.
 */
function main(url) {
    startRequest(url)
}
16 
/**
 * Resolves the "next chapter" href against the page it was found on.
 *
 * @param {string|undefined} link - Raw href attribute value, if any.
 * @param {string} baseUrl - URL of the page that contained the link.
 * @returns {string|null} Absolute URL, or null when there is no usable link.
 */
function resolveNextLink(link, baseUrl) {
    if (!link) {
        return null
    }
    try {
        return new URL(link, baseUrl).href
    } catch (err) {
        console.error(`Ignoring unusable next-chapter link "${link}": ${err.message}`)
        return null
    }
}

/**
 * Downloads one chapter page, extracts its title and paragraphs, hands the
 * record to saveContent, and schedules the following chapter while the
 * module-level counter `i` has not passed `num`.
 *
 * Reads and increments the module-level `i`; calls `trim`, `saveContent`
 * and `main` from this file.
 *
 * @param {string} url - Address of the chapter page to download.
 */
function startRequest(url) {
    http.get(url, res => {
        // Only a 200 response carries a chapter body worth parsing.
        if (res.statusCode !== 200) {
            console.error(`Request for ${url} failed with status ${res.statusCode}`)
            // Drain the body so the connection is released.
            res.resume()
            return
        }
        // Raw Buffer chunks of the response body
        const chunks = []
        res.on('data', (chunk) => {
            chunks.push(chunk)
        })
        res.on('end', () => {
            // The body is decoded as GBK, so join the raw Buffers and decode with iconv-lite.
            const html1 = iconv.decode(Buffer.concat(chunks), 'gbk')
            // cheerio offers a jQuery-like API over the parsed document
            const $ = cheerio.load(html1, {decodeEntities: false})
            const title = $('.bookname h1').text()
            const content = $("#content").html()
            // .html() yields null when the selector matches nothing; stop
            // instead of crashing on content.split below.
            if (content == null) {
                console.error(`No #content element found at ${url}`)
                return
            }
            // Paragraphs are separated by double <br>; strip &nbsp; and edge whitespace from each.
            const arr = content.split('<br><br>').map(elem => trim(elem))
            const bookName = $(".con_top a").eq(2).text()
            // Record persisted by saveContent
            const obj = {
                id: i,
                err: 0,
                bookName: bookName,
                title: title,
                content: arr
            }

            const link = $(".bottem2 a").eq(2).attr('href')
            // Resolve relative to the current page rather than hard-coding the host and book path.
            const nextLink = resolveNextLink(link, url)
            saveContent(obj, nextLink)
            if (nextLink === null) {
                // No further chapter to follow; the current one has already been saved.
                return
            }
            console.log(`第${i + 1}章：${nextLink}`)
            i++
            if (i <= num) {
                // Short pause between requests before fetching the next chapter
                setTimeout(() => {
                    main(nextLink)
                }, 100)
            }
        })
    }).on('error', err => {
        // Without this handler a network failure would be thrown as an uncaught 'error' event.
        console.error(`Request for ${url} failed: ${err.message}`)
    })
}
66 
/**
 * Writes one chapter record to ./data/<bookName>/chapter<id>.json as JSON.
 *
 * @param {{id: number, bookName: string, title: string}} obj - Chapter
 *     record; the whole object is serialized with JSON.stringify.
 * @param {string} [nextLink] - Not used here; kept so existing callers
 *     that pass it continue to work.
 */
function saveContent(obj, nextLink) {
    // obj.id carries the chapter number, so the module-level counter is not needed here.
    console.log(`${obj.id}--${obj.title}`)
    const dir = `./data/${obj.bookName}`
    // recursive: also creates ./data when it is missing, and is a no-op
    // when the directory already exists.
    fs.mkdirSync(dir, {recursive: true})
    // Write the chapter as a JSON file
    fs.writeFile(`${dir}/chapter${obj.id}.json`, JSON.stringify(obj), 'utf-8', err => {
        if (err) throw err
    })
}
78 
/**
 * Removes every "&nbsp;" entity from the string and trims whitespace from
 * both ends.
 *
 * @param {string} str - Text fragment to clean.
 * @returns {string} The cleaned text.
 */
function trim(str) {
    // Strip the entities first: whitespace sitting next to a leading or
    // trailing &nbsp; only becomes edge whitespace once the entity is gone.
    return str.replace(/&nbsp;/g, '').trim()
}
82 
// Start crawling from the first chapter
main(url)

技术分享

生成文件

Node.js——简单小说爬虫实现

标签：blog 模块 exists nbsp 添加 com 文件 end set

原文地址：http://www.cnblogs.com/tgxh/p/6754649.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行