北京市信件内容爬虫实例——首都之窗

时间：2020-02-16 01:18:23 阅读：120 评论：0 收藏：0 [点我收藏+]

标签：cdn 代码 info headers 存储 find dtd array integer

网址:http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow

Python爬虫代码：

import requests
import re
import xlwt
# #https://flightaware.com/live/flight/CCA101/history/80
url = ‘http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=AH20021300174‘
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}
def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(‘获取网页成功‘)
            return response.text
        else:
            print(‘获取网页失败‘)
    except Exception as e:
        print(e)
f = xlwt.Workbook(encoding=‘utf-8‘)
sheet01 = f.add_sheet(u‘sheet1‘, cell_overwrite_ok=True)
sheet01.write(0, 0, ‘序号‘)  # 第一行第一列
sheet01.write(0, 1, ‘问‘)  # 第一行第二列
sheet01.write(0, 2, ‘来信人‘)  # 第一行第三列
sheet01.write(0, 3, ‘时间‘)  # 第一行第四列
sheet01.write(0, 4, ‘网友同问‘)  # 第一行第五列
sheet01.write(0, 5, ‘问题内容‘)  # 第一行第六列
sheet01.write(0, 6, ‘答‘)  # 第一行第七列
sheet01.write(0, 7, ‘答复时间‘)  # 第一行第八列
sheet01.write(0, 8, ‘答复内容‘)  # 第一行第九列
fopen = open(‘C:\\Users\\hp\\Desktop\\list.txt‘, ‘r‘)
lines = fopen.readlines()
temp=0
temp1=0
urls = [‘http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId={}‘.format(line) for line in lines]
for url in urls:
     print(url.replace("\n", ""))
     page = get_page(url.replace("\n", ""))
     items = re.findall(‘<div class="col-xs-10 col-sm-10 col-md-10 o-font4 my-2"><strong>(.*?)</strong></div>.*?<div class="col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted ">(.*?)</div>.*?<div class="col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted ">(.*?)</div>.*?<div class="col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted ">(.*?)<label.*?">(.*?)</label>.*? <div class="col-xs-12 col-md-12 column p-2 text-muted mx-2">(.*?)</div>.*?<div class="col-xs-9 col-sm-7 col-md-5 o-font4 my-2">.*?<strong>(.*?)</div>.*?<div class="col-xs-12 col-sm-3 col-md-3 my-2 ">(.*?)</div>.*?<div class="col-xs-12 col-md-12 column p-4 text-muted my-3">(.*?)</div>‘,page,re.S)
     print(items)
     print(len(items))
     for i in range(len(items)):
        sheet01.write(temp + i + 1, 0,temp+1)
        sheet01.write(temp + i + 1, 1,items[i][0].replace( ‘ ‘,‘‘).replace(‘ ‘,‘‘))
        sheet01.write(temp + i + 1, 2, items[i][1].replace(‘来信人：\r\n\t      \t\r\n        \t\r\n\t        \t‘,‘‘).replace( ‘\r\n        \t\r\n          ‘,‘‘))
        sheet01.write(temp + i + 1, 3, items[i][2].replace(‘时间：‘,‘‘))
        sheet01.write(temp + i + 1, 4, items[i][4].replace(‘\r\n\t\t        \t\r\n\t                \t‘,‘‘).replace(‘\r\n\t                \r\n\t                \r\n                ‘,‘‘))
        sheet01.write(temp + i + 1, 5,items[i][5].replace(‘\r\n\t    \t‘, ‘‘).replace(‘\r\n\t    ‘, ‘‘).replace(‘<p>‘,‘‘).replace(‘   ‘,‘‘).replace(‘ ‘,‘‘).replace(‘  ‘,‘‘).replace(‘    ‘,‘‘))
        sheet01.write(temp + i + 1, 6,items[i][6].replace(‘</strong>‘, ‘‘))
        sheet01.write(temp + i + 1, 7,items[i][7])
        sheet01.write(temp + i + 1, 8, items[i][8].replace(‘\r\n\t\t\t\t\t\t\t\t‘,‘‘).replace(‘    ‘,‘‘).replace(‘\r\n\t\t\t\t\t\t\t‘,‘‘).replace(‘<p>‘,‘‘).replace(‘   ‘,‘‘).replace(‘ ‘,‘‘).replace(‘  ‘,‘‘))
     temp+=len(items)
     temp1+=1
     print("总爬取完毕数量："+str(temp1))
     print("打印完！！！")
     f.save(‘letter.xls‘)

C:\\Users\\hp\\Desktop\\list.txt‘：这个文件是所有信件内容网址的后缀：

技术图片

如果需要请加Q：893225523

将爬虫结果主要分为：序号、问、时间、网友同问、问题内容、答、答复时间、答复内容

存储到Execl:

技术图片

然后为了方便清洗数据，首先让execl表格转换为csv(自动以逗号间隔)，然后用记事本打开，复制粘贴到txt文件中，通过FileZilla上传到虚拟机里。

特别注意，一定要在execl表里面选中所有单元格，将所有内容的英文字段改为中文字段，以防止阻碍数据清洗。

技术图片

这是最终的txt文件：

技术图片

打开虚拟机：将文件上传到mapreduce，开始进行数据清洗。

技术图片

package letter_count;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import test4.quchong;
import test4.quchong.doMapper;
import test4.quchong.doReducer;

public class letter {
	public static class doMapper extends Mapper<Object,Text,Text,IntWritable>{
		public static final IntWritable one = new IntWritable(1);
		public static Text word = new Text();
		@Override
		protected void map(Object key, Text value, Context context)
		throws IOException,InterruptedException {
		//StringTokenizer tokenizer = new StringTokenizer(value.toString(),"  ");
			   String[] strNlist = value.toString().split(",");
			  // String str=strNlist[3].trim();
			   String str2=strNlist[3].substring(0,4);
			   System.out.println(str2);
			// Integer temp= Integer.valueOf(str);
		word.set(str2);
		//IntWritable abc = new IntWritable(temp);
		context.write(word,one);
		}
		}
		public static class doReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
			private IntWritable result = new IntWritable();
			@Override
			protected void reduce(Text key,Iterable<IntWritable> values,Context context)
			throws IOException,InterruptedException{
			int sum = 0;
			for (IntWritable value : values){
			sum += value.get();
			}
			result.set(sum);
			context.write(key,result);
			}
			}
		public static void main(String[] args) throws IOException,ClassNotFoundException,InterruptedException {
			Job job = Job.getInstance();
			job.setJobName("letter");
			job.setJarByClass(quchong.class);
			job.setMapperClass(doMapper.class);
			job.setReducerClass(doReducer.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			Path in = new 
                        Path("hdfs://localhost:9000/JCC/letter_final.txt");
			Path out = new Path("hdfs://localhost:9000/JCC/out1");
			FileInputFormat.addInputPath(job,in);
			FileOutputFormat.setOutputPath(job,out);
			System.exit(job.waitForCompletion(true) ? 0 : 1);
		}
}

　　清洗完的数据：

技术图片

然后打开虚拟机数据库：

技术图片

输入建表语句：

create table letter
    -> (
    -> year varchar(255),
    -> num int(11));

txt 导入mysql,首先将准备导入的文件放到mysql_files目录下，（/var/lib/mysql-files）然后确保自己的local_infile是“ON”状态。
SHOW VARIABLES LIKE ‘%local%‘;（set global local_infile=‘ON‘;）可以开启。最后导入LOAD DATA INFILE ‘/var/lib/mysql-files/part-r-00000‘ INTO TABLE stage1 fields terminated by ‘,‘;
（show variables like ‘%secure%‘;）来查看自己mysql-files的路径。

将letter_final文件复制到/var/lib/mysql-files目录下：

技术图片

格式为：（以逗号间隔，方便存入mysql）

技术图片

输入代码：

 LOAD DATA INFILE ‘/var/lib/mysql-files/letter_final‘ INTO  TABLE letter fields terminated by ‘,‘;

　　技术图片

然后我们创建web，开始进行可视化。

图表的html代码：

<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Insert title here</title>
<script src="https://cdn.staticfile.org/echarts/4.3.0/echarts.min.js"></script>
<script src="js/jquery-1.5.1.js"></script>
</head>
<body>
<div id="main" style="width: 1000px;height:400px;background-color:white;"></div>
<script type="text/javascript">
var myChart = echarts.init(document.getElementById(‘main‘));
myChart.showLoading(); 
//6.ajax发起数据请求
var nums = []; //建立一个销量数组（实际用来盛放Y坐标值）
var names = []; //建立一个销量数组（实际用来盛放Y坐标值）
$.ajax({
    type : "post",
    async : true, //异步请求（同步请求将会锁住浏览器，其他操作须等请求完成才可执行）
    url : "servlet?method=find2", //请求发送到TestServlet
    data : {},
    dataType : "json", //返回数据形式为json

    //7.请求成功后接收数据name+num两组数据
    success : function(result) {
        //result为服务器返回的json对象
        if (result) {
            //8.取出数据存入数组
            //alert(result.length);
            for (var i = 0; i < result.length; i++) {
                nums.push(result[i].num); //迭代取出类别数据并填入类别数组
                names.push(result[i].year);
            }
            //alert(nums);
            myChart.hideLoading();
            
option = {
	    xAxis: {
	        type: ‘category‘,
	        data: names
	    },
	    yAxis: {
	        type: ‘value‘
	    },
	    series: [{
	        data: nums,
	        type: ‘bar‘
	    }]
	};
myChart.setOption(option, true);
        }}})
</script>
</body>
</html>

　　dao层:

package com.letter.dao;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import com.letter.entity.entity;;
public class dao {
	public List<entity> list1()
	{
		int sum=0;
		   List<entity> list =new ArrayList<entity>();
	        try {
	            // 加载数据库驱动，注册到驱动管理器
	            Class.forName("com.mysql.jdbc.Driver");
	            // 数据库连接字符串
	            String url = "jdbc:mysql://localhost:3306/mysql?useUnicode=true&characterEncoding=utf-8";
	            // 数据库用户名
	            String username = "root";
	            // 数据库密码
	            String password = "";
	            // 创建Connection连接
	            Connection conn = DriverManager.getConnection(url, username,
	                    password);
	            // 添加图书信息的SQL语句
	            String sql = "select * from letter";
	            // 获取Statement
	            Statement statement = conn.createStatement();
	  
	            ResultSet resultSet = statement.executeQuery(sql);
	            while (resultSet.next()) {
                   entity book = new entity();
                	   //sum=sum+resultSet.getInt("round");
                   book.setYear(resultSet.getString("year"));
                   book.setNum(resultSet.getInt("Num"));
   	                list.add(book);
	            }
	            resultSet.close();
	            statement.close();
	            conn.close();
	}catch (Exception e) {
	    e.printStackTrace();
	}
	        return list;
}
}

　　servlet代码:

package com.letter.servlet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;

import com.google.gson.Gson;
import com.letter.entity.entity;
import com.letter.dao.*;
/**
 * Servlet implementation class servlet
 */
@WebServlet("/servlet")
public class servlet extends HttpServlet {
	private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public servlet() {
        super();
        // TODO Auto-generated constructor stub
    }
dao dao1=new dao();
	/**
	 * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
	 */
    protected void service(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
		request.setCharacterEncoding("utf-8");
		String method=request.getParameter("method");
		if("view".equals(method))
		{
			view(request, response);
		}else if("find2".equals(method))
		{
			find2(request, response);
		}
    }
    private void view(HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException {
  		request.setCharacterEncoding("utf-8");
  		String buy_nbr=request.getParameter("buy_nbr");
		HttpSession session=request.getSession();
       	session.setAttribute("userInfo", buy_nbr);
  	request.getRequestDispatcher("left.jsp").forward(request, response);
      }

    private void find2(HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException {
  		request.setCharacterEncoding("utf-8");
  		 List<entity> list =new ArrayList<entity>();
  		HttpSession session=request.getSession();
		String buy_nbr=(String) session.getAttribute("userInfo");
  		 entity book = new entity();
  		List<entity> list2=dao1.list1();
  		System.out.println(list2.size());
//  		String buy_nbr=(String) session.getAttribute("userInfo");
//  		System.out.println(buy_nbr);
  		Gson gson2 = new Gson();
        String json = gson2.toJson(list2);
       // System.out.println(json);
       // System.out.println(json.parse);
	 response.setContentType("textml; charset=utf-8");
        response.getWriter().write(json);

      }
}

　架构:

技术图片

运行结果:

技术图片

那么就完整的展现出来了从网站分析爬取----》数据清洗----》数据可视化。

北京市信件内容爬虫实例——首都之窗

标签：cdn 代码 info headers 存储 find dtd array integer

原文地址：https://www.cnblogs.com/jccjcc/p/12315405.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行