标签:
数据库创建脚本:
/*
Navicat MySQL Data Transfer

Source Server         : localhost
Source Server Version : 50621
Source Host           : localhost:3306
Source Database       : cmm-db

Target Server Type    : MYSQL
Target Server Version : 50621
File Encoding         : 65001

Date: 2015-05-25 19:05:58
*/

-- Disable foreign-key checks while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for `region`
-- ----------------------------
-- One row per administrative division; `parent_code` holds the `area_code`
-- prefix of the enclosing division as written by the crawler.
DROP TABLE IF EXISTS `region`;
CREATE TABLE `region` (
  `area_code` varchar(50) NOT NULL COMMENT '地区编码',
  `area_name` varchar(50) NOT NULL COMMENT '地区名称',
  `parent_code` varchar(50) NOT NULL COMMENT '地区父编码',
  `place_order` varchar(50) NOT NULL DEFAULT '0' COMMENT '显示顺序',
  PRIMARY KEY (`area_code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT;

-- ----------------------------
-- Records of region
-- ----------------------------
maven依赖:
<!-- MySQL JDBC driver used by the crawler to write rows into the region table. -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.18</version>
</dependency>
<!-- jsoup HTML parser used to fetch and select rows from the code-table pages. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.2</version>
</dependency>
源代码:
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls the hierarchical administrative-division code pages published under a
 * base URL (province, city, county, town, village) with jsoup and inserts every
 * division it finds into the MySQL {@code region} table, one row per division.
 *
 * <p>All work is done by static methods; {@link #main(String[])} drives the
 * whole crawl depth-first and prints each inserted division to standard output.
 */
public class JsoupRegion {

    /** Regular expression for an integer: optional minus sign, no leading zero, not zero itself. */
    private static final String V_INTEGER = "^-?[1-9]\\d*$";

    /**
     * {@link #V_INTEGER} compiled once. {@link #Integer(String)} is called for
     * every scraped link and table cell, so recompiling per call is wasteful.
     */
    private static final Pattern INTEGER_PATTERN = Pattern.compile(V_INTEGER);

    /** Timeout, in milliseconds, handed to jsoup for every page request. */
    private static final int FETCH_TIMEOUT_MILLIS = 600 * 1000;

    /** Index in a link's {@code href} at which the division code starts (after a 2-character directory and "/"). */
    private static final int HREF_CODE_OFFSET = 3;

    /** JDBC URL of the target database. */
    private static final String JDBC_URL =
            "jdbc:mysql://localhost:3306/cmm-db?useUnicode=true&characterEncoding=utf-8&autoReconnect=true";

    /** Parameterized insert statement for one division row. */
    private static final String INSERT_SQL =
            "insert into region(area_code,area_name,parent_code,place_order) values(?,?,?,?) ";

    /**
     * Single enclosing instance shared by every {@link Region} created by this
     * class. {@code Region} is kept as an inner (non-static) class so existing
     * {@code outer.new Region()} callers keep compiling; sharing one owner
     * avoids allocating a throwaway {@code JsoupRegion} per scraped division.
     */
    private static final JsoupRegion REGION_OWNER = new JsoupRegion();

    /**
     * Mutable value holder for one administrative division.
     */
    class Region {

        /** Division code as scraped (2 to 12 digits depending on level). */
        private String code;

        /** Display name of the division. */
        private String name;

        /** Town/village classification code; empty for the higher levels. */
        private String type;

        /** Code of the enclosing division as a number; 0 for provinces. */
        private int parentId;

        public String getCode() {
            return code;
        }

        public void setCode(String code) {
            this.code = code;
        }

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public int getParentId() {
            return parentId;
        }

        public void setParentId(int parentId) {
            this.parentId = parentId;
        }

        public String getType() {
            return type;
        }

        public void setType(String type) {
            this.type = type;
        }
    }

    /**
     * Fetches a URL and parses it into a jsoup document.
     *
     * <p>NOTE(review): the original author recorded that a plain
     * {@code Jsoup.connect(url).get()} was answered with HTTP 403; the long
     * timeout below is the variant that was kept.
     *
     * @param url address of the page to download
     * @return the parsed document
     * @throws IOException if the page cannot be fetched
     */
    public static Document url2Doc(String url) throws IOException {
        return Jsoup.connect(url).timeout(FETCH_TIMEOUT_MILLIS).get();
    }

    /**
     * Tells whether a string is an integer in the sense of {@link #V_INTEGER}.
     *
     * @param value the string to test; {@code null} is treated as "not an integer"
     * @return {@code true} if the whole string matches the integer pattern,
     *         {@code false} otherwise
     */
    public static boolean Integer(String value) {
        return value != null && INTEGER_PATTERN.matcher(value).matches();
    }

    /**
     * Appends text to a file using a {@link FileWriter} opened in append mode.
     * I/O failures are reported with {@code printStackTrace()} and otherwise
     * ignored, so this method never throws {@link IOException}.
     *
     * @param fileName path of the file to append to
     * @param content  text to append
     */
    public static void appendFile(String fileName, String content) {
        FileWriter writer = null;
        try {
            // The second constructor argument selects append mode.
            writer = new FileWriter(fileName, true);
            writer.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds a populated {@link Region} attached to the shared owner instance.
     */
    private static Region newRegion(String code, String name, String type, int parentId) {
        Region region = REGION_OWNER.new Region();
        region.setCode(code);
        region.setName(name);
        region.setType(type);
        region.setParentId(parentId);
        return region;
    }

    /**
     * Reads the provinces from the index page.
     *
     * <p>Every link inside a row of class {@code provincetr} becomes one
     * province; its code is the first two characters of the link's
     * {@code href} and its parent id is 0.
     *
     * @param url address of the index page
     * @return the provinces in page order
     * @throws IOException if the page cannot be fetched
     */
    private static List<Region> getProvince(String url) throws IOException {
        List<Region> provinces = new ArrayList<Region>();
        Document doc = url2Doc(url);
        for (Element row : doc.getElementsByAttributeValue("class", "provincetr")) {
            for (Element link : row.select("a[href]")) {
                String code = link.attr("href").substring(0, 2);
                String name = link.text().replace("<br />", "");
                provinces.add(newRegion(code, name, "", 0));
            }
        }
        return provinces;
    }

    /**
     * Shared scraper for the city, county and town pages, which differ only in
     * the row class and in how many digits the codes have.
     *
     * <p>Each row carries links whose text is either the numeric code or the
     * name; the numeric ones are skipped and the code is taken from the name
     * link's {@code href} instead. Assumes the {@code href} has the form
     * {@code "dd/<code>.html"} - TODO confirm against the live pages; a shorter
     * {@code href} makes {@code substring} throw.
     *
     * @param url          address of the page to scrape
     * @param rowClass     value of the {@code class} attribute marking data rows
     * @param codeLength   number of characters of the division code in the href
     * @param parentLength number of leading code characters that form the parent code
     * @return the divisions in page order
     * @throws IOException if the page cannot be fetched
     */
    private static List<Region> getLinkedRegions(String url, String rowClass, int codeLength, int parentLength)
            throws IOException {
        List<Region> regions = new ArrayList<Region>();
        Document doc = url2Doc(url);
        for (Element row : doc.getElementsByAttributeValue("class", rowClass)) {
            for (Element link : row.select("a[href]")) {
                String text = link.text();
                if (Integer(text)) {
                    // This link shows the numeric code; the name link of the same row is used instead.
                    continue;
                }
                String code = link.attr("href").substring(HREF_CODE_OFFSET, HREF_CODE_OFFSET + codeLength);
                int parentId = Integer.parseInt(code.substring(0, parentLength));
                regions.add(newRegion(code, text, "", parentId));
            }
        }
        return regions;
    }

    /**
     * Reads the cities of one province (rows of class {@code citytr});
     * codes are 4 characters, the first 2 being the province code.
     *
     * @param url address of the province page
     * @return the cities in page order
     * @throws IOException if the page cannot be fetched
     */
    private static List<Region> getCity(String url) throws IOException {
        return getLinkedRegions(url, "citytr", 4, 2);
    }

    /**
     * Reads the counties of one city (rows of class {@code countytr});
     * codes are 6 characters, the first 4 being the city code.
     *
     * @param url address of the city page
     * @return the counties in page order
     * @throws IOException if the page cannot be fetched
     */
    private static List<Region> getCounty(String url) throws IOException {
        return getLinkedRegions(url, "countytr", 6, 4);
    }

    /**
     * Reads the towns of one county (rows of class {@code towntr});
     * codes are 9 characters, the first 6 being the county code.
     *
     * @param url address of the county page
     * @return the towns in page order
     * @throws IOException if the page cannot be fetched
     */
    private static List<Region> getTown(String url) throws IOException {
        return getLinkedRegions(url, "towntr", 9, 6);
    }

    /**
     * Reads the villages of one town (rows of class {@code villagetr}).
     *
     * <p>Village rows have no links, so each cell is classified by content:
     * a 3-digit integer is the classification type, a longer integer is the
     * village code (its first 9 characters form the parent id), anything else
     * is the name.
     *
     * @param url address of the town page
     * @return the villages in page order
     * @throws IOException if the page cannot be fetched
     */
    private static List<Region> getVillage(String url) throws IOException {
        List<Region> villages = new ArrayList<Region>();
        Document doc = url2Doc(url);
        for (Element row : doc.getElementsByAttributeValue("class", "villagetr")) {
            for (Element tr : row.select("tr")) {
                Region region = REGION_OWNER.new Region();
                for (Element cell : tr.select("td")) {
                    String value = cell.text();
                    boolean numeric = Integer(value);
                    if (numeric && value.length() == 3) {
                        region.setType(value);
                    } else if (numeric && value.length() > 3) {
                        region.setCode(value);
                        region.setParentId(Integer.parseInt(value.substring(0, 9)));
                    } else {
                        // Only non-code cells reach here, so the type code can no
                        // longer end up in the name regardless of column order.
                        region.setName(value);
                    }
                }
                villages.add(region);
            }
        }
        return villages;
    }

    /**
     * Placeholder kept for API compatibility; intentionally does nothing.
     */
    public static void insertDb() {
    }

    /**
     * Loads the MySQL driver and opens a new connection to {@link #JDBC_URL}.
     *
     * @return a freshly opened connection; the caller must close it
     * @throws ClassNotFoundException if the MySQL driver class is not on the classpath
     * @throws SQLException           if the connection cannot be established
     */
    private static Connection connection() throws ClassNotFoundException, SQLException {
        Class.forName("com.mysql.jdbc.Driver");
        return DriverManager.getConnection(JDBC_URL, "root", "root");
    }

    /**
     * Inserts one division into the {@code region} table in its own
     * transaction, using a connection opened and closed by this call.
     *
     * <p>The division code is stored both as {@code area_code} and as
     * {@code place_order}; the region's type is not persisted. The statement
     * and connection are always closed, and the transaction is rolled back if
     * it could not be committed.
     *
     * @param region the division to store; must not be {@code null}
     * @throws NullPointerException   if {@code region} is {@code null}
     * @throws ClassNotFoundException if the MySQL driver class is not on the classpath
     * @throws SQLException           if connecting, inserting or committing fails
     */
    public static void insertRegion(Region region) throws ClassNotFoundException, SQLException {
        if (region == null) {
            // Fail before a connection is opened rather than after.
            throw new NullPointerException("region");
        }
        Connection conn = connection();
        boolean committed = false;
        try {
            conn.setAutoCommit(false);
            PreparedStatement pst = conn.prepareStatement(INSERT_SQL);
            try {
                pst.setString(1, region.getCode());
                pst.setString(2, region.getName());
                pst.setString(3, String.valueOf(region.getParentId()));
                pst.setString(4, region.getCode());
                pst.executeUpdate();
            } finally {
                pst.close();
            }
            conn.commit();
            committed = true;
        } finally {
            try {
                if (!committed) {
                    conn.rollback();
                }
            } catch (SQLException ignored) {
                // The failure that prevented the commit is already propagating;
                // a rollback error must not replace it.
            } finally {
                conn.close();
            }
        }
    }

    /**
     * Crawls the 2013 code tables depth-first (province, city, county, town,
     * village), inserting and printing every division as soon as it is read.
     *
     * @param args ignored
     * @throws IOException            if a page cannot be fetched
     * @throws ClassNotFoundException if the MySQL driver class is not on the classpath
     * @throws SQLException           if an insert fails
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/";
        // Province codes that are already crawled; seed this list to skip
        // provinces when resuming an interrupted run.
        List<String> done = new ArrayList<String>();
        List<Region> province = getProvince(url + "index.html");
        for (Region regionProvince : province) {
            String provinceCode = regionProvince.getCode();
            if (done.contains(provinceCode)) {
                continue;
            }
            insertRegion(regionProvince);
            System.out.println(provinceCode + regionProvince.getName());
            // Pages below province level live in nested two-digit directories.
            String provinceDir = url + provinceCode + "/";
            for (Region regionCity : getCity(url + provinceCode + ".html")) {
                insertRegion(regionCity);
                System.out.println(regionCity.getCode() + "||" + regionCity.getName());
                List<Region> county = getCounty(provinceDir + regionCity.getCode() + ".html");
                for (Region regionCounty : county) {
                    insertRegion(regionCounty);
                    System.out.println(regionCounty.getCode() + "||" + regionCounty.getName());
                    String cityDir = provinceDir + regionCity.getCode().substring(2, 4) + "/";
                    List<Region> town = getTown(cityDir + regionCounty.getCode() + ".html");
                    for (Region regionTown : town) {
                        insertRegion(regionTown);
                        System.out.println(regionTown.getCode() + "||" + regionTown.getName());
                        String countyDir = cityDir + regionCounty.getCode().substring(4, 6) + "/";
                        List<Region> village = getVillage(countyDir + regionTown.getCode() + ".html");
                        for (Region regionVillage : village) {
                            insertRegion(regionVillage);
                            System.out.println(regionVillage.getCode() + "||" + regionVillage.getName());
                        }
                    }
                }
            }
            done.add(provinceCode);
        }
    }
}
标签:
原文地址:http://my.oschina.net/ydsakyclguozi/blog/419740