标签:city print parse exception insert stp get tcl window
一、代码演示
如果爬取中途中断,可在代码中筛选并跳过已拉取的省份数据,然后重新执行
/** * TODO * * @author kevin * @createTime 2019-11-18 19:37 */ @RestController public class CityController { @Autowired private ProvinceService provinceService; @Autowired private HttpUtil httpUtil; private String yearHref = ""; private int index; // {"provincetr", "citytr", "countytr", "towntr", "villagetr"}; @GetMapping("/start") public ResultTemplate<String> spider() throws Exception { String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/"; String charset = "gb2312"; Document rootDoc = httpUtil.get(url, charset); if (rootDoc == null) { return of("fail"); } Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0); // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html yearHref = firstElement.select("a").get(0).attr("href"); // 最近一个年份的省份链接 Document doc = httpUtil.get(yearHref, charset); // 遍历所有的省 Elements provinceElements = doc.getElementsByClass("provincetr"); for (Element element : provinceElements) { Elements aEles = element.select("a"); for (Element aEle : aEles) { String name = aEle.text(); // 11.html String provincesHref = aEle.attr("href"); String code = provincesHref.substring(0, provincesHref.indexOf(".")); index = yearHref.lastIndexOf("/") + 1; // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html provincesHref = yearHref.substring(0, index) + provincesHref; DicProvince province = new DicProvince() .setProvinceName(name) .setProvinceCode(code) .setCountryId(1196612453660643329L) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); if ("北京市".equals(name) || "天津市".equals(name) || "河北省".equals(name)) { System.out.println("未执行市:" + name); } else { System.out.println("开始时间:" + LocalDateTime.now()); System.out.println("省名称:" + name); Long id = provinceService.insertProvince(province); getCites(provincesHref, charset, id); } } } return of("spider crawl end."); } private void getCites(String url, String charset, Long provinceId) throws Exception { Document rootDoc = null; 
int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("citytr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("a").get(1); // 第二个是市的名字 String name = aEle.text(); // 11/1101.html String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); cityHref = yearHref.substring(0, index) + cityHref; DicCity city = new DicCity() .setCityName(name) .setCityCode(code) .setProvinceId(provinceId) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCity(city); //Long id=1L; getDistrict(cityHref, charset, id); } } } // 区县 private void getDistrict(String url, String charset, Long idDis) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("countytr"); for (Element cityElement : cityElements) { try { Element aEle = cityElement.select("a").get(1); String name = aEle.text(); String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertDistrict(district); //Long id=1L; getStreet(cityHref, charset, id); } catch 
(Exception e) { System.out.println("市辖区"); Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1); String name = aEle2.text(); DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis); Long id = provinceService.insertDistrict(district); System.out.println("执行完毕"); } } } } // 街道 private void getStreet(String url, String charset, Long idStr) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("towntr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("a").get(1); // 第二个是市的名字 String name = aEle.text(); String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicStreet street = new DicStreet() .setStreetName(name) .setStreetCode(code) .setDistrictId(idStr) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertStreet(street); //Long id=1L; getCommunity(cityHref, charset, id); } } } // 社区 private void getCommunity(String url, String charset, Long idPro) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("villagetr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element 
aEle2 = cityElement.select("td").get(1); String cl_code = aEle2.text(); Element aEle3 = cityElement.select("td").get(2); String name = aEle3.text(); DicCommunity community = new DicCommunity() .setCommunityName(name) .setCommunityCode(code) .setClassificationCode(cl_code) .setStreetId(idPro) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCommunity(community); } } } }
二、HttpUtil工具类
/** * TODO * * @author kevin * @createTime 2019-11-20 9:17 */ @Component public class HttpUtil { public Document get(String url, String charset) throws IOException { String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"; URL url2 = new URL(url); HttpURLConnection connection = (HttpURLConnection)url2.openConnection(); connection.setRequestMethod("GET"); //是否允许缓存,默认true。 connection.setUseCaches(Boolean.FALSE); //设置请求头信息 connection.addRequestProperty("Connection", "close"); connection.addRequestProperty("user-agent", userAgent); //设置连接主机超时(单位:毫秒) connection.setConnectTimeout(80000); //设置从主机读取数据超时(单位:毫秒) connection.setReadTimeout(80000); //开始请求 try { Document doc = Jsoup.parse(connection.getInputStream(), charset, url); return doc; } catch (Exception e) { System.out.println("parse error: " + url); } return null; } }
三、service部分,根据需要自行定义数据库表
/**
 * {@link ProvinceService} implementation that stores provinces, cities, districts,
 * streets and communities through their mappers.
 *
 * <p>Each insert is retried a bounded number of times. An insert counts as
 * successful when the mapper reports exactly one affected row. If every attempt
 * fails, an {@link IllegalStateException} is thrown so that a permanent failure
 * (constraint violation, database down) surfaces instead of retrying forever.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Maximum number of attempts for a single insert. */
    private static final int MAX_INSERT_ATTEMPTS = 5;

    @Autowired
    private ProvinceMapper provinceMapper;
    @Autowired
    private CityMapper cityMapper;
    @Autowired
    private DistrictMapper districtMapper;
    @Autowired
    private StreetMapper streetMapper;
    @Autowired
    private CommunityMapper communityMapper;

    /**
     * Inserts a province.
     *
     * @param dicProvince the province to store
     * @return the province id read from the entity after the insert
     *         (presumably filled in by the mapper; confirm against the mapper configuration)
     * @throws IllegalStateException if the insert did not succeed after all attempts
     */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry(() -> provinceMapper.insert(dicProvince), "插入省数据失败");
        return dicProvince.getProvinceId();
    }

    /**
     * Inserts a city.
     *
     * @param dicCity the city to store
     * @return the city id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed after all attempts
     */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry(() -> cityMapper.insert(dicCity), "插入市数据失败");
        return dicCity.getCityId();
    }

    /**
     * Inserts a district/county.
     *
     * @param dicDistrict the district to store
     * @return the district id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed after all attempts
     */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry(() -> districtMapper.insert(dicDistrict), "插入区县数据失败");
        return dicDistrict.getDistrictId();
    }

    /**
     * Inserts a street/town.
     *
     * @param dicStreet the street to store
     * @return the street id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed after all attempts
     */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry(() -> streetMapper.insert(dicStreet), "插入街道数据失败");
        return dicStreet.getStreetId();
    }

    /**
     * Inserts a community/village.
     *
     * @param dicCommunity the community to store
     * @return the community id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed after all attempts
     */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry(() -> communityMapper.insert(dicCommunity), "插入社区数据失败");
        return dicCommunity.getCommunityId();
    }

    /** A single insert attempt that reports the number of affected rows. */
    @FunctionalInterface
    private interface InsertCall {
        int run() throws Exception;
    }

    /**
     * Runs {@code insert} until it reports exactly one affected row, at most
     * {@link #MAX_INSERT_ATTEMPTS} times.
     *
     * @param insert         the insert attempt to run
     * @param failureMessage message printed on every failed attempt and used in the
     *                       exception when all attempts failed
     * @throws IllegalStateException if no attempt succeeded; the last exception
     *         raised by {@code insert}, if any, is attached as the cause
     */
    private static void insertWithRetry(InsertCall insert, String failureMessage) {
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                if (insert.run() == 1) {
                    return;
                }
            } catch (Exception e) {
                lastFailure = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
        }
        throw new IllegalStateException(
                failureMessage + " (gave up after " + MAX_INSERT_ATTEMPTS + " attempts)", lastFailure);
    }
}
基于【 springBoot+jsoup】一 || 爬取全国行政区划数据
标签:city print parse exception insert stp get tcl window
原文地址:https://www.cnblogs.com/kevin-ying/p/11925782.html