码迷,mamicode.com
首页 > 编程语言 > 详细

Java使用Jsoup获得新闻联播所有文字稿

时间:2019-03-14 00:21:59      阅读:242      评论:0      收藏:0      [点我收藏+]

标签:exists   hand   txt   文字   dep   repos   select   new   close   

Jsoup的Maven坐标:

        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <!-- jsoup: provides the org.jsoup classes imported by the Java code below. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

Java代码:

package com.zifeiy.test;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads the article pages linked from the index pages of www.xwlbo.com
 * and concatenates their paragraphs into a single local HTML file.
 *
 * <p>The program runs in two phases: {@link #getUrlList()} collects article
 * links from every index page, then {@link #solve(String)} is called once per
 * link to append that article's title and {@code <p>} elements to the output.
 */
public class XinwenGetter {

    /** Highest numbered index page, i.e. {@code txt_44.html}. */
    private static final int LAST_INDEX_PAGE = 44;

    /** Article URLs gathered by {@link #getUrlList()}, in discovery order. */
    private static List<String> urlList = new ArrayList<String>();

    /** Writer for the output file; only valid while {@link #main(String[])} is running. */
    private static OutputStreamWriter out;

    /**
     * Fills {@link #urlList} with the links found inside the first element of
     * class {@code xwlist} on each index page.
     *
     * @throws IOException if an index page cannot be fetched
     */
    private static void getUrlList() throws IOException {
        // Start at 0 so that the un-numbered first page (txt.html) is fetched too;
        // starting at 1 made the i == 0 branch unreachable.
        for (int i = 0; i <= LAST_INDEX_PAGE; i++) {
            String url;
            if (i == 0) {
                url = "http://www.xwlbo.com/txt.html";
            } else {
                url = "http://www.xwlbo.com/txt_" + i + ".html";
            }
            Document doc = Jsoup.connect(url).get();
            Elements xwlistElements = doc.getElementsByClass("xwlist");
            if (xwlistElements.isEmpty()) {
                // A page without the expected list should not abort the whole crawl.
                System.err.println("no \"xwlist\" element on " + url + ", skipping");
                continue;
            }
            for (Element element : xwlistElements.get(0).select("a")) {
                // Resolve relative links against the page URL so that
                // Jsoup.connect() can fetch them later.
                String resUrl = element.absUrl("href");
                if (resUrl.isEmpty()) {
                    resUrl = element.attr("href");
                }
                // Skip anchors without a target and links already collected
                // (the same article may be listed on more than one index page).
                if (!resUrl.isEmpty() && !urlList.contains(resUrl)) {
                    urlList.add(resUrl);
                }
            }
        }
    }

    /**
     * Fetches one article page and appends its title, every {@code <p>} inside
     * the first element of class {@code text_content}, and an {@code <hr>}
     * separator to {@link #out}.
     *
     * @param url address of the article page
     * @throws IOException if the page cannot be fetched or the output cannot be written
     */
    private static void solve(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();
        System.out.println("handling " + doc.title() + " ...");
        Elements textElements = doc.getElementsByClass("text_content");
        if (textElements.isEmpty()) {
            // Not an article page in the expected layout; skip it instead of
            // failing with IndexOutOfBoundsException.
            System.err.println("no \"text_content\" element on " + url + ", skipping");
            return;
        }
        out.write("<h3>" + doc.title() + "</h3>\r\n");
        for (Element pElement : textElements.get(0).select("p")) {
            out.write(pElement.toString() + "\r\n");
        }
        out.write("<hr>\r\n");
    }

    /**
     * Collects all article URLs, then writes every article into
     * {@code D:/新闻联播大全.html} (UTF-8), replacing any previous file content.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched or the file cannot be written
     */
    public static void main(String[] args) throws IOException {
        getUrlList();

        File file = new File("D:/新闻联播大全.html");
        // Opening in truncate mode replaces the old delete-then-append sequence,
        // and try-with-resources closes the file even if an article fails.
        try (OutputStreamWriter writer =
                new OutputStreamWriter(new FileOutputStream(file, false), "UTF-8")) {
            out = writer;
            for (String url : urlList) {
                solve(url);
            }
        } finally {
            out = null;
        }
    }

}

Java使用Jsoup获得新闻联播所有文字稿

标签:exists   hand   txt   文字   dep   repos   select   new   close   

原文地址:https://www.cnblogs.com/zifeiy/p/10527621.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!