简单爬虫从指定地址下载网站内容

时间：2017-06-27 23:26:54 阅读：263 评论：0 收藏：0 [点我收藏+]

标签：void ace file 字节数组 connect htm 网络资源 tac 文件存储

Http01App.java 
1. 使用了多线程、IO 流和 java.net（网络包）。

package main;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
/**
 * Minimal crawler example: downloads a single HTML page on a background
 * thread, saves it to disk as UTF-8 and echoes it to standard output.
 *
 * Created by lxj-pc on 2017/6/27.
 */
public class Http01App {
    public static void main(String[] args) {
        String url = "http://tuijian.hao123.com:80/index.html";
        // Run the download on a worker thread; the task stores the page
        // under its dirPath/fileName fields.
        new Thread(new DownloadHtmlTask(url)).start();
    }

    /**
     * Downloads the page at {@code url}, prints progress while reading, then
     * writes the page to {@code dirPath/fileName} and prints it to stdout.
     */
    static class DownloadHtmlTask implements Runnable {
        private final String url;
        String fileName = "hao123.html";
        String dirPath = "d:/lxj";

        public DownloadHtmlTask(String url) {
            this.url = url;
        }

        /**
         * Performs the download. I/O failures (including a malformed URL) are
         * reported on stderr; nothing is written unless the server answers
         * with HTTP 200.
         */
        @Override
        public void run() {
            HttpURLConnection conn = null;
            try {
                conn = (HttpURLConnection) new URL(url).openConnection();

                // Check the status before touching the body: for error
                // statuses getInputStream() throws instead of returning data.
                int status = conn.getResponseCode();
                if (status != HttpURLConnection.HTTP_OK) {
                    System.err.println("Download of " + url + " failed: HTTP " + status);
                    return;
                }

                byte[] bytes = readBody(conn);
                String htmlContent = new String(bytes, StandardCharsets.UTF_8);
                writerFile(htmlContent, dirPath, fileName);
                System.out.println(htmlContent);
            } catch (IOException e) {
                // MalformedURLException is an IOException, so it lands here too.
                e.printStackTrace();
            } finally {
                if (conn != null) {
                    conn.disconnect();
                }
            }
        }

        /**
         * Reads the whole response body into memory, printing the running
         * byte count and, when the total length is known, a percentage.
         *
         * @param conn connection whose response status has already been checked
         * @return the raw response bytes
         * @throws IOException if reading from the connection fails
         */
        private static byte[] readBody(HttpURLConnection conn) throws IOException {
            // -1 when the server sends no Content-Length (e.g. chunked transfer).
            int contentLength = conn.getContentLength();
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] buffer = new byte[20 * 1024]; // read at most 20 KiB per call
            long curLen = 0; // long so that curLen * 100 cannot overflow

            try (InputStream is = conn.getInputStream()) {
                int len;
                while ((len = is.read(buffer)) != -1) {
                    baos.write(buffer, 0, len);
                    curLen += len;
                    System.out.println(curLen + " " + contentLength);
                    // A percentage is only meaningful for a known, positive
                    // total; this also avoids dividing by zero.
                    if (contentLength > 0) {
                        // The escaped label is the Chinese text for
                        // "download progress"; escapes keep this source
                        // independent of the compiler's -encoding setting.
                        System.out.println("\u4e0b\u8f7d\u8fdb\u5ea6"
                                + (curLen * 100 / contentLength) + "%");
                    }
                }
            }
            return baos.toByteArray();
        }

        /**
         * Stores {@code htmlContent} in {@code dirPath/fileName}. Delegates to
         * {@link Http01App#outputFile} so both writers share one
         * implementation (UTF-8, directory creation, guaranteed close).
         */
        private void writerFile(String htmlContent, String dirPath, String fileName) {
            outputFile(htmlContent, dirPath, fileName);
        }
    }

    /**
     * Writes {@code content} as UTF-8 to the file {@code fileName} inside
     * {@code dirPath}, creating the directory (and missing parents) first.
     * Failures are reported on stderr rather than thrown.
     *
     * @param content  text to write
     * @param dirPath  target directory; created if it does not exist
     * @param fileName name of the file inside {@code dirPath}
     */
    public static void outputFile(String content, String dirPath, String fileName) {
        File dir = new File(dirPath);
        // mkdirs() returns false when the directory already exists, so
        // re-check with isDirectory() before treating it as a failure.
        if (!dir.mkdirs() && !dir.isDirectory()) {
            System.err.println("Cannot create directory: " + dir);
            return;
        }
        try (FileOutputStream out = new FileOutputStream(new File(dir, fileName))) {
            out.write(content.getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

简单爬虫从指定地址下载网站内容

标签：void ace file 字节数组 connect htm 网络资源 tac 文件存储

原文地址：http://www.cnblogs.com/lxj666/p/7087481.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

简单爬虫 从指定地址下载网站内容

简单爬虫从指定地址下载网站内容