码迷,mamicode.com
首页 > 其他好文 > 详细

记一次针对excel导出的优化

时间:2021-04-26 13:58:43      阅读:0      评论:0      收藏:0      [点我收藏+]

标签:trace   服务器   each   contains   sso   win   mod   压缩   input   

最近发现我们系统导出excel文件时由于是导出百万级数据导出,速度过慢并且内存占用多,故进行了下面的一次优化。
我们使用apache的poi进行excel文件操作
主要耗时:
1、从数据库得到需要导出的结果集耗时
2、将数据写入excel耗时

优化前

public abstract class BaseExcelOutputVo {
    public BaseExcelOutputVo() {
    }

    public abstract String[] toExcelHeaders();

    public abstract Object[] toExcelData();

    public abstract String getExcelTitle();
}

public class ExcelUtils {

    private static Logger log = LoggerFactory.getLogger(ExcelUtils.class);
    /**
     * 未分页数据导出
     * @param request
     * @param response
     * @param dataList
     * @param downloadName
     * @param fileName
     * @param pageSize  每个excel文件最大记录数
     */
    public static <T extends BaseExcelOutputVo> void excelExport(HttpServletRequest request, HttpServletResponse response, List<T> dataList,String downloadName,String fileName,int pageSize){
        //List<List<T>> list = Lists.partition(dataList,pageSize);
        Stopwatch sw = Stopwatch.createStarted();
        List<List<T>> list = new ArrayList<>();

        Iterator<T> it = dataList.iterator();
        for(int pages = dataList.size()/pageSize +1 ; pages>0; pages--){
            List<T> dataPage = new ArrayList<>();
            for(int num = 0 ;num < pageSize ; num++){
                if(it.hasNext()){
                dataPage.add(it.next());}
                else{
                    break;
                }
            }
            list.add(dataPage);
        }
        int size = dataList.size();
        dataList.clear();

        excelExport(request,response,list,downloadName,fileName);
        log.trace("excel导出记录{}条,耗时{}",size,sw.stop().elapsed(TimeUnit.MILLISECONDS));

    }
        /**
         * 将已经分好页的数据,按照分页导出多个excel,并打包为压缩包响应
         * @param request
         * @param response
         * @param dataList 用于生成excel的数据,不同list生成不同excel文件
         * @param downloadName 压缩包名字,以.zip结尾
         * @param fileName excel文件名字前缀,不需要格式名
         */
    public static <T extends BaseExcelOutputVo> void excelExport(HttpServletRequest request, HttpServletResponse response, List<List<T>> dataList,String downloadName,String fileName){
        List<Workbook> excelList = new ArrayList<>();

        for(List<T> data :dataList){
            if(data.size()>0)
            excelList.add(createExcel(data));
        }
        if(excelList.size() > 0 ) {
            try {
                downloadFiles(request, response, downloadName, fileName, excelList);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static <T extends BaseExcelOutputVo> Workbook createExcel(List<T> data) {
        if(data.size()>0) {
            XSSFWorkbook workbook = null;
            try {
                workbook = (XSSFWorkbook) Class.forName("org.apache.poi.xssf.usermodel.XSSFWorkbook").newInstance();

                //指定 sheet 的名字
                Sheet sheet = workbook.createSheet(data.get(0).getExcelTitle());

                // 列数
                int cols = data.get(0).toExcelHeaders().length;
                //复制一行用于设置header
                data.add(0,data.get(0));
                int rows = data.size();
                int index = 0;

                //设置列头样式
                XSSFCellStyle headerStyle = workbook.createCellStyle();
                headerStyle.setAlignment(HSSFCellStyle.ALIGN_CENTER); // 居中格式
                headerStyle.setFillForegroundColor(IndexedColors.SKY_BLUE.getIndex());// 背景色
                headerStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND); //加粗

                for (int rowNum = 0; rowNum < rows; rowNum++) {
                    Row row = sheet.createRow(rowNum);

                    for (int colNum = 0; colNum < cols; colNum++) {
                        Cell cell = row.createCell(colNum);
                        if(rowNum == 0){
                            cell.setCellValue(data.get(0).toExcelHeaders()[colNum]);
                            cell.setCellStyle(headerStyle);
                        }else {
                            Object obj = data.get(index).toExcelData()[colNum];
                            if(obj instanceof Date) {
                                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
                                cell.setCellValue(sdf.format((Date) obj));
                            }else if(obj instanceof Integer){
                                cell.setCellValue((Integer)obj);
                            }else if(obj instanceof Double){
                                cell.setCellValue((Double)obj);
                            }else if(obj instanceof  Long){
                                cell.setCellValue((Long)obj);
                            }else{
                                cell.setCellValue((String)obj);
                            }
                        }
                        sheet.autoSizeColumn(colNum);
                        sheet.setColumnWidth(colNum,sheet.getColumnWidth(colNum)*17/10);
                    }

                    index++;
                }
                return workbook;
            } catch (InstantiationException e) {
                e.printStackTrace();
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            }

        }
        return null;
    }

    private static void downloadFiles(HttpServletRequest request, HttpServletResponse response,String downloadName,String fileName,List<Workbook> excelList) throws Exception {

        //响应头的设置
        response.reset();
        response.setCharacterEncoding("utf-8");
        response.setContentType("multipart/form-data");

        //设置压缩包的名字
        //解决不同浏览器压缩包名字含有中文时乱码的问题
        String agent = request.getHeader("USER-AGENT");
        try {
            if (agent.contains("MSIE")||agent.contains("Trident")) {
                downloadName = java.net.URLEncoder.encode(downloadName, "UTF-8");
            } else {
                downloadName = new String(downloadName.getBytes("UTF-8"),"ISO-8859-1");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        response.setHeader("Content-Disposition", "attachment;fileName=\"" + downloadName + "\"");

        //设置压缩流:直接写入response,实现边压缩边下载
        ZipOutputStream zipos = null;
        try {
            zipos = new ZipOutputStream(new BufferedOutputStream(response.getOutputStream()));
            zipos.setMethod(ZipOutputStream.DEFLATED); //设置压缩方法
        } catch (Exception e) {
            e.printStackTrace();
        }

        //循环将文件写入压缩流
        DataOutputStream os = null;
        for(int i = 0; i < excelList.size(); i++ ){

            InputStream in = null;
            try{
                //将excel文件转为输入流
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                excelList.get(i).write(out);
                byte [] bookByteAry = out.toByteArray();
                in = new ByteArrayInputStream(bookByteAry);

                //添加ZipEntry,并ZipEntry中写入文件流
                //这里,加上i是防止要下载的文件有重名的导致下载失败
                zipos.putNextEntry(new ZipEntry( fileName + i +".xlsx"));
                os = new DataOutputStream(zipos);

                byte[] b = new byte[100];
                int length = 0;
                while((length = in.read(b))!= -1){
                    os.write(b, 0, length);
                }
                in.close();
                zipos.closeEntry();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        //关闭流
        try {
            os.flush();
            os.close();
            zipos.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}

原有的导出工具类测试情况

excel导出记录5条,耗时574 ms
查询耗时 334 ms
导出总耗时908 ms

excel导出记录501条,耗时23638 ms
查询耗时 476 ms
导出总耗时24114 ms

数据转excel时间234429 ms
 完成响应耗时234516 ms
excel导出记录5000条,耗时234516 ms
导出总耗时235518 ms

excel导出记录10001条,耗时460735 ms
查询耗时 2766 ms
导出总耗时463501 ms

发现主要耗时集中在生成excel文件上,随着导出记录数增加该比重明显上升,1万记录时已经占了99.5%

注意XSSFWorkbook这个类是个坑
不知道各位有没发现一个奇怪的事情,500条记录的导出也花费了20S左右的时间。按理说500条只是一个很小的数目,20S左右怎么也能导上万条记录了。为此,我对每个row的生成时间打印了出来,发现开始时写的时间是10ms/行,500行时慢慢增加到100ms/行,对于内容相近的每行写入时间增加了有10倍,感觉是因为不停插入数据导致excel的对象变大了的原因?改用SXSSFWorkbook后写速度稳定在5ms/行左右。SXSSFWorkbook相对与XSSFWorkbook来讲,使用了滑动窗口的方式,只保存设置的最大记录数在内存中(默认是100行),其他的记录将会被保存在一个临时文件当中,以此来降低大数据量操作时的内存占用,写入完成后需要调用dispose()方法将临时文件删除。并且经过测试SXSSFWorkbook的单个cell写入耗时只有XSSFWorkbook的1/10,对于不需要随机读取以前数据的程序来讲,SXSSFWorkbook是一个更好的选择。

每行三列同样长度的随机字符串写入workbook中,XSSFWorkbook与SXSSFWorkbook测试对比

StopWatch ‘col0‘: running time = 4033300 ns
StopWatch ‘col1‘: running time = 74000 ns
StopWatch ‘col2‘: running time = 70200 ns
StopWatch ‘row0‘: running time = 5570700 ns

StopWatch ‘col0‘: running time = 47800 ns
StopWatch ‘col1‘: running time = 53200 ns
StopWatch ‘col2‘: running time = 38400 ns
StopWatch ‘row1‘: running time = 216300 ns

StopWatch ‘col0‘: running time = 35000 ns
StopWatch ‘col1‘: running time = 39900 ns
StopWatch ‘col2‘: running time = 36200 ns
StopWatch ‘row2‘: running time = 205200 ns

StopWatch ‘col0‘: running time = 25600 ns
StopWatch ‘col1‘: running time = 19900 ns
StopWatch ‘col2‘: running time = 24200 ns
StopWatch ‘row3‘: running time = 141700 ns

StopWatch ‘col0‘: running time = 23600 ns
StopWatch ‘col1‘: running time = 21700 ns
StopWatch ‘col2‘: running time = 24300 ns
StopWatch ‘row4‘: running time = 185200 ns

StopWatch ‘col0‘: running time = 19500 ns
StopWatch ‘col1‘: running time = 21000 ns
StopWatch ‘col2‘: running time = 24900 ns
StopWatch ‘row5‘: running time = 357100 ns

StopWatch ‘col0‘: running time = 23900 ns
StopWatch ‘col1‘: running time = 25400 ns
StopWatch ‘col2‘: running time = 26400 ns
StopWatch ‘row6‘: running time = 503600 ns

StopWatch ‘col0‘: running time = 23600 ns
StopWatch ‘col1‘: running time = 21900 ns
StopWatch ‘col2‘: running time = 27000 ns
StopWatch ‘row7‘: running time = 157700 ns

StopWatch ‘col0‘: running time = 63300 ns
StopWatch ‘col1‘: running time = 21600 ns
StopWatch ‘col2‘: running time = 24100 ns
StopWatch ‘row8‘: running time = 193900 ns

StopWatch ‘col0‘: running time = 20700 ns
StopWatch ‘col1‘: running time = 21100 ns
StopWatch ‘col2‘: running time = 24300 ns
StopWatch ‘row9‘: running time = 137000 ns

StopWatch ‘sxssf‘: running time = 565115800 ns
---------------------------------------------
ns         %     Task name
---------------------------------------------
565115800  100%  

StopWatch ‘col0‘: running time = 17104200 ns
StopWatch ‘col1‘: running time = 597400 ns
StopWatch ‘col2‘: running time = 371600 ns
StopWatch ‘row0‘: running time = 26082100 ns

StopWatch ‘col0‘: running time = 315700 ns
StopWatch ‘col1‘: running time = 429100 ns
StopWatch ‘col2‘: running time = 1941400 ns
StopWatch ‘row1‘: running time = 2899200 ns

StopWatch ‘col0‘: running time = 422900 ns
StopWatch ‘col1‘: running time = 660600 ns
StopWatch ‘col2‘: running time = 327400 ns
StopWatch ‘row2‘: running time = 1623800 ns

StopWatch ‘col0‘: running time = 343100 ns
StopWatch ‘col1‘: running time = 289400 ns
StopWatch ‘col2‘: running time = 275700 ns
StopWatch ‘row3‘: running time = 1072900 ns

StopWatch ‘col0‘: running time = 275900 ns
StopWatch ‘col1‘: running time = 258700 ns
StopWatch ‘col2‘: running time = 613400 ns
StopWatch ‘row4‘: running time = 1330700 ns

StopWatch ‘col0‘: running time = 899900 ns
StopWatch ‘col1‘: running time = 264500 ns
StopWatch ‘col2‘: running time = 235400 ns
StopWatch ‘row5‘: running time = 1572100 ns

StopWatch ‘col0‘: running time = 224000 ns
StopWatch ‘col1‘: running time = 219000 ns
StopWatch ‘col2‘: running time = 248200 ns
StopWatch ‘row6‘: running time = 826200 ns

StopWatch ‘col0‘: running time = 306100 ns
StopWatch ‘col1‘: running time = 272700 ns
StopWatch ‘col2‘: running time = 305300 ns
StopWatch ‘row7‘: running time = 1047000 ns

StopWatch ‘col0‘: running time = 240100 ns
StopWatch ‘col1‘: running time = 246600 ns
StopWatch ‘col2‘: running time = 278600 ns
StopWatch ‘row8‘: running time = 921000 ns

StopWatch ‘col0‘: running time = 248300 ns
StopWatch ‘col1‘: running time = 239700 ns
StopWatch ‘col2‘: running time = 245300 ns
StopWatch ‘row9‘: running time = 869600 ns

StopWatch ‘xssf‘: running time = 44311200 ns
---------------------------------------------
ns         %     Task name
---------------------------------------------
044311200  100%  

上述excel工具类有两个问题

  • 导出的耗时明显的长
  • 无论是使用哪一个excelExport入口,它都需要应用先把所有要导出的数据查出放到List中,这样当数据量去到几百万的时候会占用大量应用内存

优化一

为了避免多个线程同时写一个文件出现问题,目前采取了N条记录一个文件,每个文件一个线程写的设计。分sheet操作感觉也能尝试,至于同一个sheet的操作因为poi不是线程安全的,网上我看过几个demo好像都是会报错,不建议使用。

以下是使用多线程写文件改造后核心代码,这里最大线程数最多设置为电脑可用线程数2,测试过3或者*8速度会更快,不过这个功能不是核心功能,所以就不占用太多资源

     static final int nThreads = Runtime.getRuntime().availableProcessors();
    public static <T extends BaseExcelOutputVo> List<Workbook> createExcel_v2(List<List<T>> data) {

        CountDownLatch countDownLatch = new CountDownLatch(data.size());
        List<Workbook> excelList = new CopyOnWriteArrayList<>();
        if (data.size() > 0) {
            ExecutorService executor = Executors.newFixedThreadPool(data.size() < nThreads*2 ? data.size() : nThreads*2);
            data.forEach(list -> executor.execute(new pageTask(countDownLatch, list, excelList)));
        }
        try {
            countDownLatch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        return excelList;
    }

   static class pageTask<T extends BaseExcelOutputVo> implements Runnable {
        private CountDownLatch countDownLatch;
        private List<T> data;
        private List<SXSSFWorkbook> excelList;

        public pageTask(CountDownLatch countDownLatch, List<T> data, List<SXSSFWorkbook> excelList) {
            this.countDownLatch = countDownLatch;
            this.data = data;
            this.excelList = excelList;
        }
        
 		public pageTask(List<T> data, List<SXSSFWorkbook> excelList){
           
            this.data = data;
            this.excelList = excelList;
        }


        @Override
        public void run() {
            try {
                Stopwatch sw = Stopwatch.createStarted();
                SXSSFWorkbook workbook = new SXSSFWorkbook(100);
                //设置列头样式
                CellStyle headerStyle = workbook.createCellStyle();
                headerStyle.setAlignment(CellStyle.ALIGN_CENTER); // 居中格式
                headerStyle.setFillForegroundColor(IndexedColors.SKY_BLUE.getIndex());// 背景色
                headerStyle.setFillPattern(CellStyle.SOLID_FOREGROUND); //加粗

                SXSSFSheet sheet = workbook.createSheet(data.get(0).getExcelTitle());

                CellStyle textStyle = workbook.createCellStyle();
                DataFormat format = workbook.createDataFormat();
                textStyle.setDataFormat(format.getFormat("@"));

                // 列数
                int cols = data.get(0).toExcelHeaders().length;
                //复制一行用于设置header
                data.add(0, data.get(0));
                int rows = data.size();
                int index = 0;

                for (int rowNum = 0; rowNum < rows; rowNum++) {
                    SXSSFRow row = sheet.createRow(rowNum);

                    for (int colNum = 0; colNum < cols; colNum++) {
                        Cell cell = row.createCell(colNum);
                        if (rowNum == 0) {
                            cell.setCellValue(data.get(0).toExcelHeaders()[colNum]);
                            cell.setCellStyle(headerStyle);
                        } else {
                            cell.setCellStyle(textStyle);
                            cell.setCellType(XSSFCell.CELL_TYPE_STRING);
                            Object obj = data.get(index).toExcelData()[colNum];
                            if (obj instanceof Date) {
                                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
                                cell.setCellValue(sdf.format((Date) obj));
                            } else if (obj instanceof Integer) {
                                cell.setCellValue(String.valueOf(obj));
                            } else if (obj instanceof Double) {
                                cell.setCellValue(String.valueOf(obj));
                            } else if (obj instanceof Long) {
                                cell.setCellValue(String.valueOf(obj));
                            } else {
                                cell.setCellValue((String) obj);
                            }
                        }
                    }
                    //log.trace("设置行{}完成,耗时{}",index,rowNum,sw.elapsed(TimeUnit.MILLISECONDS));
                    index++;

                }
                for (int colNum = 0; colNum < cols; colNum++) {
                    sheet.trackAllColumnsForAutoSizing();
                    sheet.autoSizeColumn(colNum);
                    sheet.setColumnWidth(colNum, sheet.getColumnWidth(colNum) * 17 / 10);
                }
                excelList.add(workbook);
                //workbook.dispose();
                log.trace("线程{}创建excel完成,耗时{}", Thread.currentThread().getName(), sw.stop().elapsed(TimeUnit.MILLISECONDS));
            } catch (Exception e) {
                log.trace("线程名{}data大小{}data是否为空{}", Thread.currentThread().getName(), data.size(), data == null);
                e.printStackTrace();
            } finally {
                if (countDownLatch != null) {
                    countDownLatch.countDown();
                }
            //这里处理完后清空数据列表,让它可以回收,之前没释放导出的时候堆一直往上涨的厉害,释放后少了大概25%
			data.clear();
            data = null;
            }
        }

    }

测试情况,500行一个文件

线程pool-12-thread-7创建excel完成,耗时716
线程pool-12-thread-5创建excel完成,耗时733
线程pool-12-thread-6创建excel完成,耗时734
线程pool-12-thread-1创建excel完成,耗时750
线程pool-12-thread-11创建excel完成,耗时756
线程pool-12-thread-9创建excel完成,耗时776
...
excel导出记录5500条,耗时1564
导出总耗时3074

线程pool-12-thread-10创建excel完成,耗时646 ms
线程pool-12-thread-3创建excel完成,耗时648 ms
线程pool-12-thread-6创建excel完成,耗时668 ms
线程pool-12-thread-7创建excel完成,耗时678 ms
线程pool-12-thread-5创建excel完成,耗时682 ms
线程pool-12-thread-9创建excel完成,耗时684 ms
线程pool-12-thread-4创建excel完成,耗时687 ms
线程pool-12-thread-1创建excel完成,耗时707 ms
线程pool-12-thread-2创建excel完成,耗时716 ms
线程pool-12-thread-8创建excel完成,耗时721 ms
excel导出记录5000条,耗时1479 ms
导出总耗时2673 ms

线程pool-13-thread-1创建excel完成,耗时109
excel导出记录500条,耗时181
导出总耗时436

测试情况,5000行一个文件

线程pool-15-thread-2创建excel完成,耗时35
线程pool-15-thread-1创建excel完成,耗时61
excel导出记录5500条,耗时204
导出总耗时1341

线程pool-14-thread-1创建excel完成,耗时56
excel导出记录5000条,耗时137
导出总耗时1103

线程pool-12-thread-1创建excel完成,耗时538
excel导出记录500条,耗时708
导出总耗时999

注意不要图快开大量线程把cpu占满了,这样其他请求这机器就处理不了了,而且当线程数超过当前服务器可用线程数时,线程切换频繁,速度也会变慢。
计算最大线程数方法参考

生成excel文件速度优化后,还存在一个问题:这里用于导出的数据是一次性加载到内存中的,当数据量大的时候会占用大量内存,需改用读一部分写一部分的形式。

优化二

采用了边加载边写excel文件的形式减少内存占用,类似于生产者消费者模型,使用队列进行数据的临时存储及交换,生产者线程不停将要写入的数据从数据库取出,经处理后存入队列中,消费者线程即excel线程管理类启动后不停从队列中取数据,当数据积累够指定数量或者所有要写excel的数据已经从队列中取出时,将这些数据交给写excel线程进行写入。大概流程如下:
技术图片
消费者类

    public static class ExportThread<T extends BaseExcelOutputVo> implements Runnable {

        private LinkedBlockingQueue<T> queue;
        private boolean isEnd = false;
        private List<T> list = new CopyOnWriteArrayList<>();
        List<Workbook> excelList;
        HttpServletRequest request;
        HttpServletResponse response;
        String downloadName;
        String fileName;
        ExecutorService executor;

        public ExportThread(LinkedBlockingQueue<T> queue,HttpServletRequest request, HttpServletResponse response, String downloadName, String fileName) {
            this.queue = queue;
            this.request = request;
            this.response = response;
            this.downloadName = downloadName;
            this.fileName = fileName;
        }

        public void setEnd(boolean end) {
            isEnd = end;
        }

        @Override
        public void run() {
            try {
                excelList = new CopyOnWriteArrayList<>();
                executor = Executors.newFixedThreadPool(nThreads * 2);

                while (!isEnd || queue.size() > 0) {
                    while (queue.size() > 0) {
                        list.add(queue.poll(60 * 1000, TimeUnit.MILLISECONDS));
                         if(list.size() == SINGLE_FILE_SIZE){
                            break;
                        }
                    }
                    if (list.size() > 0 && (list.size() == SINGLE_FILE_SIZE || (isEnd && queue.size() ==0))) {
                        executor.execute(new pageTask(new CopyOnWriteArrayList(list), excelList));
                        list.clear();
                    }
                }
                executor.shutdown();
                while(!executor.isTerminated()){

                }
                //log.trace("生成excel完成");
                if (excelList.size() > 0) {
                    try {
                        downloadFiles(request, response, downloadName, fileName, excelList);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

        }

    }

测试情况
文件分页 5000&查询分页 5000

数据量 耗时ms
一千 1162
十万 18579
一百万 611380

一百万的数据量时单次查询会越来越慢,是mysql分页查询里面limit偏移量变大的影响。

如果这样优化后导出还是需要十秒以上,为了避免导出数据太多导致页面响应超时,可以采用异步的形式,后端接收到导出请求并检查参数正确后返回成功,再提供一个接口用来查询该次导出是否已完成,完成再返回文件,未完成则需要页面过段时间再次调用查询接口检查。

记一次针对excel导出的优化

标签:trace   服务器   each   contains   sso   win   mod   压缩   input   

原文地址:https://www.cnblogs.com/zmj-pr/p/14701170.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!