标签:script not found out org hdf final array ons jar
功能说明:从 HDFS 读取 Excel 文件,使用 POI 解析后将每一行转换为以逗号分隔的文本,并写入 HDFS 上的 txt 文件
一、引入 jar 包(Maven 依赖)
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.14</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.14</version>
</dependency>
二、代码实现
package operator.excel;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
/**
 * Command-line entry point that converts an Excel workbook stored on HDFS
 * into a plain-text file on HDFS, one output line per parsed row.
 *
 * @ClassName: ExcelInputFormat
 * @Description: reads an Excel file from HDFS and writes its rows as text lines back to HDFS
 * @Author: mashiwei
 * @Date: 2017/6/30
 */
public class ExcelInputFormat {
    /**
     * Opens {@code /kettle/excel/test.xls}, parses it with
     * {@link ExcelParser#parseExcelData(InputStream, File)} and writes every
     * returned row, followed by a newline, to {@code /kettle/excel/test.txt}.
     *
     * @param args unused
     * @throws IOException if the input cannot be opened or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        Configuration config = new Configuration();
        String output = "/kettle/excel/test.txt";
        String input = "/kettle/excel/test.xls";
        FileSystem fileSystem = FileSystem.get(config);

        // Parse first and only then create the output, so a missing or
        // unreadable input does not leave an empty output file behind.
        String[] strArrayofLines;
        InputStream inputStream = fileSystem.open(new Path(input));
        try {
            // The File is never opened locally; the parser only reads its
            // path to pick the workbook format from the extension.
            strArrayofLines = ExcelParser.parseExcelData(inputStream, new File(input));
        } finally {
            inputStream.close();
        }

        FSDataOutputStream out = fileSystem.create(new Path(output));
        try {
            for (String str : strArrayofLines) {
                System.out.println("------" + str);
                // writeBytes() drops the high byte of each char and writeUTF()
                // prepends a 2-byte length, so neither produces valid text.
                // Write real UTF-8 bytes and a bare newline instead.
                out.write(str.getBytes("UTF-8"));
                out.write('\n');
            }
        } finally {
            out.close();
        }
    }
}
package operator.excel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Static helpers that turn the first sheet of an Excel workbook into
 * comma-separated text lines.
 *
 * @ClassName: ExcelParser
 * @Description: parses the first sheet of an .xls/.xlsx workbook into one comma-separated string per row
 * @Author: mashiwei
 * @Date: 2017/6/30
 */
public class ExcelParser {
    private static final Log logger = LogFactory.getLog(ExcelParser.class);

    /**
     * Validates that {@code file} is non-null and that its absolute path ends
     * with {@code xls} or {@code xlsx}. The suffix comparison is
     * case-sensitive and does not require a leading dot. The file itself is
     * never opened, and its existence on disk is not checked.
     *
     * @param file the file whose path is inspected
     * @throws FileNotFoundException if {@code file} is {@code null}
     * @throws IOException if the path does not end with an Excel suffix
     */
    public static void checkFile(File file) throws IOException {
        // A null reference is reported as "file does not exist".
        if (null == file) {
            logger.error("文件不存在!");
            throw new FileNotFoundException("文件不存在!");
        }
        String fileName = file.getAbsolutePath();
        // Reject anything that does not look like an Excel workbook.
        if (!fileName.endsWith("xls") && !fileName.endsWith("xlsx")) {
            logger.error(fileName + "不是excel文件");
            throw new IOException(fileName + "不是excel文件");
        }
    }

    /**
     * Parses the first sheet of the workbook read from {@code is}.
     *
     * <p>Each row becomes one string in which the boolean, numeric and string
     * cell values are joined with {@code ","}; cells of any other type are
     * skipped. A row without any such cell yields an empty string.
     *
     * <p>If reading the workbook fails with an {@link IOException}, the error
     * is logged and the rows collected so far (possibly none) are returned.
     * The stream {@code is} is not closed by this method.
     *
     * @param is   stream supplying the workbook bytes
     * @param file only its path is used, to choose between the {@code xls}
     *             and {@code xlsx} formats
     * @return one comma-separated string per row of the first sheet
     * @throws IllegalArgumentException if {@code file} is {@code null} or its
     *         path does not end with {@code xls}/{@code xlsx}
     */
    public static String[] parseExcelData(InputStream is, File file) {
        // Validate before touching the file; previously a failed check was
        // only printed and the method then crashed with a NullPointerException.
        try {
            checkFile(file);
        } catch (IOException e) {
            throw new IllegalArgumentException(e.getMessage(), e);
        }
        String fileName = file.getAbsolutePath();

        List<String> resultList = new ArrayList<String>();
        Workbook workbook = null;
        try {
            if (fileName.endsWith("xls")) {
                // Excel 97-2003 binary format
                workbook = new HSSFWorkbook(is);
            } else {
                // checkFile() guarantees the only other suffix is xlsx (Excel 2007+)
                workbook = new XSSFWorkbook(is);
            }
            Sheet sheet = workbook.getSheetAt(0);
            for (Row row : sheet) {
                resultList.add(joinCells(row));
            }
        } catch (IOException e) {
            logger.error("IO Exception : failed to read workbook " + fileName, e);
        } finally {
            if (workbook != null) {
                try {
                    workbook.close();
                } catch (IOException e) {
                    logger.warn("Failed to close workbook " + fileName, e);
                }
            }
        }
        return resultList.toArray(new String[0]);
    }

    /**
     * Joins the boolean, numeric and string cell values of {@code row} with
     * commas, skipping every other cell type.
     */
    private static String joinCells(Row row) {
        StringBuilder rowString = new StringBuilder();
        boolean first = true;
        for (Cell cell : row) {
            String value;
            switch (cell.getCellType()) {
                case Cell.CELL_TYPE_BOOLEAN:
                    value = String.valueOf(cell.getBooleanCellValue());
                    break;
                case Cell.CELL_TYPE_NUMERIC:
                    value = String.valueOf(cell.getNumericCellValue());
                    break;
                case Cell.CELL_TYPE_STRING:
                    value = cell.getStringCellValue();
                    break;
                default:
                    // blank, formula and error cells are not exported
                    continue;
            }
            // Put the separator in front of every value but the first, so no
            // trailing comma has to be stripped (which failed on empty rows).
            if (!first) {
                rowString.append(',');
            }
            rowString.append(value);
            first = false;
        }
        return rowString.toString();
    }
}
标签:script not found out org hdf final array ons jar
原文地址:http://www.cnblogs.com/xiaoma0529/p/7098572.html