标签:Lucene io os 使用 java ar for 文件 数据
近期用Lucene做了个比较简单的站内检索,在这里和大家做个交流。全文检索的实现,从检索的数据源来分有两种:一种是数据库,另一种是已生成的文件(doc,html,txt......)。
无论哪一种方式,实现原理都是一样的。主要分为两大步:
一、为数据源建立Lucene索引,并将索引文件保存到设定目录下
private static String filePath = "D:\\rookie\\date\\";// directory holding the source files to be indexed
	private static  String indexPath = "D:\\rookie\\source";// directory where the index is stored
	
	/**
	 * Builds a fresh index under {@code indexPath} from every {@code .txt}
	 * file found directly inside {@code filePath}, then prints how long the
	 * indexing took.
	 *
	 * @param args unused
	 * @throws Exception if the source directory cannot be listed or if
	 *                   reading a file / writing the index fails
	 */
	public static void main(String[] args) throws Exception {
		/* Directory containing the files to index. */
		File fileDir = new File(filePath);
		/* Directory where the index files are written. */
		File indexDir = new File(indexPath);

		// listFiles() returns null when the path does not exist or is not a
		// directory; fail before the existing index is recreated (and emptied).
		File[] textFiles = fileDir.listFiles();
		if (textFiles == null) {
			throw new java.io.FileNotFoundException(
					"Source directory cannot be listed: " + fileDir.getPath());
		}

		Analyzer luceneAnalyzer = new StandardAnalyzer();
		// Last argument true = recreate the index from scratch.
		// Pass false to append to / update an existing index instead.
		IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
		long startTime = System.currentTimeMillis();
		try {
			// Add one document per text file.
			for (int i = 0; i < textFiles.length; i++) {
				File textFile = textFiles[i];
				// Only plain .txt files are indexed.
				if (!textFile.isFile() || !textFile.getName().endsWith(".txt")) {
					continue;
				}
				String temp = FileReaderAll(textFile.getCanonicalPath(), "GBK");
				Document document = new Document();
				// Storing an id with each document is strongly recommended.
				// NOTE(review): every document gets the same placeholder id here;
				// a real indexer should store a unique id per document so that an
				// update/delete by id touches only that document.
				Field FieldId = new Field("id", "12345", Field.Store.YES, Field.Index.UN_TOKENIZED);
				Field FieldPath = new Field("path", textFile.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED);
				Field FieldBody = new Field("contents", temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
				document.add(FieldId);
				document.add(FieldPath);
				document.add(FieldBody);
				indexWriter.addDocument(document);
			}
			// optimize() optimizes the index.
			indexWriter.optimize();
		} finally {
			// Always close the writer, even when indexing fails part-way.
			indexWriter.close();
		}
		// Report the elapsed indexing time and where the index was written.
		long endTime = System.currentTimeMillis();
		System.out.println("索引已经添加到文档中,共花费了" + (endTime - startTime) + " 毫秒! 索引路径是:" + indexDir.getPath());
	}
/**
	 * Reads a whole text file (html, txt, ...) into a single string.
	 * <p>
	 * Lines are joined with a single {@code '\n'} so that the last word of one
	 * line and the first word of the next remain separate tokens; no trailing
	 * line break is added. An empty file yields an empty string.
	 *
	 * @param FileName path of the file to read
	 * @param charset  name of the character set used to decode the file
	 * @return the decoded file content
	 * @throws IOException if the file cannot be opened or read, or if
	 *                     {@code charset} is not supported
	 * @author rookie_d
	 */
	public static String FileReaderAll(String FileName, String charset)
			throws IOException {
		FileInputStream in = new FileInputStream(FileName);
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
			// StringBuilder avoids the quadratic cost of repeated String concatenation.
			StringBuilder content = new StringBuilder();
			String line = reader.readLine();
			while (line != null) {
				content.append(line);
				line = reader.readLine();
				if (line != null) {
					content.append('\n');
				}
			}
			return content.toString();
		} finally {
			// Closing the underlying stream releases the file handle even when
			// decoding or reading fails.
			in.close();
		}
	}
二、从Lucene索引文件中进行检索
/**
  	 * Searches the "contents" field of the index for a fixed query string and
  	 * returns the stored "path" of every matching document.
  	 * <p>
  	 * Parse and I/O failures are printed and result in the (possibly empty)
  	 * list collected so far being returned; the method never returns
  	 * {@code null}.
  	 *
  	 * @return list of file path strings of the matching documents
  	 * @author rookie_d
  	 */
	public static List luceneSearcher() {
		String queryString = "好";// the text to search for
		String indexPath = "D:\\rookie\\source";// location of the index
		List list = new ArrayList();
		IndexSearcher searcher = null;
		try {
			searcher = new IndexSearcher(indexPath);
			Analyzer analyzer = new StandardAnalyzer();
			QueryParser qp = new QueryParser("contents", analyzer);
			System.out.println(qp.getField());

			Query query;
			try {
				query = qp.parse(queryString);
			} catch (org.apache.lucene.queryParser.ParseException e) {
				// Without a parsed query there is nothing to search for.
				e.printStackTrace();
				return list;
			}
			System.out.println(query);

			Hits hits = searcher.search(query);
			System.out.println(hits.length());
			if (hits.length() > 0) {
				System.out.println("共找到:" + hits.length() + "个结果!");
				for (int i = 0; i < hits.length(); i++) {
					Document document = hits.doc(i);
					String path = document.get("path");
					File file = new File(path);
					list.add(file.getPath());
				}
			} else {
				System.out.println("*****no result find*****");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Release the index files; the hits have been fully consumed above.
			if (searcher != null) {
				try {
					searcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return list;
	}
在开发过程中遇到了更新索引的小难题,下面也给段转来的代码,作为菜鸟认为这段代码还是比较有用的
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
public class UpdateDocument { 
private static String path = "d:/index"; 
public static void main(String[] args){ 
// addIndex(); 
updateIndex(); 
search("李四"); 
search("王五"); 
} 
public static void addIndex(){ 
try { 
IndexWriter write = new IndexWriter(path,new StandardAnalyzer(),true); 
Document doc = new Document(); 
doc.add(new Field("id","123456",Field.Store.YES,Field.Index.UN_TOKENIZED)); 
doc.add(new Field("userName","张三",Field.Store.YES,Field.Index.TOKENIZED)); 
doc.add(new Field("comefrom","北京",Field.Store.YES,Field.Index.TOKENIZED)); 
write.addDocument(doc); 
write.close(); 
} catch (IOException e) { 
e.printStackTrace(); 
} 
} 
public static void updateIndex(){ 
try { 
IndexWriter write = new IndexWriter(path,new StandardAnalyzer(),false); 
Document docNew = new Document(); 
docNew.add(new Field("id","123456",Field.Store.YES,Field.Index.UN_TOKENIZED)); 
docNew.add(new Field("userName","王五",Field.Store.YES,Field.Index.TOKENIZED)); 
Term term = new Term("id","123456"); 
/** 
调用updateDocument的方法,传给它一个新的doc来更新数据, 
Term term = new Term("id","1234567"); 
先去索引文件里查找id为1234567的Doc,如果有就更新它(如果有多条,最后更新后只有一条)。如果没有就新增. 
数据库更新的时候,我们可以只针对某个列来更新,而lucene只能针对一行数据更新。 
*/ 
write.updateDocument(term, docNew); 
write.close(); //注意在这里一定要关闭write 
} catch (IOException e) { 
e.printStackTrace(); 
} 
} 
public static Query queryParser(String str){ 
QueryParser queryParser = new QueryParser("userName", new StandardAnalyzer()); 
try { 
Query query = queryParser.parse(str); 
return query; 
} catch (Exception e) { 
e.printStackTrace(); 
} 
return null; 
} 
public static void search(String str){ 
try { 
IndexSearcher search = new IndexSearcher(path); 
Query query = queryParser(str); 
Hits hits = search.search(query); 
if(hits==null){ 
return; 
} 
if(hits.length() == 0){ 
System.out.println(" 没有搜索到‘" + str+"‘"); 
return; 
} 
for (int i = 0; i < hits.length(); i++) { 
Document doc = hits.doc(i); 
System.out.println("id = "+hits.id(i)); 
System.out.println("own id = " + doc.get("id")); 
System.out.println("userName = "+doc.get("userName")); 
System.out.println("come from = "+doc.get("comefrom")); 
System.out.println(""); 
} 
} catch (Exception e) { 
e.printStackTrace(); 
} 
} 
} 
最后再给一段删除索引的代码:
// Delete the matching document from the Lucene index.
		File indexDir = new File(indexPath);/* location of the index files */
		File[] textFiles = indexDir.listFiles();
		Analyzer luceneAnalyzer = new StandardAnalyzer();
		// Only create a new index when the directory is missing or empty;
		// otherwise open the existing one so its contents are kept.
		boolean create = false;
		if (textFiles == null || textFiles.length <= 0) {
			create = true;
		}
		IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, create);
		try {
			Term term = new Term("id", news.getId());
			indexWriter.deleteDocuments(term);
			indexWriter.optimize();// optimize() optimizes the index
		} finally {
			// Always close the writer, even when the delete fails.
			indexWriter.close();
		}
在删除和更新索引时要注意:new IndexWriter(indexDir, luceneAnalyzer, false) 的最后一个参数必须为false;如果为true,会重新创建索引,原有的索引数据将被清空。
关于全文检索的内容还有许多需要学习,写这篇文章来帮助新手和自己来熟悉Lucene,希望对你有一点帮助!
标签:Lucene io os 使用 java ar for 文件 数据
原文地址:http://www.cnblogs.com/huideng/p/3979890.html