Questions: How does Lucene build an index library? Which jars does Lucene need? How do you search the index library? What are Lucene's core principles?
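The jar question can be answered straight from the imports used in this post (Lucene 3.0 era); this is my reading of the code rather than an official list:

1) lucene-core-3.0.x.jar — IndexWriter, IndexSearcher, StandardAnalyzer, QueryParser, FSDirectory/RAMDirectory
2) lucene-analyzers-3.0.x.jar — CJKAnalyzer (contrib analyzers)
3) lucene-highlighter-3.0.x.jar — Highlighter, QueryScorer, SimpleHTMLFormatter (the highlighter also pulls in lucene-memory-3.0.x.jar)
4) IKAnalyzer3.2.0Stable.jar — the IK Chinese analyzer (org.wltea.analyzer)
5) junit.jar — only needed to run the @Test methods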
package com.itcast.ldp.domain;

import java.io.Serializable;

public class Article implements Serializable {

    private Long aid;
    private String title;
    private String content;

    public Long getAid() {
        return aid;
    }
    public void setAid(Long aid) {
        this.aid = aid;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }

    @Override
    public String toString() {
        return "Article [aid=" + aid + ", title=" + title + ", content=" + content + "]";
    }
}
package com.itcast.ldp.lucene;

import java.io.File;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.itcast.ldp.domain.Article;

/**
 * Put an Article object into the index library,
 * then take the Article object back out of the index library.
 */
public class CreateLucene1 {

    /**
     * Create the index.
     */
    @Test
    public void createIndex() throws Exception {
        // Create the Article object
        Article article = new Article();
        article.setAid(1L);
        article.setTitle("lucene是一个全文检索引擎");
        article.setContent("taobao");

        // Create an IndexWriter. Arguments: 1. index library location, 2. analyzer,
        // 3. maximum field length
        // 1. Index library location
        Directory directory = FSDirectory.open(new File("./DirIndex"));
        // 2. Analyzer: splits a piece of text into keywords
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        // 3. Maximum field length. MaxFieldLength.LIMITED caps the size of a field in the
        //    index library; it must be limited — the source caps it at 10,000 terms.
        IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);

        /**
         * Convert the Article object into a Document object.
         * The Field.Index.* options in detail:
         * Index.ANALYZED: uses the analyzer to break the field value into a stream of
         *   tokens, each of which can be searched. Suitable for ordinary text fields
         *   (body, title, abstract, etc.).
         * Index.NOT_ANALYZED: indexes the field without analyzing the String value; the
         *   whole value becomes a single token. Suitable for values that must not be split
         *   (URLs, file paths, dates, names, social security numbers, phone numbers) and
         *   especially for "exact match" searches.
         * Index.ANALYZED_NO_NORMS: a variant of Index.ANALYZED that does not store norms
         *   in the index. Norms record index-time boost information but can cost memory
         *   at search time.
         * Index.NOT_ANALYZED_NO_NORMS: like Index.NOT_ANALYZED but without norms. Saves
         *   index space and memory during search, since single-token fields do not need
         *   norms unless they are boosted.
         * Index.NO: the field value is not searchable at all.
         */
        // Create the document
        Document document = new Document();
        // arg 1: the field name in the index library; arg 2: the value stored there
        Field idField = new Field("aid", article.getAid().toString(), Store.YES, Index.NOT_ANALYZED);
        Field titleField = new Field("title", article.getTitle(), Store.YES, Index.ANALYZED);
        Field contentField = new Field("content", article.getContent(), Store.YES, Index.ANALYZED);
        // Put the fields into the document
        document.add(idField);
        document.add(titleField);
        document.add(contentField);

        // Put the document into the index library
        indexWriter.addDocument(document);

        // Close resources (the index lives on disk, so there are IO streams to close).
        // indexWriter.optimize(); merges the .cfs files produced by multiple IndexWriter
        // operations; optional, since Lucene merges automatically once enough accumulate.
        indexWriter.close();
    }
}
package com.itcast.ldp.lucene;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.itcast.ldp.domain.Article;

/**
 * The search half of CreateLucene1: take Article objects back out of the index library.
 */
public class CreateLucene1 {

    /**
     * Retrieve documents from the index library by keyword.
     */
    @Test
    public void findIndex() throws Exception {
        // 1. Create the IndexSearcher
        Directory directory = FSDirectory.open(new File("./DirIndex"));
        IndexSearcher indexSearcher = new IndexSearcher(directory);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

        // Choose the field(s) to search. Arguments: 1. version, 2. field(s), 3. analyzer
        // QueryParser queryParser = new QueryParser(Version.LUCENE_30, "title", analyzer);
        QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_30,
                new String[]{"title", "content"}, analyzer);
        // Specify the keyword
        Query query = queryParser.parse("lucene");
        // Second argument: how many of the top matches to return
        TopDocs topDocs = indexSearcher.search(query, 20);
        int count = topDocs.totalHits; // total number of entries matching the keyword
        System.out.println("total entries found: " + count);

        // Each ScoreDoc marks one matching row; scoreDocs is the array of document numbers
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<Article> articles = new ArrayList<Article>();
        for (ScoreDoc scoreDoc : scoreDocs) {
            // The document number of the match
            int doc = scoreDoc.doc;
            // Fetch the document by number, like turning to a page of a book
            Document document = indexSearcher.doc(doc);
            Article article = new Article();
            article.setAid(Long.parseLong(document.get("aid")));
            article.setTitle(document.get("title"));
            article.setContent(document.get("content"));
            articles.add(article);
        }
        for (Article article : articles) {
            System.out.println("retrieved from index library: " + article.toString());
        }
    }
}
1) Create the IndexSearcher
2) Create the Query object
3) Run the search
4) Get the total hit count and the document IDs of the top N records (see the note below)
5) Convert each Document in the ID list into a JavaBean and add it to a collection
6) Loop over the collection and print the retrieved content
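One detail in step 4) that is easy to miss: topDocs.totalHits is the number of matches in the whole index library, while topDocs.scoreDocs holds at most the N entries asked for in search(query, N). A minimal sketch, reusing the indexSearcher and query from the test above:

        TopDocs topDocs = indexSearcher.search(query, 20);
        System.out.println("matches in the whole index: " + topDocs.totalHits);
        System.out.println("rows actually returned: " + topDocs.scoreDocs.length); // at most 20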
package com.itcast.ldp.util;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;

import com.itcast.ldp.domain.Article;

public class DocumentArticleUtil {

    public Document ArticleToDocument(Article article) {
        // Create the document
        Document document = new Document();
        // arg 1: the field name in the index library; arg 2: the value stored there
        // NumericUtils.longToPrefixCoded(article.getAid()); // the proper tool for turning
        // a Long into the String form that goes into the document
        Field idField = new Field("aid", article.getAid().toString(), Store.YES, Index.NOT_ANALYZED);
        Field titleField = new Field("title", article.getTitle(), Store.YES, Index.ANALYZED);
        Field contentField = new Field("content", article.getContent(), Store.YES, Index.ANALYZED);
        // Put the fields into the document
        document.add(idField);
        document.add(titleField);
        document.add(contentField);
        return document;
    }

    public Article DocumentToArticle(Document document) {
        Article article = new Article();
        // NumericUtils.prefixCodedToLong(document.get("aid")); // String back to long
        article.setAid(Long.parseLong(document.get("aid")));
        article.setTitle(document.get("title"));
        article.setContent(document.get("content"));
        return article;
    }
}
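The commented-out NumericUtils lines above hint at the more robust way to store a numeric id in Lucene 3.0: prefix-coded strings sort correctly and work with range queries. A minimal sketch of that variant — the class name NumericDocumentArticleUtil is hypothetical, but the NumericUtils calls are the real 3.0 API:

package com.itcast.ldp.util;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.util.NumericUtils;

import com.itcast.ldp.domain.Article;

/** Hypothetical variant of DocumentArticleUtil that prefix-codes the numeric id. */
public class NumericDocumentArticleUtil {

    public Field idFieldFor(Article article) {
        // Encode the Long into Lucene's prefix-coded String form before indexing
        String encoded = NumericUtils.longToPrefixCoded(article.getAid());
        return new Field("aid", encoded, Store.YES, Index.NOT_ANALYZED);
    }

    public Long aidFrom(Document document) {
        // Decode the prefix-coded String back into a long when reading
        return NumericUtils.prefixCodedToLong(document.get("aid"));
    }
}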
package com.itcast.ldp.util;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Shares a single Directory and Analyzer across all the tests.
 */
public class DirectorAndAnalyzerUtile {

    public static Directory directory;
    public static Analyzer analyzer;

    static {
        try {
            directory = FSDirectory.open(new File("./DirIndex"));
            analyzer = new StandardAnalyzer(Version.LUCENE_30);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package com.itcast.ldp.lucene;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.itcast.ldp.domain.Article;
import com.itcast.ldp.util.DirectorAndAnalyzerUtile;
import com.itcast.ldp.util.DocumentArticleUtil;

public class CrudLucene {

    /**
     * Create the index.
     */
    @Test
    public void createIndex() throws Exception {
        Article article = new Article();
        article.setAid(1L);
        article.setTitle("lucene是一个全文检索引擎");
        article.setContent("Oracle,google,baidu,taobao");

        IndexWriter indexWriter = new IndexWriter(DirectorAndAnalyzerUtile.directory,
                DirectorAndAnalyzerUtile.analyzer, MaxFieldLength.LIMITED);
        DocumentArticleUtil util = new DocumentArticleUtil();
        Document document = util.ArticleToDocument(article);
        indexWriter.addDocument(document);
        // Optimize: merges the .cfs files produced by multiple IndexWriter operations.
        // Optional; Lucene merges automatically once enough segments accumulate.
        indexWriter.optimize();
        indexWriter.close();
    }

    /**
     * Retrieve from the index library.
     */
    @Test
    public void findIndex() throws Exception {
        IndexSearcher indexSearcher = new IndexSearcher(DirectorAndAnalyzerUtile.directory);
        QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_30,
                new String[]{"title", "content"}, DirectorAndAnalyzerUtile.analyzer);
        // Specify the keyword
        Query query = queryParser.parse("taobao11");
        TopDocs topDocs = indexSearcher.search(query, 1);
        int count = topDocs.totalHits; // number of index entries containing the keyword
        System.out.println("total entries: " + count);

        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        DocumentArticleUtil documentArticleUtil = new DocumentArticleUtil();
        List<Article> articles = new ArrayList<Article>();
        for (ScoreDoc scoreDoc : scoreDocs) {
            int doc = scoreDoc.doc;
            Document document = indexSearcher.doc(doc);
            Article article = documentArticleUtil.DocumentToArticle(document);
            articles.add(article);
        }
        for (Article article : articles) {
            System.out.println("retrieved from index library: " + article.toString());
        }
    }

    /**
     * Delete by keyword.
     */
    @Test
    public void deleteIndex() throws Exception {
        // Create a term object: field name "title", keyword "lucene"
        Term term = new Term("title", "lucene");
        IndexWriter indexWriter = new IndexWriter(DirectorAndAnalyzerUtile.directory,
                DirectorAndAnalyzerUtile.analyzer, MaxFieldLength.LIMITED);
        // The term selects which documents to delete
        indexWriter.deleteDocuments(term);
        indexWriter.close();
    }

    /**
     * Update a keyword: a delete followed by an add.
     */
    @Test
    public void updateIndex() throws Exception {
        // Create a term object: field name "title", keyword "lucene"
        Term term = new Term("title", "lucene");
        IndexWriter indexWriter = new IndexWriter(DirectorAndAnalyzerUtile.directory,
                DirectorAndAnalyzerUtile.analyzer, MaxFieldLength.LIMITED);
        Article article = new Article();
        article.setAid(1L);
        article.setTitle("lucene是一个全文检索引擎1");
        article.setContent("Oracle,google,baidu,taobao11");
        DocumentArticleUtil util = new DocumentArticleUtil();
        Document doc = util.ArticleToDocument(article);
        // The term selects the documents to delete; doc is the replacement to add
        indexWriter.updateDocument(term, doc);
        indexWriter.close();
    }
}
As the figure shows, when Lucene executes a delete it first writes the deleted entries into a .del file, and only later merges that with the .cfs file to produce the final result.
If adds and deletes are repeated many times, the number of files grows quickly and retrieval slows down, so it is worth optimizing the index structure. Lucene has an automatic optimization feature: once the file count reaches a threshold, it merges the .cfs and .del files on its own. But we can also merge them by hand, and it takes just one statement: indexWriter.optimize();

The other performance tool is an in-memory index library, shown in the following class:

package com.itcast.ldp.memory;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.itcast.ldp.domain.Article;
import com.itcast.ldp.util.DirectorAndAnalyzerUtile;
import com.itcast.ldp.util.DocumentArticleUtil;

public class DirectoryMemoryTest {

    /**
     * Build an in-memory index library with RAMDirectory.
     */
    @Test
    public void testMemory() throws Exception {
        // Directory directory = FSDirectory.open(new File("./MemoryTest")); // file (disk) index library
        Directory directory = new RAMDirectory(); // in-memory index library
        IndexWriter indexWriter = new IndexWriter(directory,
                DirectorAndAnalyzerUtile.analyzer, MaxFieldLength.LIMITED);
        Article article = new Article();
        article.setAid(1L);
        article.setTitle("lucene是一个全文检索引擎");
        article.setContent("Oracle,google,baidu,taobao");
        DocumentArticleUtil util = new DocumentArticleUtil();
        Document document = util.ArticleToDocument(article);
        indexWriter.addDocument(document);
        indexWriter.close();
        this.inserchDirectory(directory);
    }

    public void inserchDirectory(Directory directory) throws Exception {
        IndexSearcher indexSearcher = new IndexSearcher(directory);
        QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_30,
                new String[]{"title", "content"}, DirectorAndAnalyzerUtile.analyzer);
        Query query = queryParser.parse("lucene");
        TopDocs docs = indexSearcher.search(query, 1);
        ScoreDoc[] scoreDocs = docs.scoreDocs;
        List<Article> articles = new ArrayList<Article>();
        DocumentArticleUtil articleUtil = new DocumentArticleUtil();
        for (ScoreDoc scoreDoc : scoreDocs) {
            int doc = scoreDoc.doc;
            Document document = indexSearcher.doc(doc);
            Article article = articleUtil.DocumentToArticle(document);
            articles.add(article);
        }
        for (Article article : articles) {
            System.out.println(article.toString());
        }
    }

    /**
     * Combine an in-memory index library with a file index library:
     * the file library provides persistence, the memory library provides speed.
     */
    @Test
    public void testMemoryDirectoryAndFileDirectory() throws Exception {
        // 1. Set up the two index libraries
        // 2. Load the file index library into the in-memory index library
        Directory fileDirectory = FSDirectory.open(new File("./DirCrud"));
        Directory memoryDirectory = new RAMDirectory(fileDirectory);
        // 3. Create the two IndexWriters. Here `true` means the memory library's contents
        //    overwrite the file library; the default, false, means append.
        IndexWriter fileWriter = new IndexWriter(fileDirectory,
                DirectorAndAnalyzerUtile.analyzer, true, MaxFieldLength.LIMITED);
        IndexWriter memoryWriter = new IndexWriter(memoryDirectory,
                DirectorAndAnalyzerUtile.analyzer, MaxFieldLength.LIMITED);
        // 4. The client talks to the in-memory index library
        Article article = new Article();
        article.setAid(1L);
        article.setTitle("lucene是一个全文检索引擎");
        article.setContent("Oracle,google,baidu,taobao");
        DocumentArticleUtil util = new DocumentArticleUtil();
        Document document = util.ArticleToDocument(article);
        memoryWriter.addDocument(document);
        // 5. Flush the in-memory index library into the file index library.
        //    Close the memory writer first so its documents are committed and visible.
        memoryWriter.close();
        fileWriter.addIndexesNoOptimize(memoryDirectory);
        fileWriter.close();
        this.inserchDirectory(fileDirectory);
    }
}
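For reference, a minimal sketch of the manual merge described above, assuming the shared utility class from earlier (the delete is only there to produce a .del file worth merging away):

        IndexWriter writer = new IndexWriter(DirectorAndAnalyzerUtile.directory,
                DirectorAndAnalyzerUtile.analyzer, MaxFieldLength.LIMITED);
        writer.deleteDocuments(new Term("title", "lucene"));
        writer.optimize(); // the one statement: merges the .cfs segments and the .del file
        writer.close();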
The IK Analyzer extension configuration (conventionally IKAnalyzer.cfg.xml at the root of the classpath):

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- Configure your own extension dictionary here -->
    <entry key="ext_dict">/mydict.dic;</entry>
    <!--
    <entry key="ext_dict">/mydict.dic; /mypack/mydict2.dic ; /com/mycompany/dic/mydict3.dic ;</entry>
    -->
    <!-- Configure your own stopword dictionary here, so unwanted words get filtered out
    <entry key="ext_stopwords">/ext_stopword.dic</entry>
    -->
</properties>
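The extension dictionary it points to is just a plain text file with one entry per line, saved in the encoding IK expects (matching encodings is the caveat noted in the test class below). A hypothetical /mydict.dic that would keep the name used in the test from being split apart:

赵东
全文检索引擎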
package com.itcast.ldp.Analyzed;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class AnalyzedTest {

    /**
     * An English analyzer works in three steps:
     * 1. split the text into tokens
     * 2. remove stopwords
     * 3. convert uppercase to lowercase
     */
    @Test
    public void englishDirectoryTest() throws Exception {
        String text = "I'm a the customer among all customers!";
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        this.coverContent(analyzer, text);
    }

    // Print every token the analyzer produces for the given text
    private void coverContent(Analyzer analyzer, String text) throws Exception {
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        tokenStream.addAttribute(TermAttribute.class);
        while (tokenStream.incrementToken()) {
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            System.out.println(termAttribute.term());
        }
    }

    // Chinese analysis is the hard case; the analyzers built for western languages don't fit
    @Test
    public void ChineseDirectoryTest_1() throws Exception {
        String text = "我是一名中国人!";
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        this.coverContent(analyzer, text);
    }

    // Bigram (two-character) analyzer
    @Test
    public void ChineseDirectoryTest_2() throws Exception {
        String text = "我是一名中国人!";
        Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
        this.coverContent(analyzer, text);
    }

    // An analyzer developed in China: IKAnalyzer3.2.0Stable.jar. It supports not only
    // Chinese segmentation but English as well. When using an extension dictionary,
    // the file encodings must match.
    @Test
    public void ChineseDirectoryTest_3() throws Exception {
        String text = "am the english 赵东 我是一名中国人!";
        Analyzer analyzer = new IKAnalyzer();
        this.coverContent(analyzer, text);
    }
}
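Roughly what the three Chinese tests should print for 我是一名中国人 — the exact tokens depend on the analyzer version and dictionary, so treat this as an expectation rather than a transcript:

StandardAnalyzer: 我 / 是 / 一 / 名 / 中 / 国 / 人   (one token per character)
CJKAnalyzer:      我是 / 是一 / 一名 / 名中 / 中国 / 国人   (overlapping bigrams)
IKAnalyzer:       dictionary words where available, e.g. 中国人, falling back to shorter pieces elsewhere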
package com.itcast.ldp.highlight;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.itcast.ldp.domain.Article;
import com.itcast.ldp.util.DirectorAndAnalyzerUtile;
import com.itcast.ldp.util.DocumentArticleUtil;

public class HighLightTest {

    public void inserchTest(int min, int max) throws Exception {
        IndexSearcher indexSearcher = new IndexSearcher(DirectorAndAnalyzerUtile.directory);
        QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_30,
                new String[]{"title", "content"}, DirectorAndAnalyzerUtile.analyzer);
        Query query = queryParser.parse("lucene");
        TopDocs topDocs = indexSearcher.search(query, 30);

        // Create and configure the highlighter
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Scorer scorer = new QueryScorer(query); // finds the keywords to highlight
        Highlighter highlighter = new Highlighter(formatter, scorer); // wraps them with the prefix and suffix

        // Create the fragmenter ("summarizer"): the fragment size for the highlighted
        // field; the no-arg constructor defaults to 100
        Fragmenter fragmenter = new SimpleFragmenter(10);
        highlighter.setTextFragmenter(fragmenter); // install the fragmenter

        int count = topDocs.totalHits;
        System.out.println("total records found: " + count);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        // min is the first row of the page, max the page size; don't run past the results
        int pageSize = Math.min(count, min + max);
        List<Article> articles = new ArrayList<Article>();
        DocumentArticleUtil util = new DocumentArticleUtil();
        for (int i = min; i < pageSize; i++) {
            System.out.println("relevance score: " + scoreDocs[i].score);
            int index = scoreDocs[i].doc;
            Document document = indexSearcher.doc(index);
            Article article = util.DocumentToArticle(document);
            /**
             * Use the highlighter:
             * 1. "lucene" in the stored text becomes <font color='red'>lucene</font>
             * 2. the analyzer's job here is to locate the keywords the highlighter should mark
             */
            String text = highlighter.getBestFragment(DirectorAndAnalyzerUtile.analyzer,
                    "title", document.get("title"));
            article.setTitle(text);
            articles.add(article);
        }
        indexSearcher.close();
        for (Article article : articles) {
            System.out.println(article.toString());
        }
    }

    @Test
    public void highLightTest() throws Exception {
        inserchTest(0, 40);
    }
}
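One caveat worth guarding against in the loop above: getBestFragment returns null when the field contains no match for the query (here, a title without the word lucene), which would silently blank out the article's title. A defensive version of that step:

        String text = highlighter.getBestFragment(DirectorAndAnalyzerUtile.analyzer,
                "title", document.get("title"));
        // null means no hit in "title"; keep the original stored value in that case
        article.setTitle(text != null ? text : document.get("title"));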
Original post: http://blog.csdn.net/mr_li13/article/details/51727526