码迷,mamicode.com
首页 > Web开发 > 详细

探索Lucene.Net全文检索

时间:2014-12-22 18:09:24      阅读:227      评论:0      收藏:0      [点我收藏+]

标签:lucene.net   全文检索   

在CSDN、博客园找了一番Lucene.Net相关资料后,发现还是没有自己想要的,毕竟Lucene.Net的版本一直在变。这里我用的是Lucene.Net 3.0版本,demo是在 http://www.dotlucene.net/ 网站上找到的,方法很完善,API也比较全面,详见 http://www.dotlucene.net/30648/lucene-net-api-search-demo 。

不过demo里面讲解的是检索文件内容的服务,而我想要的是检索数据内容的服务,于是我就在它的基础上改了一番。

过程:

1.建立索引,更新索引,删除索引

/// <summary>
/// Scratch reference used by the index button handlers while they build sample
/// records; each handler sets it back to null after adding a record to its list.
/// </summary>
protected luseneTxt m=null;
        /// <summary>
        /// Page load handler. Currently does nothing.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Click handler that generates 99999 identical sample records, hands them
        /// to <c>LuceneSearch.CreatIndexByData</c>, and writes the elapsed time
        /// (in seconds) to <c>Label1</c>.
        /// </summary>
        protected void creatIndexBt_Click(object sender, EventArgs e)
        {
            const int sampleCount = 99999;

            DateTime startedAt = DateTime.Now;
            List<luseneTxt> samples = new List<luseneTxt>();
            LuceneSearch indexService = new LuceneSearch();

            for (int n = 0; n < sampleCount; n++)
            {
                m = new luseneTxt
                {
                    text = "test",
                    path = "http://www.baidu.com/?i=",
                    title = "mofijeck ",
                    des = "12",
                    keyword = "34"
                };
                samples.Add(m);
                // The shared field is only a scratch slot; release it after each record.
                m = null;
            }

            indexService.CreatIndexByData(samples);

            double elapsedSeconds = (DateTime.Now - startedAt).TotalSeconds;
            Label1.Text = "建立索引耗时" + elapsedSeconds + "秒";
        }

        /// <summary>
        /// Click handler that calls <c>LuceneSearch.DeleteIndex</c> and writes the
        /// elapsed time (in seconds) to <c>Label1</c>.
        /// </summary>
        protected void deleteIndexBt_Click(object sender, EventArgs e)
        {
            DateTime startedAt = DateTime.Now;

            LuceneSearch indexService = new LuceneSearch();
            indexService.DeleteIndex();

            double elapsedSeconds = (DateTime.Now - startedAt).TotalSeconds;
            Label1.Text = "删除索引耗时" + elapsedSeconds + "秒";
        }

        /// <summary>
        /// Click handler that generates 999999 identical sample records, appends
        /// them to the existing index through <c>LuceneSearch.UpdateIndexByData</c>,
        /// and writes the elapsed time (in seconds) to <c>Label1</c>.
        /// </summary>
        protected void updateIndexBt_Click(object sender, EventArgs e)
        {
            const int sampleCount = 999999;

            DateTime dt = DateTime.Now;
            List<luseneTxt> l = new List<luseneTxt>();
            LuceneSearch ls = new LuceneSearch();

            for (int i = 0; i < sampleCount; i++)
            {
                m = new luseneTxt();
                m.text = "test";
                // NOTE(review): the path ends in "?i=" but the counter is never appended,
                // so every record gets the same path - confirm whether that is intended.
                m.path = "http://www.baidu.com/?i=";
                m.title = "mofijeck ";
                m.des = "12";
                m.keyword = "34";
                l.Add(m);
                // The shared field is only a scratch slot; release it after each record.
                m = null;
            }

            ls.UpdateIndexByData(l);

            TimeSpan ts = DateTime.Now - dt;
            // Fixed copy-paste error: this handler updates the index, so the label must
            // say "update" rather than repeat the "create" text of creatIndexBt_Click.
            Label1.Text = "更新索引耗时" + ts.TotalSeconds + "秒";
        }

2.搜索

其实这里说白了就是增删改查,没有比较特别的东西,但是关于全文检索原理上的东西我也不是特别懂,网上很多大牛有相关博客解释的。

/// <summary>
/// Runs the query, binds the requested page of results to <c>Repeater1</c>,
/// and fills <c>dateTimeMsg</c> (elapsed time) and <c>pagerStr</c> (pager HTML).
/// </summary>
/// <param name="q">Query text entered by the user.</param>
protected void search(string q) {
            // Reject null and whitespace-only input as well as the empty string,
            // so the query parser is never handed a blank query.
            if (q == null || q.Trim().Length == 0) {
                Label1.Text = "不能为空";
                return;
            }
            DateTime dt = DateTime.Now;
            LuceneSearch ls = new LuceneSearch();

            // The pager emits 1-based "Page" values. A missing, malformed or
            // out-of-range value falls back to the first page instead of throwing.
            int pageNumber;
            if (!int.TryParse(Request["Page"], out pageNumber) || pageNumber < 1) {
                pageNumber = 1;
            }
            int pageSize = 10;

            string colName = tbcol.Text.Trim() == "" ? "text" : tbcol.Text.Trim();
            tbcol.Text = colName;

            // LuceneSearch.Search is called with 0 for the first page, so convert
            // the 1-based page number into a 0-based index here.
            Repeater1.DataSource = ls.Search(q, colName, pageSize, pageNumber - 1);
            Repeater1.DataBind();

            TimeSpan duration = DateTime.Now - dt;
            dateTimeMsg = "耗时" + duration.TotalSeconds + "秒";

            // SqlPager embeds the parameter string verbatim into href attributes and
            // inline JavaScript, so the user-supplied query must be URL-encoded first.
            // This keeps the links valid (spaces, '&', non-ASCII text) and prevents
            // markup or script injection through the query text.
            string pagerParameters = "q=" + Server.UrlEncode(q);
            SqlPager sqlpager = new SqlPager("", pageSize, ls.getSearchCount(q, colName), "Search.aspx", pagerParameters, pageNumber);
            pagerStr = sqlpager.GetShowPageStr("2", 1, 10);
        }
前台显示:

 <div>
                <asp:repeater id="Repeater1" runat="server" >
					<ItemTemplate>
						<p><a href='<%# Eval("url")%>' class="link"><%# DataBinder.Eval(Container.DataItem, "title")  %></a><br/>
							<span class="sample">
								<%# DataBinder.Eval(Container.DataItem, "lightText")%>
							</span>
                            <br>
                            <span class="des">
								<%# DataBinder.Eval(Container.DataItem, "des")%>
							</span>
                            <br>
                            <span class="keyword">
								<%# DataBinder.Eval(Container.DataItem, "keyword")%>
							</span>
							<br>
							<span class="path">
								<%# DataBinder.Eval(Container.DataItem, "url")  %>
							</span>
						</p>
					</ItemTemplate>
				</asp:repeater>
             </div>
             <div class="paging">
               <%=this.pagerStr%>
            </div>


源码如下:

数据索引类(DataIndexer):

/// <summary>
/// Writes <c>luseneTxt</c> records into a Lucene index stored in a file-system directory.
/// Call <see cref="Close"/> when finished so the writer is released.
/// </summary>
public class DataIndexer
    {
        private IndexWriter writer;

        /// <summary>
        /// Opens a writer on <paramref name="directory"/> with the create flag set to true.
        /// </summary>
        /// <param name="directory">Path of the index directory.</param>
        public DataIndexer(string directory) : this(directory, true)
        {
        }

        /// <summary>
        /// Opens a writer on <paramref name="directory"/>.
        /// </summary>
        /// <param name="directory">Path of the index directory.</param>
        /// <param name="create">Passed to the <c>IndexWriter</c> create flag; false appends to an existing index.</param>
        public DataIndexer(string directory, bool create) {
            writer = new IndexWriter(FSDirectory.Open(directory), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), create, IndexWriter.MaxFieldLength.LIMITED);
            writer.UseCompoundFile = true;
        }

        /// <summary>
        /// Adds every record in <paramref name="list"/> to the index.
        /// </summary>
        /// <param name="list">Records to index; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
        public void AddHtmlData(List<luseneTxt> list)
        {
            if (list == null)
            {
                throw new ArgumentNullException("list");
            }
            foreach (luseneTxt t in list)
            {
                AddHtmlDocument(t);
            }
        }

        /// <summary>
        /// Adds one record to the index as a document with the stored fields
        /// "text", "path", "title", "des" and "keyword".
        /// </summary>
        /// <param name="lt">Record to index; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="lt"/> is null.</exception>
        public void AddHtmlDocument(luseneTxt lt)
        {
            if (lt == null)
            {
                throw new ArgumentNullException("lt");
            }
            Document doc = new Document();
            // Each field now receives the property of the same name. Previously the
            // "text" field was filled from title and the "title" field from text.
            // Null property values are indexed as empty strings, because the Field
            // constructor rejects a null value.
            doc.Add(new Field("text", lt.text ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            // The path is stored as a single untokenized term.
            doc.Add(new Field("path", lt.path ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("title", lt.title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("des", lt.des ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("keyword", lt.keyword ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

        /// <summary>
        /// Optimizes the index and releases the writer.
        /// </summary>
        public void Close()
        {
            try
            {
                writer.Optimize();
            }
            finally
            {
                // Always release the writer, even if optimizing fails, so the
                // index is not left held open by this instance.
                writer.Dispose();
            }
        }

        /// <summary>
        /// Deletes every document in the index.
        /// </summary>
        public void Delete()
        {
            writer.DeleteAll();
        }

    }

Lucene.Net搜索检索类(LuceneSearch):

/// <summary>
/// Facade over the Lucene index of the web application: builds, updates and
/// deletes the index and runs paged, highlighted searches against it.
/// </summary>
public class LuceneSearch
    {
        // Physical path of the index directory; defaults to ~/App_Data/index.
        private string indexDirectory = System.Web.HttpContext.Current.Server.MapPath("~/App_Data/index");

        /// <summary>
        /// Uses the default index location (~/App_Data/index).
        /// </summary>
        public LuceneSearch() {

        }

        /// <summary>
        /// Uses a custom index location.
        /// </summary>
        /// <param name="filePath">Virtual path of the index directory, mapped with Server.MapPath.</param>
        public LuceneSearch(string filePath) {
            indexDirectory = System.Web.HttpContext.Current.Server.MapPath(filePath);
        }

        /// <summary>
        /// Indexes the files under a directory.
        /// </summary>
        /// <param name="url">Virtual path of the directory to index.</param>
        /// <param name="pattenRegex">File name pattern, for example *.htm*.</param>
        public void CreatIndex(string url,string pattenRegex) {
            string dataDirectory = System.Web.HttpContext.Current.Server.MapPath(url);
            IntranetIndexer indexer = new IntranetIndexer(indexDirectory);
            try
            {
                indexer.AddDirectory(new DirectoryInfo(dataDirectory), pattenRegex);
            }
            finally
            {
                // Close even when indexing fails so the indexer is not left open.
                indexer.Close();
            }
        }

        /// <summary>
        /// Creates the index from the given records.
        /// </summary>
        /// <param name="list">Records to index.</param>
        public void CreatIndexByData(List<luseneTxt> list) {
            DataIndexer indexer = new DataIndexer(indexDirectory);
            try
            {
                indexer.AddHtmlData(list);
            }
            finally
            {
                indexer.Close();
            }
        }

        /// <summary>
        /// Adds the given records to the existing index.
        /// </summary>
        /// <param name="list">Records to add.</param>
        public void UpdateIndexByData(List<luseneTxt> list)
        {
            DataIndexer indexer = new DataIndexer(indexDirectory, false);
            try
            {
                indexer.AddHtmlData(list);
            }
            finally
            {
                indexer.Close();
            }
        }

        /// <summary>
        /// Removes every document from the index.
        /// </summary>
        public void DeleteIndex() {
            DataIndexer indexer = new DataIndexer(indexDirectory);
            try
            {
                indexer.Delete();
            }
            finally
            {
                indexer.Close();
            }
        }

        #region Search
        /// <summary>
        /// Runs a query and returns one page of results with highlighted fragments
        /// of the "text" field.
        /// </summary>
        /// <param name="q">Query text in Lucene query syntax.</param>
        /// <param name="colname">Default field to search.</param>
        /// <param name="pageSize">Maximum number of results per page.</param>
        /// <param name="page">Zero-based page index; negative values are treated as 0.</param>
        /// <returns>The results of the requested page; empty when the page is past the last hit.</returns>
        public List<SearchLucene> Search(string q, string colname, int pageSize, int page)
        {
            List<SearchLucene> list = new List<SearchLucene>();
            if (pageSize <= 0)
            {
                return list;
            }
            if (page < 0)
            {
                page = 0;
            }

            // The first hit of the page is page * pageSize. The previous code started at
            // "page" itself, so consecutive pages overlapped. long arithmetic keeps a very
            // large page number from overflowing.
            long firstHit = (long)page * pageSize;
            if (firstHit >= int.MaxValue)
            {
                return list;
            }
            int hitsNeeded = (int)Math.Min(firstHit + pageSize, (long)int.MaxValue);

            var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
            // "using" releases the directory and the searcher even when parsing or
            // searching throws.
            using (var directory = FSDirectory.Open(indexDirectory))
            using (IndexSearcher searcher = new IndexSearcher(directory))
            {
                var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, colname, analyzer);
                Query query = parser.Parse(q);

                // Ask for exactly as many top hits as are needed to reach the end of the
                // requested page, instead of a fixed 200.
                TopDocs hits = searcher.Search(query, hitsNeeded);

                // Highlighter that wraps matched terms in a bold span.
                IFormatter formatter = new SimpleHTMLFormatter("<span style=\"font-weight:bold;\">", "</span>");
                SimpleFragmenter fragmenter = new SimpleFragmenter(80);
                QueryScorer scorer = new QueryScorer(query);
                Highlighter highlighter = new Highlighter(formatter, scorer);
                highlighter.TextFragmenter = fragmenter;

                // Bound the loop by the hits actually returned. TotalHits can be larger
                // than ScoreDocs.Length, which previously caused an out-of-range index.
                int lastHit = Math.Min(hitsNeeded, hits.ScoreDocs.Length);
                for (int i = (int)firstHit; i < lastHit; i++)
                {
                    Document doc = searcher.Doc(hits.ScoreDocs[i].Doc);
                    // A document without a "text" field yields an empty highlight
                    // instead of a null-argument failure.
                    string body = doc.Get("text") ?? string.Empty;
                    TokenStream stream = analyzer.TokenStream("", new StringReader(body));
                    String highText = highlighter.GetBestFragments(stream, body, 2, "...");

                    SearchLucene sm = new SearchLucene();
                    sm.title = doc.Get("title");
                    sm.des = doc.Get("des");
                    sm.url = doc.Get("path");
                    sm.text = doc.Get("text");
                    sm.keyword = doc.Get("keyword");
                    sm.lightText = highText;
                    list.Add(sm);
                }
            }
            return list;
        }

        /// <summary>
        /// Returns the total number of documents matching the query.
        /// </summary>
        /// <param name="q">Query text in Lucene query syntax.</param>
        /// <param name="colname">Default field to search.</param>
        /// <returns>Total hit count reported by the searcher.</returns>
        public int getSearchCount(string q, string colname)
        {
            var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
            // The searcher was never released here before; "using" disposes it.
            using (var directory = FSDirectory.Open(indexDirectory))
            using (IndexSearcher searcher = new IndexSearcher(directory))
            {
                var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, colname, analyzer);
                Query query = parser.Parse(q);
                // Only the total is needed, so a single top hit is enough.
                TopDocs hits = searcher.Search(query, 1);
                return hits.TotalHits;
            }
        }

        #endregion

    }
    /// <summary>
    /// A record to be written to the search index.
    /// </summary>
    public class luseneTxt
    {
        /// <summary>Title of the record.</summary>
        public string title { get; set; }

        /// <summary>Path (URL) of the record.</summary>
        public string path { get; set; }

        /// <summary>Body text of the record.</summary>
        public string text { get; set; }

        /// <summary>Description of the record.</summary>
        public string des { get; set; }

        /// <summary>Keywords of the record.</summary>
        public string keyword { get; set; }
    }
    /// <summary>
    /// One search result as returned by <c>LuceneSearch.Search</c>.
    /// </summary>
    public class SearchLucene
    {
        /// <summary>Value of the "title" field of the hit.</summary>
        public string title { get; set; }

        /// <summary>Value of the "des" field of the hit.</summary>
        public string des { get; set; }

        /// <summary>Value of the "keyword" field of the hit.</summary>
        public string keyword { get; set; }

        /// <summary>Value of the "path" field of the hit.</summary>
        public string url { get; set; }

        /// <summary>Value of the "text" field of the hit.</summary>
        public string text { get; set; }

        /// <summary>Highlighted HTML fragments built from the "text" field.</summary>
        public string lightText { get; set; }
    }

分页类库:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace LuceneSolution.Code
{
    /// <summary>
    /// Renders the HTML of a pager bar (first/previous/next/last links, numbered page
    /// links and a "jump to page" control). The class only formats page links; it does
    /// not fetch or page any data itself.
    /// Typical use: construct with the paging values, then call GetShowPageStr("2", 3, 10).
    /// </summary>
    /// <remarks>
    /// TurnUrlStr and Parameters are embedded verbatim into href attributes and inline
    /// JavaScript, so callers must pass values that are already URL-encoded.
    /// </remarks>
    public class SqlPager
    {
        #region Construction
        /// <summary>
        /// Creates a pager with the default settings of the given language.
        /// </summary>
        /// <param name="Language">"EN" for English texts; anything else selects Chinese.</param>
        public SqlPager(string Language)
        {
            this.unitLa(Language);
        }

        /// <summary>
        /// Creates a pager with the language defaults and the given paging values.
        /// </summary>
        /// <param name="Language">"EN" for English texts; anything else selects Chinese.</param>
        /// <param name="MyPageSize">Records per page.</param>
        /// <param name="Counts">Total number of records.</param>
        /// <param name="TurnUrlStr">URL of the page the links point to (the current page).</param>
        /// <param name="MyParameters">Extra query-string parameters, without a leading '?' or '&amp;'.</param>
        /// <param name="PageIndex">Page to display, starting at 1.</param>
        public SqlPager(string Language, int MyPageSize, int Counts, string TurnUrlStr, string MyParameters, int PageIndex)
        {
            // unitLa already selects the template for the language. The template is no
            // longer overwritten here, which used to force the Chinese layout for "EN".
            this.unitLa(Language);
            this.PageSize = MyPageSize;
            this.TotalRecord = Counts;
            this.TurnUrlStr = TurnUrlStr;
            this.Parameters = MyParameters;
            this.PageIndex = PageIndex;
        }

        /// <summary>
        /// Resets every setting to the defaults of the given language.
        /// </summary>
        /// <param name="Language">"EN" for English texts; anything else selects Chinese.</param>
        protected void unitLa(string Language)
        {
            // Defaults shared by both languages.
            TurnUrlStr = "";
            PageSize = 20;
            PageIndex = 1;
            TotalRecord = 0;
            TotalPage = 0;
            TurnControl = "";
            PageNumber = "";
            ShowPageStr = "";

            if (Language == "EN")
            {
                PageString = "Page [PageIndex]/[TotalPage]  Info [PageSize]/[TotalRecord]  [FirstStr] [PrevStr] [PageNumber] [NextStr] [LastStr]  [TurnControl]";
                FirstStr = "<<";
                PrevStr = "<";
                NextStr = ">";
                LastStr = ">>";
                NoRecord = "Total 0";
                ButtonStr = " GO ";
            }
            else
            {
                PageString = "第[PageIndex]页/共[TotalPage]页  第每[PageSize]条/共[TotalRecord]条  [FirstStr] [PrevStr] [PageNumber] [NextStr] [LastStr]  [TurnControl]";
                FirstStr = "首页";
                PrevStr = "上一页";
                NextStr = "下一页";
                LastStr = "尾页";
                NoRecord = "共有 0 条信息";
                ButtonStr = "跳转";
            }
        }

        #endregion
        #region Public fields
        /// <summary>
        /// Output template; the bracketed placeholders decide what is shown.
        /// </summary>
        public string PageString;

        /// <summary>
        /// URL the page links point to.
        /// </summary>
        public string TurnUrlStr;

        /// <summary>
        /// Extra query-string parameters for the links, without a leading '?' or '&amp;'.
        /// </summary>
        public string Parameters;

        /// <summary>
        /// Records per page.
        /// </summary>
        public int PageSize;

        /// <summary>
        /// Page to display, starting at 1.
        /// </summary>
        public int PageIndex;

        /// <summary>
        /// Total number of records.
        /// </summary>
        public int TotalRecord;

        /// <summary>
        /// Text of the "first page" link.
        /// </summary>
        public string FirstStr;

        /// <summary>
        /// Text of the "previous page" link.
        /// </summary>
        public string PrevStr;

        /// <summary>
        /// Text of the "next page" link.
        /// </summary>
        public string NextStr;

        /// <summary>
        /// Text of the "last page" link.
        /// </summary>
        public string LastStr;

        /// <summary>
        /// Message shown when there are no records.
        /// </summary>
        public string NoRecord;

        /// <summary>
        /// Caption of the jump button.
        /// </summary>
        public string ButtonStr;
        #endregion
        #region Private fields
        /// <summary>
        /// Total number of pages.
        /// </summary>
        private int TotalPage;

        /// <summary>
        /// HTML of the jump-to-page control.
        /// </summary>
        private string TurnControl;

        /// <summary>
        /// HTML of the numbered page links (for example 1 2 3).
        /// </summary>
        private string PageNumber;

        /// <summary>
        /// The complete pager HTML.
        /// </summary>
        private string ShowPageStr;

        #endregion
        #region Rendering
        /// <summary>
        /// Builds the pager HTML for the current settings.
        /// </summary>
        /// <param name="Style">"1" shows only first/previous/next/last; "2" also shows a window of page numbers.</param>
        /// <param name="PlaceIn">Position of the current page inside the window (style 2 only).</param>
        /// <param name="ShowNum">Number of page numbers in the window (style 2 only).</param>
        /// <returns>The pager HTML, or the NoRecord message when there are no records.</returns>
        public string GetShowPageStr(string Style, int PlaceIn, int ShowNum)
        {
            TotalPage = (TotalRecord + PageSize - 1) / PageSize;
            // Start from a clean slate so a previous call cannot leak into this one.
            PageNumber = "";

            // No records: show the configured message. Previously it was assigned and
            // then unconditionally overwritten by the template, so it never appeared.
            if (TotalRecord <= 0)
            {
                ShowPageStr = NoRecord;
                return ShowPageStr;
            }

            // Clamp the requested page into the valid range.
            if (PageIndex < 1)
            {
                PageIndex = 1;
            }
            if (PageIndex > TotalPage)
            {
                PageIndex = TotalPage;
            }

            // Jump control: a text box plus a button that navigate to the typed page.
            string jumpScript = "window.location.href='" + TurnUrlStr + "?Page=' + document.getElementById('txtPageGo').value + '" + "&" + Parameters + "'";
            TurnControl = "<input value='" + PageIndex.ToString() + "' id='txtPageGo' name='txtPageGo' type='text' style='width:35px;' onkeydown=\"if(event.keyCode==13)" + jumpScript + "\"><input name='btnGo' type='button' id='btnGo' value='" + ButtonStr + "' onclick=\"javascript:" + jumpScript + "\">";

            // Build the navigation links in locals. The public FirstStr/PrevStr/NextStr/
            // LastStr fields used to be overwritten with the anchor markup, so a second
            // call wrapped anchors inside anchors.
            string firstLink = FirstStr;
            string prevLink = PrevStr;
            string nextLink = NextStr;
            string lastLink = LastStr;
            if (TotalPage > 1)
            {
                // No link to first/previous on the first page.
                if (PageIndex != 1)
                {
                    firstLink = BuildLink(1, FirstStr);
                    prevLink = BuildLink(PageIndex - 1, PrevStr);
                }
                // No link to next/last on the last page.
                if (PageIndex != TotalPage)
                {
                    nextLink = BuildLink(PageIndex + 1, NextStr);
                    lastLink = BuildLink(TotalPage, LastStr);
                }
            }

            // Style 1 shows no page numbers; style 2 shows a window of them.
            if (Style == "2")
            {
                PageNumber = BuildPageNumbers(PlaceIn, ShowNum);
            }

            // Fill the template; the replacement order matches the original code.
            ShowPageStr = PageString;
            ShowPageStr = ShowPageStr.Replace("[PageIndex]", PageIndex.ToString());
            ShowPageStr = ShowPageStr.Replace("[TotalPage]", TotalPage.ToString());
            ShowPageStr = ShowPageStr.Replace("[PageSize]", PageSize.ToString());
            ShowPageStr = ShowPageStr.Replace("[TotalRecord]", TotalRecord.ToString());
            ShowPageStr = ShowPageStr.Replace("[FirstStr]", firstLink);
            ShowPageStr = ShowPageStr.Replace("[PrevStr]", prevLink);
            ShowPageStr = ShowPageStr.Replace("[NextStr]", nextLink);
            ShowPageStr = ShowPageStr.Replace("[LastStr]", lastLink);
            ShowPageStr = ShowPageStr.Replace("[TurnControl]", TurnControl);
            ShowPageStr = ShowPageStr.Replace("[PageNumber]", PageNumber);

            return ShowPageStr;
        }

        /// <summary>
        /// Builds an anchor that navigates to the given page.
        /// </summary>
        private string BuildLink(int page, string text)
        {
            return "<a href=\"" + TurnUrlStr + "?Page=" + Convert.ToString(page) + "&" + Parameters + "\">" + text + "</a>";
        }

        /// <summary>
        /// Builds the numbered page links of style 2; the current page is bold and not linked.
        /// </summary>
        private string BuildPageNumbers(int PlaceIn, int ShowNum)
        {
            // Number of page links kept in front of the current page.
            int pagesBefore;
            if (PlaceIn < 1 || PlaceIn > ShowNum)
            {
                pagesBefore = 2; // invalid position: keep two in front, current page is third
            }
            else
            {
                pagesBefore = PlaceIn - 1;
            }

            int firstShown;
            int lastShown;
            if (TotalPage <= ShowNum)
            {
                // Everything fits: show all pages.
                firstShown = 1;
                lastShown = TotalPage;
            }
            else if (PageIndex <= pagesBefore)
            {
                // Near the start: show the first window.
                firstShown = 1;
                lastShown = ShowNum;
            }
            else if (PageIndex >= TotalPage - ShowNum + pagesBefore + 1)
            {
                // Near the end: show the last window.
                firstShown = TotalPage - ShowNum + 1;
                lastShown = TotalPage;
            }
            else
            {
                // In the middle: slide the window so the current page keeps its position.
                firstShown = PageIndex - pagesBefore;
                lastShown = firstShown + ShowNum - 1;
            }

            StringBuilder html = new StringBuilder();
            for (int page = firstShown; page <= lastShown; page++)
            {
                if (page == PageIndex)
                {
                    html.Append("<b>").Append(Convert.ToString(page)).Append("</b>  ");
                }
                else
                {
                    html.Append(BuildLink(page, "[" + Convert.ToString(page) + "]")).Append("  ");
                }
            }
            return html.ToString();
        }
        #endregion
        #region Notes
        /*
        General-purpose .NET pager class (written for stored-procedure paging; the
        display can be fully customized, including Chinese/English texts).
        The idea: paging itself is done in the database by a stored procedure, so this
        class does not touch any data and only formats the page links. It works well
        together with ROW_NUMBER() in SQL Server 2005.
        The display can be fully customized; images work too, as long as the strings
        are set to the corresponding image HTML.
        Screenshot: http://images.cnblogs.com/cnblogs_com/84ww/128905/r_PageStore.gif
        */
        #endregion
        #region Usage example
        /*
        protected void BindData()
        {
            SqlPager MyPage = new SqlPager("EN");              // create the pager
            MyPage.PageSize = MyPageSize;                      // records per page
            MyPage.TotalRecord = Counts;                       // total number of records
            MyPage.TurnUrlStr = "AskSearchList.aspx";          // page the links point to (current page)
            MyPage.Parameters = MyParameters;                  // extra query-string parameters
            MyPage.PageIndex = PageIndex;                      // page to display
            // output template
            MyPage.PageString = "第[PageIndex]页/共[TotalPage]页  第每[PageSize]条/共[TotalRecord]条  [FirstStr] [PrevStr] [PageNumber] [NextStr] [LastStr]  [TurnControl]";

            string strPage = MyPage.GetShowPageStr("2", 3, 10);
            labShowPage.Text = strPage;
        }
        */
        #endregion

    }
    
}


全部的代码都奉献上了,如果大家有更好的写法,更好的见解,欢迎拍砖。


探索Lucene.Net全文检索

标签:lucene.net   全文检索   

原文地址:http://blog.csdn.net/mofijeck/article/details/42081007

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!