What is Lucene: Apache Lucene
Apache Lucene(TM) is a high-performance, full-featured text search engine library written entirely in Java. It is a technology suitable for nearly any application that requires full-text search, especially cross-platform.
Apache Lucene is an open source project available for free download.
Some notes from my experience with Lucene
My first experience with Lucene was not with the Java library but with Lucene.Net.dll under .NET, which I used to build a full-text search engine for searching the documents that users had uploaded.
Full-text search is easier for English than for Chinese, because English words are delimited by spaces, whereas Chinese has to be segmented according to meaning: word boundaries are fuzzy and hard to pin down. The classic example:
长春市长春药店 → 长春市 | 长春药店 (Changchun City | Changchun Pharmacy), or → 长春市长 | 春药店 (the Mayor of Changchun | aphrodisiac shop) ...
Chinese word segmentation algorithms fall into a few broad families: string-matching (dictionary-based) segmentation, comprehension-based segmentation, and statistics-based segmentation.
There is plenty of material about these algorithms online if you are interested.
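To make the first family concrete, here is a toy sketch of forward maximum matching against a tiny hard-coded dictionary. The dictionary contents, the FmmDemo class and the four-character window are purely illustrative and say nothing about how KTDictSeg works internally.

using System;
using System.Collections.Generic;

class FmmDemo
{
    // A tiny dictionary, just enough to reproduce the ambiguity above
    static readonly HashSet<string> Dict = new HashSet<string>
    {
        "长春", "长春市", "长春市长", "长春药店", "春药店", "市长", "药店"
    };

    // Forward maximum matching: at each position greedily take the longest
    // dictionary word, falling back to a single character.
    static List<string> Segment(string text, int maxWordLength)
    {
        List<string> result = new List<string>();
        int pos = 0;
        while (pos < text.Length)
        {
            int len = Math.Min(maxWordLength, text.Length - pos);
            while (len > 1 && !Dict.Contains(text.Substring(pos, len)))
            {
                len--;
            }
            result.Add(text.Substring(pos, len));
            pos += len;
        }
        return result;
    }

    static void Main()
    {
        // Greedy matching grabs 长春市长 first and prints 长春市长 | 春药店,
        // which is exactly the wrong reading shown above.
        Console.WriteLine(string.Join(" | ", Segment("长春市长春药店", 4)));
    }
}

Because the greedy match always takes the longest word first, it lands on exactly the wrong reading, which is why practical segmenters combine dictionaries with statistics or other disambiguation rules.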
The segmentation component I used in this system was written by eaglet, who is also the author of the well-known Pan Gu (盘古) segmenter.
OK, now let me walk through how I used Lucene.Net.dll together with the KTDictSeg segmentation component to implement full-text search:
1. Add the references to the project, as follows:
Lucene.Net.dll
together with the KTDictSeg assemblies that provide the Lucene.Net.Analysis.KTDictSeg and KTDictSeg.HighLight namespaces used in the code below (the exact file names depend on the KTDictSeg release you download).
2. The project now has both Lucene and Chinese word segmentation support. Add a class whose fields mirror the fields of the documents you want to index.
Here, suppose we add such a class (the code below refers to it as TheIndex) with the fields DomID, Title, SynTime, HtmPath, and KeyWords.
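For reference, a minimal sketch of what that class might look like; the property types are my assumptions based on how the fields are used in the search code.

using System;

namespace LearnAndPracticeBLL
{
    // Assumed entity class: one instance per uploaded document.
    public class TheIndex
    {
        public int DomID { get; set; }       // database id of the document
        public string Title { get; set; }    // document title
        public string SynTime { get; set; }  // upload/sync time, kept as a formatted string
        public string HtmPath { get; set; }  // path of the HTML file, relative to the web root
        public string KeyWords { get; set; } // keywords attached to the document
        public string Content { get; set; }  // plain text of the document, filled in by the search code
    }
}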
3. With that class in place, add a search class that builds the index and runs searches. The code is as follows:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using Lucene.Net;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Lucene.Net.Analysis.KTDictSeg;
using KTDictSeg.HighLight;

namespace LearnAndPracticeBLL
{
    public class DictLucenceSearch
    {
        // Total number of hits returned by the last search
        private int _count;
        public int Count
        {
            get { return _count; }
            set { _count = value; }
        }

        // The search keywords (after segmentation)
        private string _keyword;
        public string KeyWord
        {
            get { return _keyword; }
            set { _keyword = value; }
        }

        // Time taken by the last search, in seconds
        private double _time;
        public double Time
        {
            get { return _time; }
            set { _time = value; }
        }

        // Path of the index directory (relative to the web root)
        private string _myindex;
        public string Myindex
        {
            get { return _myindex; }
            set
            {
                _myindex = System.Web.HttpContext.Current.Server.MapPath("~/" + value);
            }
        }

        /// <summary>
        /// Adds a set of documents to the index.
        /// </summary>
        /// <param name="writer">the IndexWriter to write with</param>
        /// <param name="mydocs">the documents to index</param>
        public static void AddDocument(IndexWriter writer, List<TheIndex> mydocs)
        {
            for (int i = 0; i < mydocs.Count; i++)
            {
                Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                Field DomID = new Field("DomID", mydocs[i].DomID.ToString(), Field.Store.YES, Field.Index.TOKENIZED);
                Field Title = new Field("Title", mydocs[i].Title, Field.Store.YES, Field.Index.TOKENIZED);
                Field HtmPath = new Field("HtmlPath", mydocs[i].HtmPath, Field.Store.YES, Field.Index.NO);
                Field KeyWords = new Field("KeyWords", mydocs[i].KeyWords, Field.Store.YES, Field.Index.TOKENIZED);
                // Stored so it can be retrieved and displayed with each hit
                Field SynTime = new Field("SynTime", mydocs[i].SynTime.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED);
                Field Content = new Field("Content", GetHTML(mydocs[i].HtmPath), Field.Store.YES, Field.Index.TOKENIZED);

                // Boost the more important fields (default boost is 1)
                Title.SetBoost(1.5f);
                KeyWords.SetBoost(1.2f);

                doc.Add(DomID);
                doc.Add(Title);
                doc.Add(HtmPath);
                doc.Add(KeyWords);
                doc.Add(SynTime);
                doc.Add(Content);

                writer.AddDocument(doc);
            }
        }

        /// <summary>
        /// Builds the index for a list of documents.
        /// </summary>
        public void Index(List<TheIndex> doms)
        {
            string INDEX_STORE_PATH = Myindex;

            // Use eaglet's KTDictSeg analyzer
            Analyzer analyzer = new KTDictSegAnalyzer();

            // The default Lucene.Net analyzer
            //Analyzer analyzer = new StandardAnalyzer();

            FSDirectory fsDir;
            if (System.IO.Directory.Exists(INDEX_STORE_PATH))
            {
                fsDir = FSDirectory.GetDirectory(INDEX_STORE_PATH, false);
            }
            else
            {
                fsDir = FSDirectory.GetDirectory(INDEX_STORE_PATH, true);
            }

            IndexWriter writer = new IndexWriter(fsDir, analyzer, true);

            AddDocument(writer, doms);
            writer.Optimize();
            writer.Close();
        }

        /// <summary>
        /// Segments the search phrase into words.
        /// </summary>
        /// <param name="keyWords">the phrase to search for</param>
        /// <param name="ktTokenizer">the tokenizer used for segmentation</param>
        /// <returns>the segmented words, space separated and boosted by rank</returns>
        public string GetKeyWordSplitBySpace(string keyWords, KTDictSegTokenizer ktTokenizer)
        {
            StringBuilder builder = new StringBuilder();
            List<FTAlgorithm.T_WordInfo> words = ktTokenizer.SegmentToWordInfos(keyWords);

            foreach (FTAlgorithm.T_WordInfo word in words)
            {
                if (word == null)
                {
                    continue;
                }

                KeyWord = KeyWord + word.Word + ",";
                // Boost each word by 3^rank, separated by spaces
                builder.AppendFormat("{0}^{1} ", word.Word, (int)Math.Pow(3, word.Rank));
            }
            KeyWord = KeyWord.Substring(0, KeyWord.Length - 1);
            return builder.ToString().Trim();
        }

        /// <summary>
        /// Searches the index.
        /// </summary>
        /// <param name="keyWord">the search keywords</param>
        /// <param name="pageNumber">offset of the first hit on the current page</param>
        /// <param name="pageSize">number of hits per page</param>
        /// <returns>the hits for the current page</returns>
        public List<TheIndex> Search(string keyWord, int pageNumber, int pageSize)
        {
            string word = GetKeyWordSplitBySpace(keyWord, new KTDictSegTokenizer());

            IndexSearcher search = new IndexSearcher(Myindex);
            KTDictSegAnalyzer analyzer = new KTDictSegAnalyzer(true);

            // Search across several fields at once
            MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "Title", "Content", "KeyWords" }, analyzer);

            // Parse the segmented keywords into a query
            Query query = parser.Parse(word);
            Hits hits = search.Search(query);
            Count = hits.Length();

            int num = 0; // index just past the last hit of the current page
            if (Count < pageNumber + pageSize)
            {
                num = Count;
            }
            else
            {
                num = pageSize + pageNumber;
            }

            DateTime begin = DateTime.Now;

            List<TheIndex> theindexs = new List<TheIndex>();
            for (int i = 0; i < num; i++)
            {
                // Use the KTDictSeg.HighLight component to highlight the keywords in the results
                KTDictSeg.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                    new KTDictSeg.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");

                KTDictSeg.HighLight.Highlighter highlighter =
                    new KTDictSeg.HighLight.Highlighter(simpleHTMLFormatter,
                        new Lucene.Net.Analysis.KTDictSeg.KTDictSegTokenizer());

                highlighter.FragmentSize = 300;

                TheIndex theindex = new TheIndex();
                theindex.DomID = Convert.ToInt32(hits.Doc(i).Get("DomID"));
                theindex.Title = hits.Doc(i).Get("Title");

                string title = highlighter.GetBestFragment(keyWord, theindex.Title);
                if (!string.IsNullOrEmpty(title))
                {
                    theindex.Title = title;
                }

                theindex.HtmPath = hits.Doc(i).Get("HtmlPath");
                theindex.Content = hits.Doc(i).Get("Content");

                string content = highlighter.GetBestFragment(keyWord, theindex.Content);
                if (!string.IsNullOrEmpty(content))
                {
                    theindex.Content = content;
                }

                theindex.Content = theindex.Content.Substring(0, theindex.Content.Length > 300 ? 300 : theindex.Content.Length);
                theindex.SynTime = Convert.ToDateTime(hits.Doc(i).Get("SynTime")).ToString("yyyy年MM月dd日");
                theindex.KeyWords = hits.Doc(i).Get("KeyWords");

                string keywords = highlighter.GetBestFragment(keyWord, theindex.KeyWords);
                if (!string.IsNullOrEmpty(keywords))
                {
                    theindex.KeyWords = keywords;
                }

                theindexs.Add(theindex);
            }

            DateTime end = DateTime.Now;
            double ts = (end - begin).TotalMilliseconds;
            Time = ts / 1000.000;
            search.Close(); // close the searcher
            return theindexs;
        }

        // Reads an HTML file and extracts its text so it can be indexed
        public static string GetHTML(string Path)
        {
            System.Text.Encoding encoding = System.Text.Encoding.GetEncoding("gb2312");
            StreamReader sr = new StreamReader(HttpContext.Current.Server.MapPath("~/") + Path, encoding);

            string str = "";

            while (sr.Peek() != -1)
            {
                char[] buffer = new char[4096];
                int bufferFillSize = sr.ReadBlock(buffer, 0, 4096);
                str = str + new string(buffer, 0, bufferFillSize);
            }
            sr.Close();

            string strResult = findUsedFromHtml(str);
            return strResult;
        }

        /// <summary>
        /// Extracts the body of the HTML and strips the markup.
        /// </summary>
        /// <param name="strHtml">the raw HTML</param>
        /// <returns>the plain text of the body</returns>
        private static string findUsedFromHtml(string strHtml)
        {
            string strBody;

            int bodyStart = strHtml.IndexOf("<body");
            int bodyEnd = strHtml.IndexOf("</body>");

            // The body part, with the tags stripped
            strBody = StripHTML(strHtml.Substring(bodyStart, bodyEnd - bodyStart + 7));

            return strBody;
        }

        /// <summary>
        /// Removes HTML tags and entities.
        /// </summary>
        /// <param name="HTML">the source HTML</param>
        /// <returns>the plain text</returns>
        public static string StripHTML(string HTML)
        {
            // Patterns to strip scripts, tags and common HTML entities (a minimal set; extend as needed)
            string[] Regexs =
            {
                @"<script[^>]*?>.*?</script>",
                @"<(.[^>]*)>",
                @"([\r\n])[\s]+",
                @"&(quot|#34);",
                @"&(amp|#38);",
                @"&(lt|#60);",
                @"&(gt|#62);",
                @"&(nbsp|#160);",
                @"&#(\d+);"
            };

            string[] Replaces =
            {
                "",
                "",
                "",
                "\"",
                "&",
                "<",
                ">",
                " ",
                ""
            };

            string strOutput = HTML;
            for (int i = 0; i < Regexs.Length; i++)
            {
                strOutput = new Regex(Regexs[i], RegexOptions.IgnoreCase | RegexOptions.Singleline).Replace(strOutput, Replaces[i]);
            }

            strOutput = strOutput.Replace("\r\n", "").Trim();
            return strOutput;
        }
    }
}
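Calling the class from a page is then straightforward. Here is a hypothetical sketch of a WebForms code-behind handler; the control names (txtKeyword, lblStats, rptResults), the "SearchIndex" folder and LoadDocumentsFromDatabase are placeholders of mine, not part of the original project.

using System;
using System.Collections.Generic;
using LearnAndPracticeBLL;

public partial class SearchPage : System.Web.UI.Page
{
    protected void btnSearch_Click(object sender, EventArgs e)
    {
        DictLucenceSearch searcher = new DictLucenceSearch();
        searcher.Myindex = "SearchIndex";               // index folder under the web root (placeholder name)

        // Rebuild the index first if the documents have changed:
        // searcher.Index(LoadDocumentsFromDatabase());  // placeholder data-access method

        // First page, 10 hits per page
        List<TheIndex> results = searcher.Search(txtKeyword.Text, 0, 10);

        lblStats.Text = string.Format("{0} hits in {1} seconds", searcher.Count, searcher.Time);
        rptResults.DataSource = results;                // bind to a Repeater to render titles and snippets
        rptResults.DataBind();
    }
}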
That is all it takes to index documents and search them. Next time I'll write about doing the same in Java.
I'm new to blogging and still a junior programmer, so please forgive the dry writing and the shallow coverage. My internship has kept me busy lately, so this is only a first draft; if you have any questions, feel free to email me.