龙空技术网

使用Lucene.Net实现全文检索

opendotnet 744

前言:

现在大家对“lucene搜索引擎”都比较关注,也都想学习一些与“lucene搜索引擎”相关的知识。为此,小编在网上汇集了一些有关“lucene搜索引擎”的文章,希望大家喜欢,一起来学习一下吧!

Lucene.net是Lucene的.net移植版本,是一个开源的全文检索引擎开发包,即它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎。

开发人员可以基于Lucene.net实现全文检索的功能。

Lucene.net是Apache软件基金会赞助的开源项目,基于Apache License协议。

Lucene.net并不是一个爬行搜索引擎,也不会自动地索引内容。我们得先将要索引的文档中的文本抽取出来,然后再将其加到Lucene.net索引中。标准的步骤是先初始化一个Analyzer、打开一个IndexWriter、然后再将文档一个接一个地加进去。一旦完成这些步骤,索引就可以在关闭前得到优化,同时所做的改变也会生效。这个过程可能比开发者习惯的方式更加手工化一些,但却在数据的索引上给予你更多的灵活性,而且其效率也很高。

获取索引目录

 /// <summary>
/// Opens the file-system Lucene directory for the given index type.
/// The path is the "LuceneIndexPath" app setting combined with the value
/// returned by <c>EnumHelper.GetEnumDescription(index)</c>.
/// </summary>
/// <param name="index">Index type that selects the sub-folder.</param>
/// <returns>The opened index directory.</returns>
/// <exception cref="Exception">
/// Thrown when the directory cannot be opened; the failure is logged first and
/// the original error is available through <c>InnerException</c>.
/// </exception>
private LcStore.Directory GetLuceneDirectory(IndexType index)
{
    // Declared outside the try block so the catch block can log whatever path was built.
    var indexPath = string.Empty;
    try
    {
        var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");
        var indexName = Enum.EnumHelper.GetEnumDescription(index);
        indexPath = Path.Combine(dirPath, indexName);
        return LcStore.FSDirectory.Open(indexPath);
    }
    catch (Exception ex)
    {
        NLogger.Write($"获取索引目录失败" + Environment.NewLine +
                      $"路径:{indexPath}" + Environment.NewLine +
                      $"异常信息:{ex}",
            "Lucene", "x", "x",
            CustomException.UnknownError, CustomLogLevel.Error);

        // Chain the original exception so callers keep the root cause and stack trace.
        throw new Exception("获取索引目录异常,详情请查看相关日志", ex);
    }
}

#endregion 获取目录

盘古分词
 /// <summary>
/// Splits a sentence into terms using the PanGu analyzer.
/// </summary>
/// <param name="keyword">Sentence to tokenize.</param>
/// <returns>
/// The terms in the order the analyzer produced them, or <c>null</c> when
/// <paramref name="keyword"/> is <c>null</c> or yields no terms.
/// </returns>
/// <exception cref="Exception">
/// Thrown when tokenizing fails; the failure is logged first and the original
/// error is available through <c>InnerException</c>.
/// </exception>
public string[] GetSplitKeywords(string keyword)
{
    if (keyword == null)
    {
        return null;
    }

    try
    {
        // Collect terms in a list instead of joining and re-splitting on '|',
        // which would break any term that itself contains '|'.
        var terms = new List<string>();

        using (var reader = new StringReader(keyword))
        {
            var ts = PanguAnalyzer.TokenStream(keyword, reader);
            while (ts.IncrementToken())
            {
                var ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                terms.Add(ita.Term);
            }
        }

        // PanguAnalyzer is a shared member that the index writers also use,
        // so it is deliberately not closed here.
        return terms.Count == 0 ? null : terms.ToArray();
    }
    catch (Exception ex)
    {
        NLogger.Write("分词异常" + Environment.NewLine +
                      $"关键词:{keyword}" + Environment.NewLine +
                      $"异常信息:{ex}",
            "Lucene", "x", "x",
            CustomException.UnknownError, CustomLogLevel.Error);

        // Chain the original exception so callers keep the root cause and stack trace.
        throw new Exception("分词出现异常,详情请查看相关日志", ex);
    }
}

#endregion 分词

创建索引或追加索引
 /// <summary>
/// Adds documents to the index of the given type, creating the index first
/// when it does not exist yet, then optimizes the index.
/// </summary>
/// <param name="dataList">Documents to add; nothing happens when null or empty.</param>
/// <param name="index">Index type that selects the target directory.</param>
public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
{
    if (dataList == null || dataList.Count == 0)
    {
        return;
    }

    // The constructor without the "create" flag appends to an existing index and
    // creates a new one only when none exists. The previous "try append, on any
    // error recreate" approach could wipe a healthy index on a transient failure.
    // The using blocks release the writer (and its write lock) and the directory
    // even when adding a document throws.
    using (var directory = GetLuceneDirectory(index))
    using (var writer = new IndexWriter(directory, PanguAnalyzer, IndexWriter.MaxFieldLength.LIMITED))
    {
        writer.MergeFactor = 1000;

        foreach (var doc in dataList)
        {
            writer.AddDocument(doc);
        }

        writer.Optimize();
    }
}

完整代码
 /// <summary>
/// Helper around Lucene.Net: tokenizes text with the PanGu analyzer, maintains
/// the on-disk indexes and runs queries whose hits are mapped onto entity objects.
/// </summary>
public class LuceneHelper
{
    /// <summary>
    /// Private constructor; the only instance is exposed through <see cref="Instance"/>.
    /// </summary>
    private LuceneHelper()
    {
    }

    #region Properties

    // Lazy<T> guarantees a single instance even when several threads race on first access.
    private static readonly Lazy<LuceneHelper> LazyInstance =
        new Lazy<LuceneHelper>(() => new LuceneHelper());

    /// <summary>
    /// The single shared instance.
    /// </summary>
    public static LuceneHelper Instance => LazyInstance.Value;

    // Created on first use and then shared by tokenizing and by every index writer.
    private readonly Lazy<Analyzer> _analyzer =
        new Lazy<Analyzer>(() => new PanGuAnalyzer());

    /// <summary>
    /// The analyzer used for tokenizing and indexing.
    /// </summary>
    private Analyzer PanguAnalyzer => _analyzer.Value;

    #endregion Properties

    #region Directory

    /// <summary>
    /// Opens the file-system Lucene directory for the given index type.
    /// The path is the "LuceneIndexPath" app setting combined with the value
    /// returned by <c>EnumHelper.GetEnumDescription(index)</c>.
    /// </summary>
    /// <param name="index">Index type that selects the sub-folder.</param>
    /// <returns>The opened index directory.</returns>
    /// <exception cref="Exception">
    /// Thrown when the directory cannot be opened; the failure is logged first and
    /// the original error is available through <c>InnerException</c>.
    /// </exception>
    private LcStore.Directory GetLuceneDirectory(IndexType index)
    {
        // Declared outside the try block so the catch block can log whatever path was built.
        var indexPath = string.Empty;
        try
        {
            var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");
            var indexName = Enum.EnumHelper.GetEnumDescription(index);
            indexPath = Path.Combine(dirPath, indexName);
            return LcStore.FSDirectory.Open(indexPath);
        }
        catch (Exception ex)
        {
            NLogger.Write($"获取索引目录失败" + Environment.NewLine +
                          $"路径:{indexPath}" + Environment.NewLine +
                          $"异常信息:{ex}",
                "Lucene", "x", "x",
                CustomException.UnknownError, CustomLogLevel.Error);

            // Chain the original exception so callers keep the root cause and stack trace.
            throw new Exception("获取索引目录异常,详情请查看相关日志", ex);
        }
    }

    #endregion Directory

    #region Tokenizing

    /// <summary>
    /// Splits a sentence into terms using the PanGu analyzer.
    /// </summary>
    /// <param name="keyword">Sentence to tokenize.</param>
    /// <returns>
    /// The terms in the order the analyzer produced them, or <c>null</c> when
    /// <paramref name="keyword"/> is <c>null</c> or yields no terms.
    /// </returns>
    /// <exception cref="Exception">
    /// Thrown when tokenizing fails; the failure is logged first and the original
    /// error is available through <c>InnerException</c>.
    /// </exception>
    public string[] GetSplitKeywords(string keyword)
    {
        if (keyword == null)
        {
            return null;
        }

        try
        {
            // Collect terms in a list instead of joining and re-splitting on '|',
            // which would break any term that itself contains '|'.
            var terms = new List<string>();

            using (var reader = new StringReader(keyword))
            {
                var ts = PanguAnalyzer.TokenStream(keyword, reader);
                while (ts.IncrementToken())
                {
                    var ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                    terms.Add(ita.Term);
                }
            }

            // The analyzer is shared with the index writers, so it is not closed here.
            return terms.Count == 0 ? null : terms.ToArray();
        }
        catch (Exception ex)
        {
            NLogger.Write("分词异常" + Environment.NewLine +
                          $"关键词:{keyword}" + Environment.NewLine +
                          $"异常信息:{ex}",
                "Lucene", "x", "x",
                CustomException.UnknownError, CustomLogLevel.Error);

            throw new Exception("分词出现异常,详情请查看相关日志", ex);
        }
    }

    #endregion Tokenizing

    #region Index maintenance

    /// <summary>
    /// Adds documents to the index of the given type, creating the index first
    /// when it does not exist yet, then optimizes the index.
    /// </summary>
    /// <param name="dataList">Documents to add; nothing happens when null or empty.</param>
    /// <param name="index">Index type that selects the target directory.</param>
    public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
    {
        if (dataList == null || dataList.Count == 0)
        {
            return;
        }

        // The constructor without the "create" flag appends to an existing index and
        // creates a new one only when none exists. The previous "try append, on any
        // error recreate" approach could wipe a healthy index on a transient failure.
        // The using blocks release the writer (and its write lock) and the directory
        // even when adding a document throws.
        using (var directory = GetLuceneDirectory(index))
        using (var writer = new IndexWriter(directory, PanguAnalyzer, IndexWriter.MaxFieldLength.LIMITED))
        {
            writer.MergeFactor = 1000;

            foreach (var doc in dataList)
            {
                writer.AddDocument(doc);
            }

            writer.Optimize();
        }
    }

    /// <summary>
    /// Deletes every document whose <paramref name="field"/> contains the exact
    /// term <paramref name="value"/>, then optimizes the index.
    /// </summary>
    /// <param name="field">Field name of the term to match.</param>
    /// <param name="value">Field value of the term to match.</param>
    /// <param name="index">Index type that selects the target directory.</param>
    /// <exception cref="Exception">
    /// Thrown when the deletion fails; the failure is logged first and the original
    /// error is available through <c>InnerException</c>.
    /// </exception>
    public void DeleteIndexes(string field, string value, IndexType index)
    {
        using (var directory = GetLuceneDirectory(index))
        {
            try
            {
                // create: false - the index must already exist.
                using (var writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED))
                {
                    writer.DeleteDocuments(new Term(field, value));
                    writer.Optimize();
                }
            }
            catch (Exception ex)
            {
                NLogger.Write("删除索引异常" + Environment.NewLine +
                              $"异常信息:{ex}", "Lucene", "x", "x",
                    CustomException.UnknownError, CustomLogLevel.Error);

                throw new Exception("删除索引异常,详情请查看相关日志", ex);
            }
        }
    }

    /// <summary>
    /// Replaces the documents matching the given term with <paramref name="doc"/>.
    /// The update is a delete followed by an add, so <paramref name="doc"/> must carry
    /// every field that should survive; fields it omits are gone afterwards.
    /// </summary>
    /// <param name="field">Field name of the term to match.</param>
    /// <param name="value">Field value of the term to match.</param>
    /// <param name="doc">Replacement document.</param>
    /// <param name="index">Index type that selects the target directory.</param>
    /// <exception cref="Exception">
    /// Thrown when the update fails; the failure is logged first and the original
    /// error is available through <c>InnerException</c>.
    /// </exception>
    public void UpdateIndexes(string field, string value, Document doc, IndexType index)
    {
        using (var directory = GetLuceneDirectory(index))
        {
            try
            {
                // create: false - the index must already exist.
                using (var writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED))
                {
                    writer.UpdateDocument(new Term(field, value), doc);
                }
            }
            catch (Exception ex)
            {
                NLogger.Write("更新索引异常" + Environment.NewLine +
                              $"异常信息:{ex}", "Lucene", "x", "x",
                    CustomException.UnknownError, CustomLogLevel.Error);

                throw new Exception("更新索引异常,详情请查看相关日志", ex);
            }
        }
    }

    #endregion Index maintenance

    #region Querying

    /// <summary>
    /// Returns up to <paramref name="count"/> hits for the given fields and keywords.
    /// </summary>
    /// <typeparam name="T">Entity type the hits are mapped onto.</typeparam>
    /// <param name="fields">Fields to search in.</param>
    /// <param name="keywords">Terms to search for.</param>
    /// <param name="index">Index type that selects the directory to search.</param>
    /// <param name="sort">Sort order; when null the searcher's default ordering is used.</param>
    /// <param name="count">Maximum number of hits to read.</param>
    /// <returns>
    /// The mapped hits, or <c>null</c> when <paramref name="fields"/> or
    /// <paramref name="keywords"/> is null or empty, or when nothing matches.
    /// </returns>
    public List<T> Search<T>
    (
        string[] fields,
        string[] keywords,
        IndexType index,
        Sort sort,
        int count
    ) where T : new()
    {
        if (fields == null || fields.Length == 0)
        {
            return null;
        }

        if (keywords == null || keywords.Length == 0)
        {
            return null;
        }

        var boolQuery = GetQuery(fields, keywords);

        // using blocks release the searcher and directory on every exit path,
        // including the "no hits" early return below.
        using (var directory = GetLuceneDirectory(index))
        using (var searcher = new IndexSearcher(directory, true))
        {
            TopDocs docs;
            if (sort != null)
            {
                docs = searcher.Search(boolQuery, null, count, sort);
            }
            else
            {
                docs = searcher.Search(boolQuery, count);
            }

            if (docs == null || docs.TotalHits == 0)
            {
                return null;
            }

            // Load the stored documents while the searcher is still open.
            var docList = docs.ScoreDocs.Select(sd => searcher.Doc(sd.Doc)).ToList();
            return ConvertDocToObj<T>(docList);
        }
    }

    /// <summary>
    /// Returns one page of hits using the given sort order.
    /// </summary>
    /// <typeparam name="T">Entity type the hits are mapped onto.</typeparam>
    /// <param name="fields">Fields to search in.</param>
    /// <param name="keywords">Terms to search for.</param>
    /// <param name="index">Index type that selects the directory to search.</param>
    /// <param name="sort">Sort order; when null the default-order overload is used instead.</param>
    /// <param name="pageNumber">1-based page number.</param>
    /// <param name="pageSize">Number of items per page.</param>
    /// <returns>
    /// The page and the total hit count, or <c>null</c> when <paramref name="fields"/>
    /// or <paramref name="keywords"/> is null or empty, or when nothing matches.
    /// A page past the last hit yields an empty <c>Result</c> with the real total.
    /// </returns>
    public PagedResult<List<T>> SearchByPaged<T>
    (
        string[] fields,
        string[] keywords,
        IndexType index,
        Sort sort,
        int pageNumber = 1,
        int pageSize = 20
    ) where T : new()
    {
        if (sort == null)
        {
            // No sort given: defer to the default-order overload instead of failing.
            return SearchByPaged<T>(fields, keywords, index, pageNumber, pageSize);
        }

        if (fields == null || fields.Length == 0)
        {
            return null;
        }

        if (keywords == null || keywords.Length == 0)
        {
            return null;
        }

        var boolQuery = GetQuery(fields, keywords);

        // Collect every hit up to the end of the requested page; the page itself is cut out below.
        var collector = TopFieldCollector
            .Create(sort, pageNumber * pageSize, false, false, false, false);

        using (var directory = GetLuceneDirectory(index))
        using (var searcher = new IndexSearcher(directory, true))
        {
            searcher.Search(boolQuery, collector);

            var totalCount = collector.TotalHits;
            if (totalCount == 0)
            {
                return null;
            }

            var start = (pageNumber - 1) * pageSize;
            var hits = collector.TopDocs(start, pageSize).ScoreDocs;
            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();

            return new PagedResult<List<T>>
            {
                Total = totalCount,
                Result = ConvertDocToObj<T>(docList)
            };
        }
    }

    /// <summary>
    /// Returns one page of hits using the score collector's default ordering.
    /// </summary>
    /// <typeparam name="T">Entity type the hits are mapped onto.</typeparam>
    /// <param name="fields">Fields to search in.</param>
    /// <param name="keywords">Terms to search for.</param>
    /// <param name="index">Index type that selects the directory to search.</param>
    /// <param name="pageNumber">1-based page number.</param>
    /// <param name="pageSize">Number of items per page.</param>
    /// <returns>
    /// The page and the total hit count, or <c>null</c> when <paramref name="fields"/>
    /// or <paramref name="keywords"/> is null or empty, or when nothing matches.
    /// A page past the last hit yields an empty <c>Result</c> with the real total.
    /// </returns>
    public PagedResult<List<T>> SearchByPaged<T>
    (
        string[] fields,
        string[] keywords,
        IndexType index,
        int pageNumber = 1,
        int pageSize = 20
    ) where T : new()
    {
        if (fields == null || fields.Length == 0)
        {
            return null;
        }

        if (keywords == null || keywords.Length == 0)
        {
            return null;
        }

        var boolQuery = GetQuery(fields, keywords);

        // Collect every hit up to the end of the requested page; the page itself is cut out below.
        var collector = TopScoreDocCollector.Create(pageNumber * pageSize, false);

        using (var directory = GetLuceneDirectory(index))
        using (var searcher = new IndexSearcher(directory, true))
        {
            searcher.Search(boolQuery, collector);

            var totalCount = collector.TotalHits;
            if (totalCount == 0)
            {
                return null;
            }

            var start = (pageNumber - 1) * pageSize;
            var hits = collector.TopDocs(start, pageSize).ScoreDocs;
            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();

            return new PagedResult<List<T>>
            {
                Total = totalCount,
                Result = ConvertDocToObj<T>(docList)
            };
        }
    }

    /// <summary>
    /// Counts the documents matching the given fields and keywords.
    /// (The method name keeps its original spelling for existing callers.)
    /// </summary>
    /// <param name="fields">Fields to search in.</param>
    /// <param name="keywords">Terms to search for.</param>
    /// <param name="index">Index type that selects the directory to search.</param>
    /// <returns>
    /// The total number of hits; 0 when <paramref name="fields"/> or
    /// <paramref name="keywords"/> is null or empty.
    /// </returns>
    public int GetTotla(string[] fields, string[] keywords, IndexType index)
    {
        if (fields == null || fields.Length == 0)
        {
            return 0;
        }

        if (keywords == null || keywords.Length == 0)
        {
            return 0;
        }

        var boolQuery = GetQuery(fields, keywords);
        var collector = TopScoreDocCollector.Create(20, false);

        using (var directory = GetLuceneDirectory(index))
        using (var searcher = new IndexSearcher(directory, true))
        {
            searcher.Search(boolQuery, collector);
            return collector.TotalHits;
        }
    }

    /// <summary>
    /// Maps stored documents onto new <typeparamref name="T"/> instances by matching
    /// public writable instance properties to document fields, ignoring case.
    /// </summary>
    /// <typeparam name="T">Entity type to create.</typeparam>
    /// <param name="docList">Documents to convert.</param>
    /// <returns>One object per document; an empty list when there are no documents.</returns>
    private List<T> ConvertDocToObj<T>(List<Document> docList) where T : new()
    {
        var list = new List<T>();
        if (docList == null || docList.Count == 0)
        {
            // A page past the last hit legitimately produces no documents.
            return list;
        }

        // Read-only properties are skipped; PropertyInfo.SetValue would throw on them.
        var writableProperties = typeof(T)
            .GetProperties(BindingFlags.Public | BindingFlags.Instance)
            .Where(p => p.CanWrite)
            .ToList();

        foreach (var doc in docList)
        {
            // Property names are matched case-insensitively, but the value must be read
            // with the field name exactly as stored, so remember that exact spelling.
            // Built per document because documents need not all carry the same fields.
            var storedNames = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
            foreach (var field in doc.GetFields())
            {
                if (!storedNames.ContainsKey(field.Name))
                {
                    storedNames[field.Name] = field.Name;
                }
            }

            // Box once so reflection writes land on the instance that is returned,
            // which also makes value-type entities work.
            object target = new T();
            foreach (var pInfo in writableProperties)
            {
                string storedName;
                if (storedNames.TryGetValue(pInfo.Name, out storedName))
                {
                    SetValue(pInfo, target, doc, storedName);
                }
            }

            list.Add((T)target);
        }

        return list;
    }

    /// <summary>
    /// Builds a boolean query with one optional (SHOULD) term clause for every
    /// field/keyword combination.
    /// </summary>
    /// <param name="fields">Fields to search in.</param>
    /// <param name="keywords">Terms to search for.</param>
    /// <returns>The combined query.</returns>
    private BooleanQuery GetQuery(string[] fields, string[] keywords)
    {
        var boolQuery = new BooleanQuery();

        foreach (var field in fields)
        {
            foreach (var keyword in keywords)
            {
                boolQuery.Add(new TermQuery(new Term(field, keyword)), Occur.SHOULD);
            }
        }

        return boolQuery;
    }

    #endregion Querying

    /// <summary>
    /// Reads the field <paramref name="name"/> from <paramref name="doc"/>, converts it
    /// to the property's type and assigns it. Supported property types are string, int,
    /// bool, DateTime, double, float and decimal; any other type is left untouched.
    /// </summary>
    /// <param name="pInfo">Property to assign.</param>
    /// <param name="tObj">Object that receives the value.</param>
    /// <param name="doc">Document to read from.</param>
    /// <param name="name">Field name to read.</param>
    private void SetValue<T>(PropertyInfo pInfo, T tObj, Document doc, string name)
    {
        var raw = doc.Get(name);
        var type = pInfo.PropertyType;
        object converted;

        if (type == typeof(string))
        {
            converted = raw;
        }
        else if (type == typeof(int))
        {
            converted = GetInt(raw);
        }
        else if (type == typeof(bool))
        {
            converted = GetBool(raw);
        }
        else if (type == typeof(DateTime))
        {
            converted = GetDate(raw);
        }
        else if (type == typeof(double))
        {
            converted = GetDouble(raw);
        }
        else if (type == typeof(float))
        {
            converted = GetFloat(raw);
        }
        else if (type == typeof(decimal))
        {
            converted = GetDecimal(raw);
        }
        else
        {
            // Unsupported property type: keep whatever value the object already has.
            return;
        }

        pInfo.SetValue(tObj, converted, null);
    }

    /// <summary>Parses an int; returns 0 when the text is not a valid int.</summary>
    private int GetInt(string value)
    {
        int result;
        int.TryParse(value, out result);
        return result;
    }

    /// <summary>Parses a DateTime; returns default(DateTime) when the text is not a valid date.</summary>
    private DateTime GetDate(string value)
    {
        DateTime result;
        DateTime.TryParse(value, out result);
        return result;
    }

    /// <summary>Parses a bool; returns false when the text is not a valid bool.</summary>
    private bool GetBool(string value)
    {
        bool result;
        bool.TryParse(value, out result);
        return result;
    }

    /// <summary>Parses a double; returns 0 when the text is not a valid double.</summary>
    private double GetDouble(string value)
    {
        double result;
        double.TryParse(value, out result);
        return result;
    }

    /// <summary>Parses a float; returns 0 when the text is not a valid float.</summary>
    private float GetFloat(string value)
    {
        float result;
        float.TryParse(value, out result);
        return result;
    }

    /// <summary>Parses a decimal; returns 0 when the text is not a valid decimal.</summary>
    private decimal GetDecimal(string value)
    {
        decimal result;
        decimal.TryParse(value, out result);
        return result;
    }
}

标签: #lucene搜索引擎 #netlucene文件检索 #lucenenet检索文件