C# Lucene.Net多行正则表达式搜索
我们使用Lucene.Net 3.0.3 Whitespace Analyzer为文件编制索引,并使用下面显示的未分析和已分析选项将两个字段分隔开C# Lucene.Net多行正则表达式搜索,c#,regex,lucene.net,lucene,C#,Regex,Lucene.net,Lucene,我们使用Lucene.Net 3.0.3 Whitespace Analyzer为文件编制索引,并使用下面显示的未分析和已分析选项将两个字段分隔开 public static void WriteIndexes() { string indexPathRegex = ConfigurationManager.TfSettings.Application.CustomSettings["dbScritpsAddressRegex"]; va
/// <summary>
/// Writes one Lucene document per entry of <c>Indexes</c> into the index located at the
/// path configured under the "dbScritpsAddressRegex" custom setting, then optimizes the index.
/// </summary>
/// <remarks>
/// The writer and the analyzer are released in <c>finally</c> blocks so that a failure while
/// adding documents or optimizing does not leave them open. The writer is closed before the
/// analyzer because the writer is the component that uses the analyzer.
/// </remarks>
public static void WriteIndexes()
{
    string indexPathRegex = ConfigurationManager.TfSettings.Application.CustomSettings["dbScritpsAddressRegex"];
    var analyzerRegex = new WhitespaceAnalyzer();
    try
    {
        var indexWriterRegex = new IndexWriter(indexPathRegex, analyzerRegex, IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            foreach (LuceneIndex l in Indexes)
            {
                var doc = new Document();

                // Metadata fields are lower-cased before indexing.
                doc.Add(new Field("ServerName", l.ServerName.ToLowerInvariant(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
                doc.Add(new Field("DatabaseName", l.DatabaseName.ToLowerInvariant(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
                doc.Add(new Field("SchemaName", l.SchemaName.ToLowerInvariant(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
                doc.Add(new Field("ObjectType", l.ObjectType.ToLowerInvariant(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
                doc.Add(new Field("ObjectName", l.ObjectName.ToLowerInvariant(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));

                // The script is indexed twice under the same field name: once tokenized by the
                // analyzer and once as a single un-analyzed term holding the whole script.
                // NOTE(review): per the IndexWriter documentation a single term is limited to
                // 16383 characters, so the un-analyzed term of a larger script is presumably
                // not indexed and cannot be matched by a multi-line regex - confirm.
                doc.Add(new Field("Script", l.Script, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
                doc.Add(new Field("Script", l.Script, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));

                indexWriterRegex.AddDocument(doc);
            }

            indexWriterRegex.Optimize();
        }
        finally
        {
            indexWriterRegex.Close();
        }
    }
    finally
    {
        analyzerRegex.Close();
    }
}
搜索单行正则表达式时一切正常;搜索多行正则表达式时,如果被搜索文件的大小小于16KB,也能正常找到结果。但是当文件大于16KB时,Lucene找不到搜索关键字。这是一个bug吗?我们该如何解决这个问题?
示例关键字:*taxId.*\n.*customerNo.*
/// <summary>
/// Searches the index located at the path configured under the "dbScritpsAddressRegex"
/// custom setting and returns the stored fields of every matching document.
/// </summary>
/// <remarks>
/// A document must match both the parsed filter built from <c>serverAndDatabasescript</c>
/// and <c>objectTypeScript</c>, and a <c>RegexQuery</c> of <c>Expression</c> against the
/// "Script" field. The searcher and the analyzer are closed before the method returns,
/// including when the search throws.
/// </remarks>
/// <returns>One entry per hit, populated from the stored fields of the hit document.</returns>
public List<item> SearchAllScriptInIndex()
{
    string indexPathRegex = ConfigurationManager.TfSettings.Application.CustomSettings["dbScritpsAddressRegex"];
    const int hitsLimit = 1000000;

    var searcher = new Lucene.Net.Search.IndexSearcher(indexPathRegex, false);
    try
    {
        var analyzer = new WhitespaceAnalyzer();
        try
        {
            var parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_29, new[] { "Script", "DatabaseName", "ObjectType", "ServerName" }, analyzer);

            // Regex clause evaluated against the terms of the "Script" field.
            Term t = new Term("Script", Expression);
            RegexQuery scriptQuery = new RegexQuery(t);

            // Server/database and object-type restriction, parsed with the query parser.
            string s = string.Format("({0}) AND {1}", serverAndDatabasescript, objectTypeScript);
            var query = parser.Parse(s);

            // Both clauses are required.
            BooleanQuery booleanQuery = new BooleanQuery();
            booleanQuery.Add(query, BooleanClause.Occur.MUST);
            booleanQuery.Add(scriptQuery, BooleanClause.Occur.MUST);

            var hits = searcher.Search(booleanQuery, null, hitsLimit, Sort.RELEVANCE).ScoreDocs;

            List<item> results = new List<item>();
            foreach (var hit in hits)
            {
                var hitDocument = searcher.Doc(hit.Doc);

                LuceneIndex row = new LuceneIndex();
                row.ServerName = hitDocument.Get("ServerName");
                row.DatabaseName = hitDocument.Get("DatabaseName");
                row.ObjectName = hitDocument.Get("ObjectName");
                row.ObjectType = hitDocument.Get("ObjectType");
                row.SchemaName = hitDocument.Get("SchemaName");
                row.Script = hitDocument.Get("Script");
                results.Add(row);
            }

            return results;
        }
        finally
        {
            analyzer.Close();
        }
    }
    finally
    {
        searcher.Close();
    }
}
public List SearchAllScriptInIndex()
{
string indexPathRegex=ConfigurationManager.TfSettings.Application.CustomSettings[“dbScritpsAddressRegex”];
var searcher=new Lucene.Net.Search.IndexSearcher(indexPathRegex,false);
常量int hitsLimit=1000000;
var analyzer=新的空白分析器();
var parser=new multifiedqueryparser(Lucene.Net.Util.Version.Lucene_29,new[]{“脚本”、“数据库名”、“对象类型”、“服务器名”},分析器);
术语t=新术语(“脚本”,表达式);
RegexQuery scriptQuery=新的RegexQuery(t);
字符串s=string.Format(({0})和{1}),serverAndDatabasescript,objectTypeScript);
var query=parser.Parse;
BooleanQuery BooleanQuery=新的BooleanQuery();
添加(query,BooleanClause.occurrent.MUST);
添加(scriptQuery,BooleanClause.Occurse.MUST);
var hits=searcher.Search(booleanQuery,null,hitsLimit,Sort.RELEVANCE);
列表结果=新列表();
列表值=新列表();
Dictionary newlineindex=newdictionary();
foreach(命中率中的var命中率)
{
var hitDocument=searcher.Doc(hit.Doc);
字符串contentValue=hitDocument.Get(“脚本”);
LuceneIndex项=新的LuceneIndex();
item.ServerName=hitDocument.Get(“ServerName”);
item.DatabaseName=hitDocument.Get(“DatabaseName”);
item.ObjectName=hitDocument.Get(“ObjectName”);
item.ObjectType=hitDocument.Get(“ObjectType”);
item.SchemaName=hitDocument.Get(“SchemaName”);
item.Script=hitDocument.Get(“脚本”);
结果:增加(项目);
}
返回结果;
}根据
IndexWriter.AddDocument
的文档和IndexWriter.MAX_TERM_LENGTH
字段,单个词项(term)支持的最大长度为16383个字符。超过这个长度的词项似乎会被忽略,从而导致您描述的问题
AddDocument的文档声称此时会抛出异常,而MAX_TERM_LENGTH字段的文档只提到该词项会被跳过,并且在设置了infoStream的情况下向其写入一条消息
/// <p/>Note that each term in the document can be no longer
/// than 16383 characters, otherwise an
/// IllegalArgumentException will be thrown.<p/>
// [...]
/// <summary> Absolute hard maximum length for a term. If a term
/// arrives from the analyzer longer than this length, it
/// is skipped and a message is printed to infoStream, if
/// set (see <see cref="SetInfoStream" />).
/// </summary>
public static readonly int MAX_TERM_LENGTH;
///请注意,文档中每个词项(term)的长度都不能
///超过16383个字符,否则
///将抛出IllegalArgumentException。
// [...]
///单个词项的绝对硬性最大长度。如果从分析器
///传来的词项超过此长度,该词项会被
///跳过,并且在设置了infoStream的情况下
///向其打印一条消息(请参阅SetInfoStream)。
///
public static readonly int MAX_TERM_LENGTH;
资料来源: