C# 如何提高Lucene.net的索引速度_C#_Performance_Lucene_Lucene.net_Full Text Indexing

C# 如何提高Lucene.net的索引速度

c# performance lucene

C# 如何提高Lucene.net的索引速度,c#,performance,lucene,lucene.net,full-text-indexing,C#,Performance,Lucene,Lucene.net,Full Text Indexing,我正在使用lucene.net为我的pdf文件编制索引。索引15000个pdf大约需要40分钟，索引时间随着我的文件夹中pdf文件数量的增加而增加如何提高lucene.net中的索引速度？还有其他索引服务具有快速索引性能吗？我正在使用最新版本的lucene.net索引（lucene.net 3.0.3）这是我的索引代码 public void refreshIndexes() { // Create Index Writer

我正在使用lucene.net为我的pdf文件编制索引。索引15000个pdf大约需要40分钟，索引时间随着我的文件夹中pdf文件数量的增加而增加

如何提高lucene.net中的索引速度？
还有其他索引服务具有快速索引性能吗？

我正在使用最新版本的lucene.net索引（lucene.net 3.0.3）

这是我的索引代码

/// <summary>
/// Rebuilds the full-text index for every PDF returned by <c>searchFiles</c>.
/// </summary>
/// <remarks>
/// For each file the method extracts the text, classifies the document by looking for one of
/// the <c>ltDocumentTypes</c> search texts in the first 150 characters (case-insensitively,
/// last match wins, "Default" when nothing matches) and adds one Lucene document with the
/// fields filename, text, path, modifieddate, documenttype and vault.
/// </remarks>
public void refreshIndexes() 
        {
            // Create Index Writer
            string strIndexDir = @"E:\LuceneTest\index";

            // "using" guarantees the writer is disposed even when text extraction or indexing
            // throws part-way through; previously it was leaked in that case, which would
            // otherwise leave the index directory's write lock held.
            using (IndexWriter writer = new IndexWriter(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // Upper-case the search texts once instead of once per PDF. The list is filled in
                // enumeration order so it can be walked in lock-step with ltDocumentTypes below.
                List<string> upperSearchTexts = new List<string>();
                foreach (var docType in ltDocumentTypes)
                {
                    upperSearchTexts.Add(docType.searchText.ToUpper());
                }

                // Find all files in root folder create index on them
                List<string> lstFiles = searchFiles(@"E:\LuceneTest\PDFs");
                foreach (string strFile in lstFiles)
                {
                    Document doc = new Document();
                    string fileName = System.IO.Path.GetFileNameWithoutExtension(strFile);
                    string text = ExtractTextFromPdf(strFile);
                    string path = strFile;
                    // NOTE(review): formatted with the current culture, exactly as before;
                    // changing the format would change what is stored in the index.
                    string modifiedDate = Convert.ToString(File.GetLastWriteTime(strFile));
                    string documentType = string.Empty;
                    string vault = string.Empty;

                    // Only the first 150 characters are inspected; upper-case them once per file
                    // rather than once per document type.
                    string headerUpper = text.Substring(0, Math.Min(text.Length, 150)).ToUpper();

                    int typeIndex = 0;
                    foreach (var docs in ltDocumentTypes)
                    {
                        // No break on purpose: when several search texts match, the last one
                        // wins, which is the behavior callers of this index already rely on.
                        if (headerUpper.Contains(upperSearchTexts[typeIndex]))
                        {
                            documentType = docs.DocumentType;
                            vault = docs.VaultName;
                        }
                        typeIndex++;
                    }

                    if (string.IsNullOrEmpty(documentType))
                    {
                        documentType = "Default";
                        vault = "Default";
                    }

                    doc.Add(new Field("filename", fileName, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // NOTE(review): analyzing a date string is probably unintended; kept as-is so
                    // existing queries against this field keep working.
                    doc.Add(new Field("modifieddate", modifiedDate, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("documenttype", documentType, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("vault", vault, Field.Store.YES, Field.Index.ANALYZED));

                    writer.AddDocument(doc);
                }

                // NOTE(review): Optimize() merges the whole index and is expensive; it is kept
                // only to preserve the existing on-disk result.
                writer.Optimize();
            }
        }

public void refreshIndexes()
{
    // Create Index Writer
    string strIndexDir = @"E:\LuceneTest\index";
    IndexWriter writer = new IndexWriter(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);
    // Find all files in root folder create index on them
    List<string> lstFiles = searchFiles(@"E:\LuceneTest\PDFs");
    foreach (string strFile in lstFiles)
    {
        Document doc = new Document();
        string FileName = System.IO.Path.GetFileNameWithoutExtension(strFile);
        string Text = ExtractTextFromPdf(strFile);
        string Path = strFile;
        string ModifiedDate = Convert.ToString(File.GetLastWriteTime(strFile));
        string DocumentType = string.Empty;
        string Vault = string.Empty;
        string headerText = Text.Substring(0, Text.Length < 150 ? Text.Length : 150);
        foreach (var docs in ltDocumentTypes)
        {
            if (headerText.ToUpper().Contains(docs.searchText.ToUpper()))
            {
                DocumentType = docs.DocumentType;
                Vault = docs.VaultName;
            }
        }
        if (string.IsNullOrEmpty(DocumentType))
        {
            DocumentType = "Default";
            Vault = "Default";
        }
        doc.Add(new Field("filename", FileName, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("text", Text, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("path", Path, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("modifieddate", ModifiedDate, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("documenttype", DocumentType, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("vault", Vault, Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);
    }
    writer.Optimize();
    writer.Dispose();
}

索引部分看起来正常。请注意，IndexWriter是线程安全的，因此如果您使用的是多核计算机，使用Parallel.ForEach（将ParallelOptions.MaxDegreeOfParallelism设置为CPU核心数，并可以尝试调整这个值）可能会有所帮助

但是，文档类型检测部分会给GC带来很大压力。那些反复执行的ToUpper()调用代价很高

在lstFiles循环之外，先创建一份ltDocumentTypes.searchText的大写副本：

// Upper-case every search text once, up front, and keep the results in a list.
var upperDocTypes = ltDocumentTypes.Select(x=>x.searchText.ToUpper()).ToList();

在文档类型循环之外创建另一个字符串
```
// Upper-case the header once per file; the matching loop reads it as headerTextUpper.
string headerTextUpper = headerText.ToUpper();
```

找到匹配项时使用 break 跳出循环。这样在找到匹配项后就会退出循环，不再执行后续的迭代。当然，这意味着结果取的是第一个匹配项，而您原来的代码取的是最后一个匹配项（如果这对您有影响的话）

string headerText = Text.Substring(0, Text.Length < 150 ? Text.Length : 150);
foreach (var searchText in upperDocTypes)
{
    if (headerTextUpper.Contains(searchText))
    {
        DocumentType = docs.DocumentType;
        Vault = docs.VaultName;
        break;
    }
}

根据ltDocumentTypes的大小，这可能不会给您带来太多的改进

我敢打赌，最昂贵的部分是从PDF中提取文本。通过探查器运行此操作或使用秒表进行检测，您应该知道成本在哪里。

您真的需要调用

writer.Optimize()

？难道一个

writer.Commit()

就足够了吗？谢谢回复，@SimonSvensson。Optimize() 并不是必需的；我试过改用 Commit()，但性能没有提高。@Munavvar，在提出任何更改之前，您是否尝试过对相关方法做基准测试？我对 searchFiles 和 ExtractTextFromPdf 这两个方法特别感兴趣。我认为问题很可能出在后者，因为您的代码看起来没有问题（除了日期字段不应该被分析之外）。此外，您的PDF文件有多大？您可以把索引和分析限制在一定数量的相关字符之内。

string headerText = Text.Substring(0, Text.Length < 150 ? Text.Length : 150);
// headerTextUpper is headerText.ToUpper(), computed once before this loop.
// upperDocTypes was projected from ltDocumentTypes in order, so position i in one
// corresponds to position i in the other.
for (int i = 0; i < upperDocTypes.Count; i++)
{
    if (headerTextUpper.Contains(upperDocTypes[i]))
    {
        // The loop walks plain strings, so fetch the matching entry explicitly;
        // the original snippet read "docs", which is not defined in this loop.
        var docs = ltDocumentTypes.ElementAt(i);
        DocumentType = docs.DocumentType;
        Vault = docs.VaultName;
        // Stop at the first match (the question's code kept the last match instead).
        break;
    }
}