Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/csharp/304.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C# lucene.net索引中存在重复文档_C#_Wpf_Lucene_Lucene.net - Fatal编程技术网

C# lucene.net索引中存在重复文档

C# lucene.net索引中存在重复文档,c#,wpf,lucene,lucene.net,C#,Wpf,Lucene,Lucene.net,我正在使用lucene.net为我的pdf文件编制索引。刷新索引后,它将多次显示相同的documnet(=我刷新索引的次数) 我正在使用最新版本的lucene.net索引(lucene.net 3.0.3) 这是我的索引代码 public void refreshIndexes() { // Create Index Writer string strIndexDir = @"Z:\Munavvar\LuceneTest\index"

我正在使用 Lucene.Net 为我的 PDF 文件编制索引。刷新索引后,搜索结果中同一个文档会重复出现多次(重复次数等于我刷新索引的次数)。

我正在使用最新版本的lucene.net索引(lucene.net 3.0.3)

这是我的索引代码

/// <summary>
/// Rebuilds the Lucene index from every file returned by <c>searchFiles</c> for the PDF folder.
/// </summary>
/// <remarks>
/// The writer is opened with <c>create = true</c> and <c>DeleteAll()</c> is called before
/// any document is added, so each call is meant to start from an empty index.
/// For every file the first 150 characters of the extracted text are matched
/// (case-insensitively) against each entry of <c>ltDocumentTypes</c>; the last matching
/// entry decides the document type and vault, and "Default" is used when nothing matches.
/// </remarks>
public void refreshIndexes()
    {
        // Create Index Writer
        string strIndexDir = @"Z:\Munavvar\LuceneTest\index";

        // "using" guarantees the writer is disposed even when text extraction or
        // indexing throws half-way through the file list.
        using (IndexWriter writer = new IndexWriter(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            writer.DeleteAll();

            // Find all files in root folder create index on them
            List<string> lstFiles = searchFiles(@"Z:\Munavvar\LuceneTest\PDFs");
            foreach (string strFile in lstFiles)
            {
                Document doc = new Document();
                string FileName = System.IO.Path.GetFileNameWithoutExtension(strFile);
                // Treat a failed/empty extraction as empty text instead of crashing on null.
                string Text = ExtractTextFromPdf(strFile) ?? string.Empty;
                string Path = strFile;
                string ModifiedDate = Convert.ToString(File.GetLastWriteTime(strFile));
                string DocumentType = string.Empty;
                string Vault = string.Empty;

                // Only the beginning of the document is inspected for the type marker.
                string headerText = Text.Substring(0, Text.Length < 150 ? Text.Length : 150);
                foreach (var docs in ltDocumentTypes)
                {
                    // Case-insensitive containment without allocating upper-cased copies.
                    // No break: when several entries match, the last one wins (as before).
                    if (headerText.IndexOf(docs.searchText, StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        DocumentType = docs.DocumentType;
                        Vault = docs.VaultName;
                    }
                }

                if (string.IsNullOrEmpty(DocumentType))
                {
                    DocumentType = "Default";
                    Vault = "Default";
                }

                doc.Add(new Field("filename", FileName, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("text", Text, Field.Store.YES, Field.Index.ANALYZED));
                // The path is stored as a single un-tokenized term.
                doc.Add(new Field("path", Path, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("modifieddate", ModifiedDate, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("documenttype", DocumentType, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("vault", Vault, Field.Store.YES, Field.Index.ANALYZED));

                writer.AddDocument(doc);
            }

            writer.Optimize();
        }
    }
/// <summary>
/// Rebuilds the Lucene index from every file returned by <c>searchFiles</c> for the PDF folder.
/// </summary>
/// <remarks>
/// Restored from a machine-translated copy in which C# keywords, identifiers, string
/// quotes and generic arguments had been replaced by translated text.
/// </remarks>
public void refreshIndexes()
{
    // Create index writer
    string strIndexDir = @"Z:\Munavvar\LuceneTest\index";
    IndexWriter writer = new IndexWriter(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);
    writer.DeleteAll();
    // Find all files in the root folder and create an index on them
    List<string> lstFiles = searchFiles(@"Z:\Munavvar\LuceneTest\PDFs");
    foreach (string strFile in lstFiles)
    {
        Document doc = new Document();
        string FileName = System.IO.Path.GetFileNameWithoutExtension(strFile);
        string Text = ExtractTextFromPdf(strFile);
        string Path = strFile;
        string ModifiedDate = Convert.ToString(File.GetLastWriteTime(strFile));
        string DocumentType = string.Empty;
        string Vault = string.Empty;
        // Only the first 150 characters are inspected for a document-type marker.
        string headerText = Text.Substring(0, Text.Length < 150 ? Text.Length : 150);
        foreach (var docs in ltDocumentTypes)
        {
            if (headerText.ToUpper().Contains(docs.searchText.ToUpper()))
            {
                DocumentType = docs.DocumentType;
                Vault = docs.VaultName;
            }
        }
        if (string.IsNullOrEmpty(DocumentType))
        {
            DocumentType = "Default";
            Vault = "Default";
        }
        doc.Add(new Field("filename", FileName, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("text", Text, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("path", Path, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("modifieddate", ModifiedDate, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("documenttype", DocumentType, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("vault", Vault, Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);
    }
    writer.Optimize();
    writer.Dispose();
}
这是我的索引搜索代码

/// <summary>
/// Runs <paramref name="searchText"/> as a Lucene query (default field "text") against
/// the index directory and returns one <c>IndexDocument</c> per hit.
/// </summary>
/// <param name="searchText">Query string in Lucene query-parser syntax.</param>
/// <returns>
/// The matching documents; an empty list when <paramref name="searchText"/> is null or empty.
/// </returns>
/// <remarks>
/// Exceptions from parsing or searching propagate unchanged to the caller. The previous
/// <c>catch (Exception ex) { throw ex; }</c> added nothing except resetting the stack trace.
/// </remarks>
public List<IndexDocument> searchFromIndexes(string searchText)
    {
        // Create list
        List<IndexDocument> searchResult = new List<IndexDocument>();

        if (string.IsNullOrEmpty(searchText))
        {
            return searchResult;
        }

        string strIndexDir = @"Z:\Munavvar\LuceneTest\index";
        var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

        // "using" releases the searcher even when parsing or searching throws.
        using (IndexSearcher searcher = new IndexSearcher(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir))))
        {
            // parse the query, "text" is the default field to search
            Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "text", analyzer);
            Query query = parser.Parse(searchText);

            // Ask for every document in the index; never request fewer than one hit so an
            // empty index yields an empty result instead of an invalid hit count.
            int maxHits = searcher.MaxDoc > 0 ? searcher.MaxDoc : 1;
            TopDocs hits = searcher.Search(query, maxHits);

            // Iterate the hits that were actually returned rather than trusting TotalHits
            // to match the array length.
            foreach (var hit in hits.ScoreDocs)
            {
                // get the document from index
                Document doc = searcher.Doc(hit.Doc);

                // create a new row with the result data
                searchResult.Add(new IndexDocument()
                    {
                        FileName = doc.Get("filename"),
                        Text = doc.Get("text"),
                        Path = doc.Get("path"),
                        ModifiedDate = doc.Get("modifieddate"),
                        Vault = doc.Get("vault"),
                        DocumentType = doc.Get("documenttype"),
                    });
            }
        }

        return searchResult;
    }
公共列表searchFromIndexes(字符串搜索文本)
{
尝试
{
#索引和填充列表中的区域搜索
//创建列表
列表搜索结果=新列表();
如果(!string.IsNullOrEmpty(searchText))
{
字符串strIndexDir=@“Z:\Munavvar\LuceneTest\index”;
var analyzer=new StandardAnalyzer(Lucene.Net.Util.Version.Lucene_30);
IndexSearcher=newindexSearcher(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir));
//解析查询,“text”是要搜索的默认字段
Lucene.Net.QueryParsers.QueryParser=新的Lucene.Net.QueryParsers.QueryParser(Lucene.Net.Util.Version.lucene29,“文本”,分析器);
Query=parser.Parse(searchText);
//搜寻
TopDocs hits=searcher.Search(查询,searcher.MaxDoc);
//显示首个TotalHits结果
对于(int i=0;i

更新

我在窗口上有一个按钮,可以调用刷新索引方法

当我关闭并再次运行应用程序并单击该按钮时,它将清除旧索引


想出一个解决办法

问题: 我正在从全局类对象调用refreshIndexes方法

// One VaultIndexes instance, created once and reused by every click of the button.
VaultIndexes vIndexes = new VaultIndexes();

// Click handler of the refresh button: re-runs indexing on the shared instance.
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e) => vIndexes.refreshIndexes();
解决方案: 每次创建新对象

// Click handler of the refresh button: indexes through a fresh VaultIndexes on every click.
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e) => new VaultIndexes().refreshIndexes();
我不知道为什么使用全局类对象时会创建重复文档。

正如 @RichaGarg 在评论中指出的,根据 IndexWriter 构造函数的第三个参数,它本不应该产生重复文档。


想出一个解决办法

问题: 我正在从全局类对象调用refreshIndexes方法

// One VaultIndexes instance, created once and reused by every click of the button.
VaultIndexes vIndexes = new VaultIndexes();

// Click handler of the refresh button: re-runs indexing on the shared instance.
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e) => vIndexes.refreshIndexes();
解决方案: 每次创建新对象

// Click handler of the refresh button: indexes through a fresh VaultIndexes on every click.
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e) => new VaultIndexes().refreshIndexes();
我不知道为什么使用全局类对象时会创建重复文档。

正如 @RichaGarg 在评论中指出的,根据 IndexWriter 构造函数的第三个参数,它本不应该产生重复文档。


这是因为,您需要删除以前的数据。顺便说一句,
ltDocumentTypes
@RichaGarg是什么?
IndexWriter 构造函数的第三个参数指定是覆盖还是追加到现有索引(如果有)。如果为
true
,则应删除旧索引。您能否提供一些有关如何搜索此索引的信息?我想知道的一件事是,您是否有可能在某处仍然保持着旧的读取器(IndexReader)未关闭。看起来您必须在
MultiReader 中收集它们,