Java 计算lucene中的匹配字符串百分比

Java 计算lucene中的匹配字符串百分比,java,lucene,Java,Lucene,我在java应用程序中使用了Lucene算法,从索引中找出匹配的字符串。 我已经从所有点击中选取了前5个热门文档,但我想检查或计算原始字符串和匹配字符串的匹配情况。 在Lucene有可能吗? Lucene有什么办法找到它吗? 例如:- 当你说匹配百分比时,你是什么意思?如果您想知道结果文档中包含了多少原始文本的单词(例如,在您的案例中,3个单词中有2个) 然后,您可以使用来完成工作,获取字段和文档的术语向量,并迭代术语,查看您要查找的内容中是否有术语。甚至可以存储字符串并获取整个内容,然后进行数

我在java应用程序中使用了Lucene算法,从索引中找出匹配的字符串。 我已经从所有点击中选取了前5个热门文档,但我想检查或计算原始字符串和匹配字符串的匹配情况。 在Lucene有可能吗? Lucene有什么办法找到它吗? 例如:-


当你说"匹配百分比"时,具体指的是什么?如果您想知道结果文档中包含了原始文本中的多少个单词(例如,在您的例子中是 3 个单词中的 2 个),那么可以使用术语向量(term vectors)来完成:获取该文档对应字段的术语向量,遍历其中的术语,检查它们是否出现在您要查找的内容中。如果存储空间不是问题,也可以把原始字符串存储下来,取回完整内容后自行计算。目前 Lucene 使用 TF-IDF(从 6.x 版本起将改为 BM25)计算分数,这个分数可以作为衡量匹配程度的依据。不过 ScoreDoc 给出的是一个小数值;如果这已经足够,直接使用它即可。

如果这不能回答问题,那么请提供更多关于如何使用样本进行计算的详细信息

希望这有帮助

另外,我已经编写了简单的脚本,因此您可以根据需要查看并更正它:

package org.query;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.junit.Before;
import org.junit.Test;

import java.util.HashSet;
import java.util.Set;

/**
 * Created by ekamolid on 11/2/2015.
 *
 * <p>Demonstration test: indexes three small documents into an in-memory Lucene index,
 * runs a fuzzy (up to two edits) query built from a whitespace-tokenized input string and,
 * for every hit, prints how many of the document's distinct terms match a query term
 * exactly, within edit distance 1, or at edit distance 2. The printed counts can be used
 * as the raw numbers for a "match percentage" calculation.
 */
public class LevenshteinTest {
    private RAMDirectory directory;
    private IndexSearcher searcher;
    private IndexReader reader;
    private Analyzer analyzer;

    /**
     * Builds the in-memory index with three documents in field {@code "f"} and opens a
     * reader/searcher over it. Term vectors are stored because {@link #test1()} reads them
     * back per document.
     */
    @Before
    public void setUp() throws Exception {
        directory = new RAMDirectory();

        analyzer = new WhitespaceAnalyzer();
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));

        Document doc = new Document();
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        fieldType.setStoreTermVectors(true);
        doc.add(new Field("f", "the quick brown fox jumps over the lazy dog", fieldType));
        writer.addDocument(doc);

        doc = new Document();
        doc.add(new Field("f", "the quick red fox jumps over the sleepy cat", fieldType));
        writer.addDocument(doc);

        doc = new Document();
        doc.add(new Field("f", "quiick caar went xyztz dog", fieldType));
        writer.addDocument(doc);

        writer.close();

        reader = DirectoryReader.open(directory);
        searcher = new IndexSearcher(reader);
    }

    /**
     * Computes the case-insensitive Levenshtein (edit) distance between two strings.
     * Algorithm adapted from http://rosettacode.org/wiki/Levenshtein_distance#Java
     *
     * @param a first string
     * @param b second string
     * @return the minimum number of single-character insertions, deletions and
     *         substitutions needed to turn {@code a} into {@code b}
     */
    public static int distance(String a, String b) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. the Turkish dotless i would otherwise change the result).
        a = a.toLowerCase(java.util.Locale.ROOT);
        b = b.toLowerCase(java.util.Locale.ROOT);
        // Row for i == 0: distance from the empty prefix of a to each prefix of b.
        int[] costs = new int[b.length() + 1];
        for (int j = 0; j < costs.length; j++) {
            costs[j] = j;
        }
        for (int i = 1; i <= a.length(); i++) {
            // j == 0; nw holds lev(i - 1, j - 1), the "north-west" cell of the matrix.
            costs[0] = i;
            int nw = i - 1;
            for (int j = 1; j <= b.length(); j++) {
                int substitution = a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1;
                int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]), substitution);
                nw = costs[j];
                costs[j] = cj;
            }
        }
        return costs[b.length()];
    }


    /**
     * Searches for {@code "quick caar dog"} with one fuzzy clause per token and prints,
     * for each hit, the exact / distance-1 / distance-2 term counts.
     */
    @Test
    public void test1() throws Exception {
        String s = "quick caar dog";
        Set<String> stringSet = new HashSet<>();
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        // TokenStream contract: reset() -> incrementToken()* -> end() -> close().
        try (TokenStream tokenStream = analyzer.tokenStream("abc", s)) {
            TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String token = termAttribute.getBytesRef().utf8ToString();
                stringSet.add(token);
                Query query = new FuzzyQuery(new Term("f", token), 2); //search only 2 edits
                builder.add(query, BooleanClause.Occur.SHOULD);
            }
            tokenStream.end();
        }
        TopDocs hits = searcher.search(builder.build(), 10);
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            // Counters are per document.
            int exactMatch = 0;
            int match1 = 0;
            int match2 = 0;
            Terms terms = reader.getTermVector(scoreDoc.doc, "f");
            if (terms != null) { // null when the document has no term vector for the field
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef;
                while ((bytesRef = termsEnum.next()) != null) {
                    String str = bytesRef.utf8ToString();
                    if (stringSet.contains(str)) {
                        exactMatch++;
                        continue;
                    }
                    // Not an exact match: compare against every query token.
                    for (String s1 : stringSet) {
                        int distance = distance(s1, str);
                        if (distance <= 1) {
                            match1++;
                        } else if (distance <= 2) {
                            match2++;
                        }
                    }
                }
            }
            System.out.print(" doc=" + scoreDoc.doc);
            System.out.print(" exactMatch=" + exactMatch);
            System.out.print(" match1=" + match1);
            // Fixed: this previously printed match1 under the match2 label.
            System.out.println(" match2=" + match2);
        }
    }
}

这是可以运行的代码,它会告诉你有多少个词项完全匹配,又有多少个词项与查询词相差 1 个字符或 2 个字符(编辑距离)。因此你可以在其中加入自己的逻辑,根据得到的数字计算百分比。由于需要遍历文档中的词项,这可能会稍慢一些,但只要把结果数量限制在一定范围内(示例中为 10),就不会太慢。

谢谢你的帮助,先生。先生,术语向量用于匹配每个术语,例如“好女孩”与“好”和“女孩”匹配,但如果我的输入字符串中有任何语音错误,该怎么办。例如:-“god grl”匹配“good girl”,那么如何找到匹配字符串的百分比?
package org.query;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.junit.Before;
import org.junit.Test;

import java.util.HashSet;
import java.util.Set;

/**
 * Created by ekamolid on 11/2/2015.
 *
 * <p>Demonstration test: indexes three small documents into an in-memory Lucene index,
 * runs a fuzzy (up to two edits) query built from a whitespace-tokenized input string and,
 * for every hit, prints how many of the document's distinct terms match a query term
 * exactly, within edit distance 1, or at edit distance 2. The printed counts can be used
 * as the raw numbers for a "match percentage" calculation.
 */
public class LevenshteinTest {
    private RAMDirectory directory;
    private IndexSearcher searcher;
    private IndexReader reader;
    private Analyzer analyzer;

    /**
     * Builds the in-memory index with three documents in field {@code "f"} and opens a
     * reader/searcher over it. Term vectors are stored because {@link #test1()} reads them
     * back per document.
     */
    @Before
    public void setUp() throws Exception {
        directory = new RAMDirectory();

        analyzer = new WhitespaceAnalyzer();
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));

        Document doc = new Document();
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        fieldType.setStoreTermVectors(true);
        doc.add(new Field("f", "the quick brown fox jumps over the lazy dog", fieldType));
        writer.addDocument(doc);

        doc = new Document();
        doc.add(new Field("f", "the quick red fox jumps over the sleepy cat", fieldType));
        writer.addDocument(doc);

        doc = new Document();
        doc.add(new Field("f", "quiick caar went xyztz dog", fieldType));
        writer.addDocument(doc);

        writer.close();

        reader = DirectoryReader.open(directory);
        searcher = new IndexSearcher(reader);
    }

    /**
     * Computes the case-insensitive Levenshtein (edit) distance between two strings.
     * Algorithm adapted from http://rosettacode.org/wiki/Levenshtein_distance#Java
     *
     * @param a first string
     * @param b second string
     * @return the minimum number of single-character insertions, deletions and
     *         substitutions needed to turn {@code a} into {@code b}
     */
    public static int distance(String a, String b) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. the Turkish dotless i would otherwise change the result).
        a = a.toLowerCase(java.util.Locale.ROOT);
        b = b.toLowerCase(java.util.Locale.ROOT);
        // Row for i == 0: distance from the empty prefix of a to each prefix of b.
        int[] costs = new int[b.length() + 1];
        for (int j = 0; j < costs.length; j++) {
            costs[j] = j;
        }
        for (int i = 1; i <= a.length(); i++) {
            // j == 0; nw holds lev(i - 1, j - 1), the "north-west" cell of the matrix.
            costs[0] = i;
            int nw = i - 1;
            for (int j = 1; j <= b.length(); j++) {
                int substitution = a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1;
                int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]), substitution);
                nw = costs[j];
                costs[j] = cj;
            }
        }
        return costs[b.length()];
    }


    /**
     * Searches for {@code "quick caar dog"} with one fuzzy clause per token and prints,
     * for each hit, the exact / distance-1 / distance-2 term counts.
     */
    @Test
    public void test1() throws Exception {
        String s = "quick caar dog";
        Set<String> stringSet = new HashSet<>();
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        // TokenStream contract: reset() -> incrementToken()* -> end() -> close().
        try (TokenStream tokenStream = analyzer.tokenStream("abc", s)) {
            TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String token = termAttribute.getBytesRef().utf8ToString();
                stringSet.add(token);
                Query query = new FuzzyQuery(new Term("f", token), 2); //search only 2 edits
                builder.add(query, BooleanClause.Occur.SHOULD);
            }
            tokenStream.end();
        }
        TopDocs hits = searcher.search(builder.build(), 10);
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            // Counters are per document.
            int exactMatch = 0;
            int match1 = 0;
            int match2 = 0;
            Terms terms = reader.getTermVector(scoreDoc.doc, "f");
            if (terms != null) { // null when the document has no term vector for the field
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef;
                while ((bytesRef = termsEnum.next()) != null) {
                    String str = bytesRef.utf8ToString();
                    if (stringSet.contains(str)) {
                        exactMatch++;
                        continue;
                    }
                    // Not an exact match: compare against every query token.
                    for (String s1 : stringSet) {
                        int distance = distance(s1, str);
                        if (distance <= 1) {
                            match1++;
                        } else if (distance <= 2) {
                            match2++;
                        }
                    }
                }
            }
            System.out.print(" doc=" + scoreDoc.doc);
            System.out.print(" exactMatch=" + exactMatch);
            System.out.print(" match1=" + match1);
            // Fixed: this previously printed match1 under the match2 label.
            System.out.println(" match2=" + match2);
        }
    }
}
 doc=2 exactMatch=2 match1=1 match2=1
 doc=1 exactMatch=1 match1=0 match2=0
 doc=0 exactMatch=2 match1=0 match2=0