Lucene查询(使用木瓦过滤器?)
我有一个Lucene索引,其中包含以下文档:
_id | Name | Alternate Names | Population
123 Bosc de Planavilla (some names here in 5000
345 Planavilla other languages) 20000
456 Bosc de la Planassa 1000
567 Bosc de Plana en Blanca 100000
考虑到我需要以下几点,我应该使用什么样的Lucene查询类型以及应该如何构造它:
提前感谢。如何标记字段?是否将它们存储为完整字符串?另外,如何解析查询 好吧,我在玩这个。我一直在使用StopFilter来删除la、en、de。然后我使用了一个木瓦过滤器来获得多个组合,以便进行“精确匹配”。例如,Bosc de Planavilla被标记为[Bosc][Bosc Planavilla],Bosc de Plana en Blanca被标记为[Bosc][Bosc Plana][Plana Blanca][Bosc Plana Blanca]。这样,您就可以在查询的某些部分上进行“精确匹配” 然后我查询用户传递的确切字符串,尽管也可能有一些调整。我采用了简单的案例,以使结果更符合您的要求 下面是我正在使用的代码(lucene 3.0.3):
下面是用于排序的代码。尽管我认为,考虑到城市规模,增加一个自定义评分会更有意义,而不是对人口进行残酷的排序。另外请注意,这使用了FieldCache,这可能不是关于内存使用的最佳解决方案
/**
 * Demonstrates shingle-based indexing of city names so that multi-word
 * queries can get "exact-ish" phrase matches, combined with a custom sort
 * that orders hits by descending population (Lucene 3.0.x API).
 *
 * Index time uses an analyzer with shingles (word n-grams) enabled;
 * query time uses the same analyzer with shingles disabled.
 */
public class ShingleFilterTests {
    private Analyzer analyzer;       // index-time analyzer (shingles up to size 3)
    private IndexSearcher searcher;
    private IndexReader reader;
    private QueryParser qp;          // parses queries with a shingle-free analyzer
    private Sort sort;               // custom population-descending sort

    /**
     * Builds an analyzer that tokenizes on whitespace, removes the stop words
     * "de"/"la"/"en", and, when {@code shingles > 0}, appends a ShingleFilter
     * producing word n-grams up to that many tokens. Pass 0 to disable
     * shingling (used for the query side).
     */
    public static Analyzer createAnalyzer(final int shingles) {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream tokenizer = new WhitespaceTokenizer(reader);
                // StopFilter(false, ...) => do not preserve position increments for removed words.
                tokenizer = new StopFilter(false, tokenizer, ImmutableSet.of("de", "la", "en"));
                if (shingles > 0) {
                    tokenizer = new ShingleFilter(tokenizer, shingles);
                }
                return tokenizer;
            }
        };
    }

    /** Factory handed to SortField; produces population-descending comparators. */
    public class PopulationComparatorSource extends FieldComparatorSource {
        @Override
        public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
            return new PopulationComparator(fieldname, numHits);
        }

        /**
         * Compares hits on the int "population" field, largest first.
         * Sign convention follows the Lucene 3.0 FieldComparator contract:
         * negative means slot1/bottom sorts before the other value.
         */
        private class PopulationComparator extends FieldComparator {
            private final String fieldName;   // kept for reference; lookups use the literal "population"
            private Integer[] values;         // per-slot populations of the current top-N queue
            private int[] populations;        // FieldCache column for the current segment
            private int bottom;               // population of the weakest queued hit

            public PopulationComparator(String fieldname, int numHits) {
                values = new Integer[numHits];
                this.fieldName = fieldname;
            }

            @Override
            public int compare(int slot1, int slot2) {
                // Inverted natural order => bigger population ranks earlier.
                if (values[slot1] > values[slot2]) return -1;
                if (values[slot1] < values[slot2]) return 1;
                return 0;
            }

            @Override
            public void setBottom(int slot) {
                bottom = values[slot];
            }

            @Override
            public int compareBottom(int doc) throws IOException {
                // Same inverted order as compare(): negative keeps `bottom` ahead of `doc`.
                int value = populations[doc];
                if (bottom > value) return -1;
                if (bottom < value) return 1;
                return 0;
            }

            @Override
            public void copy(int slot, int doc) throws IOException {
                values[slot] = populations[doc];
            }

            @Override
            public void setNextReader(IndexReader reader, int docBase) throws IOException {
                /* XXX uses field cache: loads the whole "population" column into
                 * memory per (sub-)reader — simple, but costly on large indexes. */
                populations = FieldCache.DEFAULT.getInts(reader, "population");
            }

            @Override
            public Comparable value(int slot) {
                return values[slot];
            }
        }
    }

    /** Builds an in-memory index of four cities and wires up searcher, parser and sort. */
    @Before
    public void setUp() throws Exception {
        Directory dir = new RAMDirectory();
        analyzer = createAnalyzer(3);
        IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        ImmutableList<String> cities = ImmutableList.of("Bosc de Planavilla", "Planavilla", "Bosc de la Planassa",
                "Bosc de Plana en Blanca");
        ImmutableList<Integer> populations = ImmutableList.of(5000, 20000, 1000, 100000);
        for (int id = 0; id < cities.size(); id++) {
            Document doc = new Document();
            doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("city", cities.get(id), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("population", String.valueOf(populations.get(id)),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();
        // Query side: same analyzer family but WITHOUT shingles.
        qp = new QueryParser(Version.LUCENE_30, "city", createAnalyzer(0));
        sort = new Sort(new SortField("population", new PopulationComparatorSource()));
        searcher = new IndexSearcher(dir);
        searcher.setDefaultFieldSortScoring(true, true);
        reader = searcher.getIndexReader();
    }

    @After
    public void tearDown() throws Exception {
        searcher.close();
    }

    @Test
    public void testShingleFilter() throws Exception {
        System.out.println("shingle filter");
        printSearch("city:\"Bosc de Planavilla\"");
        printSearch("city:Planavilla");
        printSearch("city:Bosc");
    }

    /** Parses {@code query}, runs it with the population sort, prints the top 4 hits. */
    private void printSearch(String query) throws ParseException, IOException {
        Query q = qp.parse(query);
        System.out.println("query " + q);
        TopDocs hits = searcher.search(q, null, 4, sort);
        System.out.println("results " + hits.totalHits);
        int i = 1;
        for (ScoreDoc dc : hits.scoreDocs) {
            Document doc = reader.document(dc.doc);
            System.out.println(i++ + ". " + dc + " \"" + doc.get("city") + "\" population: " + doc.get("population"));
        }
        System.out.println();
    }
}
谢谢你的回复,维森。实际上,名称字段是使用标准标记器、标准标记过滤器、小写标记过滤器和停止标记过滤器编制索引的。但这很容易改变。我的问题实际上也是我应该如何索引和查询解析?非常感谢!您的方法与我最终使用的方法类似,并且产生了良好的结果。但它并不完美……在一个300万文档索引上,我得到的响应时间高达1秒(在一台机器上)。此外,我经常会遇到一些奇怪的事情,比如在搜索“印第安酒吧巴黎”时,它返回了“富有的印第安酒吧保护区”,而这并不是我想要的:)。如果可能的话,我将尝试使用评分和索引时间增加(取决于功能类型)来进一步完善这一点。谢谢你的帮助!对于300万份文档来说,1秒听起来太长了。你是怎么排序的?您可以使用探查器来检查CPU的去向。我正在搜索一个4000万文档索引,其中包含复杂的查询、刻面和自定义排序,耗时约70毫秒。
query city:"Bosc Planavilla"
results 1
1. doc=0 score=1.143841 "Bosc de Planavilla" population: 5000
query city:Planavilla
results 2
1. doc=1 score=1.287682 "Planavilla" population: 20000
2. doc=0 score=0.643841 "Bosc de Planavilla" population: 5000
query city:Bosc
results 3
1. doc=0 score=0.5 "Bosc de Planavilla" population: 5000
2. doc=2 score=0.5 "Bosc de la Planassa" population: 1000
3. doc=3 score=0.375 "Bosc de Plana en Blanca" population: 100000
/**
 * Shingle-analyzer experiment: city names are indexed as word n-grams so
 * multi-word queries behave like near-exact matches, and hits are ordered
 * by population, largest city first, via a FieldCache-backed comparator.
 * Written against the Lucene 3.0.x API.
 */
public class ShingleFilterTests {
    private Analyzer analyzer;
    private IndexSearcher searcher;
    private IndexReader reader;
    private QueryParser qp;
    private Sort sort;

    /**
     * Creates an analyzer: whitespace tokenization, the stop words
     * "de"/"la"/"en" removed, then word shingles of up to {@code shingles}
     * tokens when {@code shingles} is positive (0 yields a plain analyzer
     * for the query side).
     */
    public static Analyzer createAnalyzer(final int shingles) {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream stream = new WhitespaceTokenizer(reader);
                stream = new StopFilter(false, stream, ImmutableSet.of("de", "la", "en"));
                return shingles > 0 ? new ShingleFilter(stream, shingles) : stream;
            }
        };
    }

    /** Supplies comparators that rank documents by descending population. */
    public class PopulationComparatorSource extends FieldComparatorSource {
        @Override
        public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
            return new PopulationComparator(fieldname, numHits);
        }

        /** Orders hits on the int "population" field, biggest city first. */
        private class PopulationComparator extends FieldComparator {
            private final String fieldName;     // retained for reference only
            private final Integer[] slotValues; // populations of the queued top-N hits
            private int[] populations;          // FieldCache column of the current segment
            private int bottom;                 // weakest queued population

            public PopulationComparator(String fieldname, int numHits) {
                this.fieldName = fieldname;
                this.slotValues = new Integer[numHits];
            }

            @Override
            public int compare(int slot1, int slot2) {
                // Negated natural order => descending population.
                return -slotValues[slot1].compareTo(slotValues[slot2]);
            }

            @Override
            public void setBottom(int slot) {
                bottom = slotValues[slot];
            }

            @Override
            public int compareBottom(int doc) throws IOException {
                // Positive pushes `doc` behind `bottom` (descending order).
                final int population = populations[doc];
                if (population > bottom) return 1;
                if (population < bottom) return -1;
                return 0;
            }

            @Override
            public void copy(int slot, int doc) throws IOException {
                slotValues[slot] = populations[doc];
            }

            @Override
            public void setNextReader(IndexReader reader, int docBase) throws IOException {
                // FieldCache materializes the whole column in memory per reader.
                populations = FieldCache.DEFAULT.getInts(reader, "population");
            }

            @Override
            public Comparable value(int slot) {
                return slotValues[slot];
            }
        }
    }

    /** Indexes four cities into a RAMDirectory and wires searcher, parser and sort. */
    @Before
    public void setUp() throws Exception {
        Directory dir = new RAMDirectory();
        analyzer = createAnalyzer(3);
        IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        String[] cities = {
            "Bosc de Planavilla", "Planavilla", "Bosc de la Planassa", "Bosc de Plana en Blanca"
        };
        int[] populations = { 5000, 20000, 1000, 100000 };
        for (int id = 0; id < cities.length; id++) {
            Document doc = new Document();
            doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("city", cities[id], Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("population", String.valueOf(populations[id]),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();
        // Query side deliberately uses a shingle-free analyzer.
        qp = new QueryParser(Version.LUCENE_30, "city", createAnalyzer(0));
        sort = new Sort(new SortField("population", new PopulationComparatorSource()));
        searcher = new IndexSearcher(dir);
        searcher.setDefaultFieldSortScoring(true, true);
        reader = searcher.getIndexReader();
    }

    @After
    public void tearDown() throws Exception {
        searcher.close();
    }

    @Test
    public void testShingleFilter() throws Exception {
        System.out.println("shingle filter");
        printSearch("city:\"Bosc de Planavilla\"");
        printSearch("city:Planavilla");
        printSearch("city:Bosc");
    }

    /** Parses and runs {@code query}, printing the top 4 hits in sort order. */
    private void printSearch(String query) throws ParseException, IOException {
        Query parsed = qp.parse(query);
        System.out.println("query " + parsed);
        TopDocs hits = searcher.search(parsed, null, 4, sort);
        System.out.println("results " + hits.totalHits);
        int rank = 1;
        for (ScoreDoc hit : hits.scoreDocs) {
            Document doc = reader.document(hit.doc);
            System.out.println(rank++ + ". " + hit + " \"" + doc.get("city") + "\" population: " + doc.get("population"));
        }
        System.out.println();
    }
}
query city:"Bosc Planavilla"
results 1
1. doc=0 score=1.143841[5000] "Bosc de Planavilla" population: 5000
query city:Planavilla
results 2
1. doc=1 score=1.287682[20000] "Planavilla" population: 20000
2. doc=0 score=0.643841[5000] "Bosc de Planavilla" population: 5000
query city:Bosc
results 3
1. doc=3 score=0.375[100000] "Bosc de Plana en Blanca" population: 100000
2. doc=0 score=0.5[5000] "Bosc de Planavilla" population: 5000
3. doc=2 score=0.5[1000] "Bosc de la Planassa" population: 1000