Lucene查询(使用木瓦过滤器?)
我有一个Lucene索引,其中包含以下文档:
_id | Name | Alternate Names | Population
123 Bosc de Planavilla (some names here in 5000
345 Planavilla other languages) 20000
456 Bosc de la Planassa 1000
567 Bosc de Plana en Blanca 100000
考虑到我需要以下几点,我应该使用什么样的Lucene查询类型以及应该如何构造它:
提前感谢。如何标记字段?是否将它们存储为完整字符串?另外,如何解析查询 好吧,我在玩这个。我一直在使用StopFilter来删除la、en、de。然后我使用了一个木瓦过滤器来获得多个组合,以便进行“精确匹配”。例如,Bosc de Planavilla被标记为[Bosc][Bosc Planavilla],Bosc de Plana en Blanca被标记为[Bosc][Bosc Plana][Plana Blanca][Bosc Plana Blanca]。这样,您就可以在查询的某些部分上进行“精确匹配” 然后我查询用户传递的确切字符串,尽管也可能有一些调整。我采用了简单的案例,以使结果更符合您的要求 下面是我正在使用的代码(lucene 3.0.3):
下面是用于排序的代码。尽管我认为,考虑到城市规模,增加一个自定义评分会更有意义,而不是对人口进行残酷的排序。另外请注意,这使用了FieldCache,这可能不是关于内存使用的最佳解决方案
/**
 * Demonstrates shingle-based indexing of city names so that multi-word
 * queries can get "exact-ish" phrase matches, combined with a custom sort
 * that orders hits by descending population (Lucene 3.0.x API).
 *
 * Index time uses an analyzer with shingles (word n-grams) enabled;
 * query time uses the same analyzer with shingles disabled.
 */
public class ShingleFilterTests {
    private Analyzer analyzer;       // index-time analyzer (shingles up to size 3)
    private IndexSearcher searcher;
    private IndexReader reader;
    private QueryParser qp;          // parses queries with a shingle-free analyzer
    private Sort sort;               // custom population-descending sort

    /**
     * Builds an analyzer that tokenizes on whitespace, removes the stop words
     * "de"/"la"/"en", and, when {@code shingles > 0}, appends a ShingleFilter
     * producing word n-grams up to that many tokens. Pass 0 to disable
     * shingling (used for the query side).
     */
    public static Analyzer createAnalyzer(final int shingles) {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream tokenizer = new WhitespaceTokenizer(reader);
                // StopFilter(false, ...) => do not preserve position increments for removed words.
                tokenizer = new StopFilter(false, tokenizer, ImmutableSet.of("de", "la", "en"));
                if (shingles > 0) {
                    tokenizer = new ShingleFilter(tokenizer, shingles);
                }
                return tokenizer;
            }
        };
    }

    /** Factory handed to SortField; produces population-descending comparators. */
    public class PopulationComparatorSource extends FieldComparatorSource {
        @Override
        public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
            return new PopulationComparator(fieldname, numHits);
        }

        /**
         * Compares hits on the int "population" field, largest first.
         * Sign convention follows the Lucene 3.0 FieldComparator contract:
         * negative means slot1/bottom sorts before the other value.
         */
        private class PopulationComparator extends FieldComparator {
            private final String fieldName;   // kept for reference; lookups use the literal "population"
            private Integer[] values;         // per-slot populations of the current top-N queue
            private int[] populations;        // FieldCache column for the current segment
            private int bottom;               // population of the weakest queued hit

            public PopulationComparator(String fieldname, int numHits) {
                values = new Integer[numHits];
                this.fieldName = fieldname;
            }

            @Override
            public int compare(int slot1, int slot2) {
                // Inverted natural order => bigger population ranks earlier.
                if (values[slot1] > values[slot2]) return -1;
                if (values[slot1] < values[slot2]) return 1;
                return 0;
            }

            @Override
            public void setBottom(int slot) {
                bottom = values[slot];
            }

            @Override
            public int compareBottom(int doc) throws IOException {
                // Same inverted order as compare(): negative keeps `bottom` ahead of `doc`.
                int value = populations[doc];
                if (bottom > value) return -1;
                if (bottom < value) return 1;
                return 0;
            }

            @Override
            public void copy(int slot, int doc) throws IOException {
                values[slot] = populations[doc];
            }

            @Override
            public void setNextReader(IndexReader reader, int docBase) throws IOException {
                /* XXX uses field cache: loads the whole "population" column into
                 * memory per (sub-)reader — simple, but costly on large indexes. */
                populations = FieldCache.DEFAULT.getInts(reader, "population");
            }

            @Override
            public Comparable value(int slot) {
                return values[slot];
            }
        }
    }

    /** Builds an in-memory index of four cities and wires up searcher, parser and sort. */
    @Before
    public void setUp() throws Exception {
        Directory dir = new RAMDirectory();
        analyzer = createAnalyzer(3);
        IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        ImmutableList<String> cities = ImmutableList.of("Bosc de Planavilla", "Planavilla", "Bosc de la Planassa",
                "Bosc de Plana en Blanca");
        ImmutableList<Integer> populations = ImmutableList.of(5000, 20000, 1000, 100000);
        for (int id = 0; id < cities.size(); id++) {
            Document doc = new Document();
            doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("city", cities.get(id), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("population", String.valueOf(populations.get(id)),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();
        // Query side: same analyzer family but WITHOUT shingles.
        qp = new QueryParser(Version.LUCENE_30, "city", createAnalyzer(0));
        sort = new Sort(new SortField("population", new PopulationComparatorSource()));
        searcher = new IndexSearcher(dir);
        searcher.setDefaultFieldSortScoring(true, true);
        reader = searcher.getIndexReader();
    }

    @After
    public void tearDown() throws Exception {
        searcher.close();
    }

    @Test
    public void testShingleFilter() throws Exception {
        System.out.println("shingle filter");
        printSearch("city:\"Bosc de Planavilla\"");
        printSearch("city:Planavilla");
        printSearch("city:Bosc");
    }

    /** Parses {@code query}, runs it with the population sort, prints the top 4 hits. */
    private void printSearch(String query) throws ParseException, IOException {
        Query q = qp.parse(query);
        System.out.println("query " + q);
        TopDocs hits = searcher.search(q, null, 4, sort);
        System.out.println("results " + hits.totalHits);
        int i = 1;
        for (ScoreDoc dc : hits.scoreDocs) {
            Document doc = reader.document(dc.doc);
            System.out.println(i++ + ". " + dc + " \"" + doc.get("city") + "\" population: " + doc.get("population"));
        }
        System.out.println();
    }
}
谢谢你的回复,维森。实际上,名称字段是使用标准标记器、标准标记过滤器、小写标记过滤器和停止标记过滤器编制索引的。但这很容易改变。我的问题实际上也是我应该如何索引和查询解析?非常感谢!您的方法与我最终使用的方法类似,并且产生了良好的结果。但它并不完美……在一个300万文档索引上,我得到的响应时间高达1秒(在一台机器上)。此外,我经常会遇到一些奇怪的事情,比如在搜索“印第安酒吧巴黎”时,它返回了“富有的印第安酒吧保护区”,而这并不是我想要的:)。如果可能的话,我将尝试使用评分和索引时间增加(取决于功能类型)来进一步完善这一点。谢谢你的帮助!对于300万份文档来说,1秒听起来太长了。你是怎么排序的?您可以使用探查器来检查CPU的去向。我正在搜索一个4000万文档索引,其中包含复杂的查询、刻面和自定义排序,耗时约70毫秒。
query city:"Bosc Planavilla"
results 1
1. doc=0 score=1.143841 "Bosc de Planavilla" population: 5000
query city:Planavilla
results 2
1. doc=1 score=1.287682 "Planavilla" population: 20000
2. doc=0 score=0.643841 "Bosc de Planavilla" population: 5000
query city:Bosc
results 3
1. doc=0 score=0.5 "Bosc de Planavilla" population: 5000
2. doc=2 score=0.5 "Bosc de la Planassa" population: 1000
3. doc=3 score=0.375 "Bosc de Plana en Blanca" population: 100000
/**
 * Shingle-analyzer experiment: city names are indexed as word n-grams so
 * multi-word queries behave like near-exact matches, and hits are ordered
 * by population, largest city first, via a FieldCache-backed comparator.
 * Written against the Lucene 3.0.x API.
 */
public class ShingleFilterTests {
    private Analyzer analyzer;
    private IndexSearcher searcher;
    private IndexReader reader;
    private QueryParser qp;
    private Sort sort;

    /**
     * Creates an analyzer: whitespace tokenization, the stop words
     * "de"/"la"/"en" removed, then word shingles of up to {@code shingles}
     * tokens when {@code shingles} is positive (0 yields a plain analyzer
     * for the query side).
     */
    public static Analyzer createAnalyzer(final int shingles) {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream stream = new WhitespaceTokenizer(reader);
                stream = new StopFilter(false, stream, ImmutableSet.of("de", "la", "en"));
                return shingles > 0 ? new ShingleFilter(stream, shingles) : stream;
            }
        };
    }

    /** Supplies comparators that rank documents by descending population. */
    public class PopulationComparatorSource extends FieldComparatorSource {
        @Override
        public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
            return new PopulationComparator(fieldname, numHits);
        }

        /** Orders hits on the int "population" field, biggest city first. */
        private class PopulationComparator extends FieldComparator {
            private final String fieldName;     // retained for reference only
            private final Integer[] slotValues; // populations of the queued top-N hits
            private int[] populations;          // FieldCache column of the current segment
            private int bottom;                 // weakest queued population

            public PopulationComparator(String fieldname, int numHits) {
                this.fieldName = fieldname;
                this.slotValues = new Integer[numHits];
            }

            @Override
            public int compare(int slot1, int slot2) {
                // Negated natural order => descending population.
                return -slotValues[slot1].compareTo(slotValues[slot2]);
            }

            @Override
            public void setBottom(int slot) {
                bottom = slotValues[slot];
            }

            @Override
            public int compareBottom(int doc) throws IOException {
                // Positive pushes `doc` behind `bottom` (descending order).
                final int population = populations[doc];
                if (population > bottom) return 1;
                if (population < bottom) return -1;
                return 0;
            }

            @Override
            public void copy(int slot, int doc) throws IOException {
                slotValues[slot] = populations[doc];
            }

            @Override
            public void setNextReader(IndexReader reader, int docBase) throws IOException {
                // FieldCache materializes the whole column in memory per reader.
                populations = FieldCache.DEFAULT.getInts(reader, "population");
            }

            @Override
            public Comparable value(int slot) {
                return slotValues[slot];
            }
        }
    }

    /** Indexes four cities into a RAMDirectory and wires searcher, parser and sort. */
    @Before
    public void setUp() throws Exception {
        Directory dir = new RAMDirectory();
        analyzer = createAnalyzer(3);
        IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        String[] cities = {
            "Bosc de Planavilla", "Planavilla", "Bosc de la Planassa", "Bosc de Plana en Blanca"
        };
        int[] populations = { 5000, 20000, 1000, 100000 };
        for (int id = 0; id < cities.length; id++) {
            Document doc = new Document();
            doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("city", cities[id], Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("population", String.valueOf(populations[id]),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();
        // Query side deliberately uses a shingle-free analyzer.
        qp = new QueryParser(Version.LUCENE_30, "city", createAnalyzer(0));
        sort = new Sort(new SortField("population", new PopulationComparatorSource()));
        searcher = new IndexSearcher(dir);
        searcher.setDefaultFieldSortScoring(true, true);
        reader = searcher.getIndexReader();
    }

    @After
    public void tearDown() throws Exception {
        searcher.close();
    }

    @Test
    public void testShingleFilter() throws Exception {
        System.out.println("shingle filter");
        printSearch("city:\"Bosc de Planavilla\"");
        printSearch("city:Planavilla");
        printSearch("city:Bosc");
    }

    /** Parses and runs {@code query}, printing the top 4 hits in sort order. */
    private void printSearch(String query) throws ParseException, IOException {
        Query parsed = qp.parse(query);
        System.out.println("query " + parsed);
        TopDocs hits = searcher.search(parsed, null, 4, sort);
        System.out.println("results " + hits.totalHits);
        int rank = 1;
        for (ScoreDoc hit : hits.scoreDocs) {
            Document doc = reader.document(hit.doc);
            System.out.println(rank++ + ". " + hit + " \"" + doc.get("city") + "\" population: " + doc.get("population"));
        }
        System.out.println();
    }
}
query city:"Bosc Planavilla"
results 1
1. doc=0 score=1.143841[5000] "Bosc de Planavilla" population: 5000
query city:Planavilla
results 2
1. doc=1 score=1.287682[20000] "Planavilla" population: 20000
2. doc=0 score=0.643841[5000] "Bosc de Planavilla" population: 5000
query city:Bosc
results 3
1. doc=3 score=0.375[100000] "Bosc de Plana en Blanca" population: 100000
2. doc=0 score=0.5[5000] "Bosc de Planavilla" population: 5000
3. doc=2 score=0.5[1000] "Bosc de la Planassa" population: 1000