Java / Lucene:查询中的禁止条款("-" 子句)不应被模糊化(词干化),该如何处理?
我正在写一个基于Lucene的过滤器:我有一些来自API的结果,我想强制执行这些结果以匹配某个查询(API有时不起作用)。由于结果是从API获得的,我基本上将它们存储在RAM中,对其进行索引和过滤。如果Lucene在我的索引中找到DOC,我认为这个DOC是可以的,如果不是,它将被过滤。 有时我希望它是模糊的,有时我不希望。有一个近似开关。因此,我使用StandardAnalyzer表示近似值=false,巴西利亚Analyzer表示近似值=true。好吗 问题是巴西利亚分析器近似否定项,我认为这不是一个很好的方法。例如,如果我需要“greve-trabalhadores”,则带有“greve do trabalho”的文档与查询匹配,但不应该匹配。如果我使用StandardAnalyzer,它工作得很好,如果我使用巴西利亚分析器,它会忽略所有包含“trabalh”的内容,因为有词干 我的解决方案是使用StandardAnalyzer重写禁止条款,它不进行词干/模糊分析。因此,查询中被禁止的部分,我将使用StandardAnalyzer,另一部分将使用巴西利亚Analyzer或标准(取决于近似开关) 问题是它(有时)不起作用 我的代码的一个小测试如下:Java “Lucene禁止条款”;“模糊化”;它应该在哪里';T,java,lucene,filtering,full-text-indexing,Java,Lucene,Filtering,Full Text Indexing,我正在写一个基于Lucene的过滤器:我有一些来自API的结果,我想强制执行这些结果以匹配某个查询(API有时不起作用)。由于结果是从API获得的,我基本上将它们存储在RAM中,对其进行索引和过滤。如果Lucene在我的索引中找到DOC,我认为这个DOC是可以的,如果不是,它将被过滤。 有时我希望它是模糊的,有时我不希望。有一个近似开关。因此,我使用StandardAnalyzer表示近似值=false,巴西利亚Analyzer表示近似值=true。好吗 问题是巴西利亚分析器近似否定项,我认为这
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Filters a list of API results against a Lucene query: results are indexed
 * into an in-memory {@link RAMDirectory}, the query is run against it, and
 * every result NOT matched by the query is marked as filtered.
 *
 * <p>When {@code fuzzy} is true the {@link BrazilianAnalyzer} (stemming) is
 * used, but prohibited ("-term") clauses are re-parsed with the non-stemming
 * {@link StandardAnalyzer} so that negations stay exact instead of being
 * stemmed/fuzzyfied.
 */
public class Lucene {

    private static final Logger log = Logger.getLogger(Lucene.class.getName());

    private final String[] fields = new String[] { "title" };

    // Both analyzers are created with an empty stop-word set so they differ
    // only in stemming behaviour, not in which tokens they drop.
    private final BrazilianAnalyzer analyzerBrazil = new BrazilianAnalyzer(
            Version.LUCENE_41, new CharArraySet(Version.LUCENE_41, Collections.emptyList(), true));
    private final StandardAnalyzer analyzerStandard = new StandardAnalyzer(
            Version.LUCENE_41, new CharArraySet(Version.LUCENE_41, Collections.emptyList(), true));

    private final MultiFieldQueryParser parserBrazil =
            new MultiFieldQueryParser(Version.LUCENE_41, fields, analyzerBrazil);
    private final MultiFieldQueryParser parserStandard =
            new MultiFieldQueryParser(Version.LUCENE_41, fields, analyzerStandard);

    /**
     * Marks as filtered every result whose title does not match {@code query}.
     *
     * @param query   a classic Lucene query string (may contain "-" prohibited clauses)
     * @param fuzzy   when true, use the stemming BrazilianAnalyzer for the
     *                non-prohibited part of the query
     * @param results results to filter in place; may be null or empty (no-op)
     */
    public void filter(String query, boolean fuzzy, List<Result> results) {
        if (results == null || results.isEmpty()) {
            return;
        }
        Directory index = null;
        DirectoryReader reader = null;
        try {
            Analyzer analyzer = fuzzy ? analyzerBrazil : analyzerStandard;
            Query q = fuzzy ? parserBrazil.parse(query) : parserStandard.parse(query);
            // Terms to ignore/prohibit shouldn't be fuzzyfied: swap each
            // prohibited clause of the stemmed parse for the corresponding
            // clause of a non-stemming StandardAnalyzer parse.
            if (fuzzy) {
                Query queryNoFuzzy = parserStandard.parse(query);
                // BUG FIX: the original tested only "q instanceof BooleanQuery"
                // and then cast queryNoFuzzy — both must be BooleanQuery before
                // either cast is safe.
                if (q instanceof BooleanQuery && queryNoFuzzy instanceof BooleanQuery) {
                    BooleanClause[] fuzzyClauses = ((BooleanQuery) q).getClauses();
                    BooleanClause[] plainClauses = ((BooleanQuery) queryNoFuzzy).getClauses();
                    // Assumes both parses of the same query string yield clauses
                    // in the same order, so index i lines up — TODO confirm for
                    // queries where the analyzers drop different tokens.
                    int n = Math.min(fuzzyClauses.length, plainClauses.length);
                    for (int i = 0; i < n; i++) {
                        if (plainClauses[i].isProhibited()) {
                            ((BooleanQuery) q).clauses().set(i, plainClauses[i]);
                        }
                    }
                }
            }
            log.info(q.toString());
            index = index(results, analyzer);
            if (index == null) {
                // Indexing failed (already logged); leave results unfiltered.
                return;
            }
            reader = DirectoryReader.open(index);
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs resultsFoundDocs = searcher.search(q, results.size());
            // Map matched documents back to the original Result objects via the
            // stored "index" field.
            List<Result> resultsFound = new ArrayList<Result>();
            for (ScoreDoc resultadoFiltro : resultsFoundDocs.scoreDocs) {
                log.info("Score " + resultadoFiltro.score);
                resultsFound.add(results.get(
                        Integer.parseInt(searcher.doc(resultadoFiltro.doc).get("index"))));
            }
            // Everything Lucene did not match is filtered out.
            for (Result result : results) {
                if (!resultsFound.contains(result)) {
                    result.setFiltered(true);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: the original called index.close() unconditionally (NPE
            // when index() returned null) and never closed the reader.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (index != null) {
                try {
                    index.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds an in-memory index of the given results using {@code analyzer}.
     *
     * @return the populated directory, or null if indexing failed
     */
    private Directory index(List<Result> resultados, Analyzer analyzer) {
        try {
            Directory index = new RAMDirectory();
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, analyzer);
            IndexWriter writer = new IndexWriter(index, config);
            indexResults(writer, analyzer, resultados);
            return index;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Adds one document per result: the title (analyzed, stored) plus the
     * result's position in the list so matches can be mapped back.
     * Always closes the writer, committing what was indexed.
     */
    private void indexResults(IndexWriter w, Analyzer analyzer, List<Result> resultados) throws IOException {
        try {
            for (int i = 0; i < resultados.size(); i++) {
                Document resultado = new Document();
                resultado.add(new TextField(fields[0], resultados.get(i).getTitle(), Field.Store.YES));
                resultado.add(new IntField("index", i, Field.Store.YES));
                w.addDocument(resultado, analyzer);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            w.close();
        }
    }

    /** Small manual test: "vivo -celular" should reject the sample title. */
    public static void main(String[] args) {
        List<Result> ocs = new ArrayList<Result>();
        Result rb = new Result("Vivo Celular - não instalação do produto");
        ocs.add(rb);
        System.out.println("ITEMS ____________________________");
        for (Result oc : ocs) {
            System.out.println(oc.getTitle());
        }
        System.out.println("ITEMS ____________________________");
        String query = "vivo -celular";
        System.out.println("\n >> QUERY " + query);
        new Lucene().filter(query, true, ocs);
        System.out.println("\nFOUND ____________________________");
        for (Result oc : ocs) {
            if (!oc.getFiltered()) {
                System.out.println(oc.getTitle());
            }
        }
        System.out.println("FOUND ____________________________");
    }
}
/**
 * One result returned by the API: a title plus a flag recording whether the
 * Lucene filter rejected it. Results start out unfiltered.
 */
class Result {

    private String title;
    private Boolean filtered = false;

    /** Creates an unfiltered result with the given title. */
    public Result(String title) {
        this.title = title;
    }

    /** @return the result's title text */
    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /** @return true once the filter has rejected this result */
    public Boolean getFiltered() {
        return filtered;
    }

    public void setFiltered(Boolean filtered) {
        this.filtered = filtered;
    }
}
import java.io.IOException;
导入java.util.ArrayList;
导入java.util.Collections;
导入java.util.List;
导入java.util.logging.Logger;
导入org.apache.lucene.analysis.Analyzer;
导入org.apache.lucene.analysis.br.BrazilianAnalyzer;
导入org.apache.lucene.analysis.standard.StandardAnalyzer;
导入org.apache.lucene.analysis.util.CharArraySet;
导入org.apache.lucene.document.document;
导入org.apache.lucene.document.Field;
导入org.apache.lucene.document.IntField;
导入org.apache.lucene.document.TextField;
导入org.apache.lucene.index.DirectoryReader;
导入org.apache.lucene.index.IndexWriter;
导入org.apache.lucene.index.IndexWriterConfig;
导入org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
导入org.apache.lucene.search.booleansclause;
导入org.apache.lucene.search.BooleanQuery;
导入org.apache.lucene.search.indexsearch;
导入org.apache.lucene.search.Query;
导入org.apache.lucene.search.ScoreDoc;
导入org.apache.lucene.search.TopDocs;
导入org.apache.lucene.store.Directory;
导入org.apache.lucene.store.RAMDirectory;
导入org.apache.lucene.util.Version;
公共级Lucene{
私有静态记录器log=Logger.getLogger(Lucene.class.getName());
私有字符串[]字段=新字符串[]{“标题”};
private BrazilianAnalyzer Analyzer Brazilian=新的巴西Analyzer(Version.LUCENE_41,new chararray set(Version.LUCENE_41,Collections.emptyList(),true));
private StandardAnalyzer Analyzer Standard=新的StandardAnalyzer(Version.LUCENE_41,new chararrySet(Version.LUCENE_41,Collections.emptyList(),true));
private multifiedqueryparser parserBrazil=新的multifiedqueryparser(Version.LUCENE_41,fields,analyzerBrazil);
private MultiFieldQueryParser parserStandard=新的MultiFieldQueryParser(Version.LUCENE_41,fields,analyzerStandard);
公共无效过滤器(字符串查询、布尔模糊、列表结果){
目录索引=null;
if(results==null | | results.size()==0){
返回;
}
试一试{
分析仪=模糊?分析仪巴西:分析仪标准;
查询q=fuzzy?parserBrazil.parse(查询):parserStandard.parse(查询);
//忽略/禁止的术语不应模糊化。。。
if(模糊){
queryquerynofuzzy=parserStandard.parse(查询);
if(布尔查询的q实例){
BooleanClause[]子句=((BooleanQuery)queryNoFuzzy.getClaires();
if(子句!=null&&clauses.length>0){
布尔子句=空;
for(int i=0;inew BrazilianAnalyzer(Version.LUCENE_41, stops, getNoStemmingSet(query));
/**
 * Collects the prohibited ("-term") words of a query so they can be passed to
 * the BrazilianAnalyzer as a stemming-exclusion set (negated terms must stay
 * exact, not stemmed).
 *
 * @param query the raw query string; may be null
 * @return a case-insensitive set of the prohibited words (empty if none)
 */
private CharArraySet getNoStemmingSet(String query) {
    List<String> prohibitedClauses = new ArrayList<String>();
    if (query != null) {
        for (String clause : query.split("\\s+")) {
            // BUG FIX: the original pre-checked query.contains(" -"), which
            // misses a query that *starts* with "-"; and it used
            // replace("-", ""), which also strips hyphens inside the term
            // itself ("-foo-bar" became "foobar"). Only the leading "-" is
            // the prohibition marker.
            if (clause.startsWith("-") && clause.length() > 1) {
                prohibitedClauses.add(clause.substring(1));
            }
        }
    }
    return new CharArraySet(Version.LUCENE_41, prohibitedClauses, true);
}
BrazilianAnalyzer.getDefaultStopSet()
/**
 * Variant of the stemming-exclusion builder that also understands prohibited
 * quoted phrases (-"a b c"): every word of such a phrase is excluded from
 * stemming, in addition to single prohibited terms (-term).
 *
 * @param query the raw query string; may be null
 * @return a case-insensitive set of all prohibited words (empty if none)
 */
private CharArraySet getNoStemmingSet(String query) {
    if (query == null) {
        return new CharArraySet(Version.LUCENE_41, Collections.emptyList(), true);
    }
    List<String> prohibitedClauses = new ArrayList<String>();
    for (int i = 0; i < query.length(); i++) {
        if (query.charAt(i) != '-') {
            continue;
        }
        // BUG FIX: the original read charAt(i + 1) without a bounds check,
        // throwing StringIndexOutOfBoundsException when the query ends in '-'.
        if (i + 1 < query.length() && query.charAt(i + 1) == '\"') {
            // Prohibited quoted phrase: take everything up to the closing
            // quote. BUG FIX: the original assumed the closing quote exists;
            // on an unbalanced quote indexOf returns -1 and substring threw.
            int closing = query.indexOf('\"', i + 2);
            int end = closing >= 0 ? closing : query.length();
            for (String quotedWord : query.substring(i + 2, end).split("\\s")) {
                prohibitedClauses.add(quotedWord);
            }
        } else {
            // Single prohibited term: runs to the next space or end of query.
            int space = query.indexOf(' ', i + 1);
            if (space > 0) {
                prohibitedClauses.add(query.substring(i + 1, space));
            } else {
                prohibitedClauses.add(query.substring(i + 1));
            }
        }
    }
    return new CharArraySet(Version.LUCENE_41, prohibitedClauses, true);
}