Java Solr WordDelimiterFilter + Lucene Highlighter
我试图让Lucene的Highlighter类与来自Solr的WordDelimiterFilter的令牌正常工作。它在90%的时间内工作,但如果匹配文本包含“,”字符,例如“1500”,则输出不正确: 预期:“测试1500此” 观察到:'测试11500此' 我目前不确定是Highlighter搞乱了重组,还是WordDelimiterFilter搞乱了标记化,但有些事情令人不快。以下是我的pom中的相关依赖项: org.apache.lucene lucene岩芯 2.9.3 罐子 编译 org.apache.lucene lucene荧光灯 2.9.3 罐子 编译 org.apache.solr solr核 1.4.0 罐子 编译 下面是一个简单的JUnit测试类,演示了这个问题:Java Solr WordDelimiterFilter+;Lucene荧光灯,java,solr,lucene,tokenize,lucene-highlighter,Java,Solr,Lucene,Tokenize,Lucene Highlighter,我试图让Lucene的Highlighter类与来自Solr的WordDelimiterFilter的令牌正常工作。它在90%的时间内工作,但如果匹配文本包含“,”字符,例如“1500”,则输出不正确: 预期:“测试1500此” 观察到:'测试11500此' 我目前不确定是Highlighter搞乱了重组,还是WordDelimiterFilter搞乱了标记化,但有些事情令人不快。以下是我的pom中的相关依赖项: org.apache.lucene lucene岩芯 2.9.3 罐子 编译 o
package test.lucene;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.StandardTokenizerFactory;
import org.apache.solr.analysis.WordDelimiterFilterFactory;
import org.junit.Test;
public class HighlighterTester {

    private static final String PRE_TAG = "<b>";
    private static final String POST_TAG = "</b>";

    /**
     * Highlights every match of {@code query} inside {@code text} by wrapping it
     * in bold tags, using a single fragment large enough to hold the whole input.
     *
     * @param query     the parsed query whose terms should be highlighted
     * @param fieldName the field the query scorer restricts itself to
     * @param text      the raw text to re-analyze and highlight
     * @return up to 10 highlighted fragments (here: at most one, covering all of text)
     */
    private static String[] highlightField( Query query, String fieldName, String text )
            throws IOException, InvalidTokenOffsetsException {
        QueryScorer scorer = new QueryScorer( query, fieldName );
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter( PRE_TAG, POST_TAG );
        Highlighter highlighter = new Highlighter( formatter, scorer );
        // One fragment big enough for the entire text, so nothing gets split.
        highlighter.setTextFragmenter( new SimpleFragmenter( Integer.MAX_VALUE ) );
        return highlighter.getBestFragments( getAnalyzer(), fieldName, text, 10 );
    }

    /**
     * Builds the analysis chain under test: a StandardTokenizer followed by a
     * WordDelimiterFilter configured to both split ("1,500" -> "1", "500") and
     * catenate ("1", "500" -> "1500") word/number parts.
     */
    private static Analyzer getAnalyzer() {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream( String fieldName, Reader reader ) {
                // Configure the delimiter filter before wiring the chain together.
                HashMap<String, String> options = new HashMap<String, String>();
                options.put( "generateWordParts", "1" );
                options.put( "generateNumberParts", "1" );
                options.put( "catenateWords", "1" );
                options.put( "catenateNumbers", "1" );
                options.put( "catenateAll", "0" );
                WordDelimiterFilterFactory delimiterFactory = new WordDelimiterFilterFactory();
                delimiterFactory.init( options );
                // Tokenize first, then let the delimiter filter split/rejoin tokens.
                TokenStream tokenized = new StandardTokenizerFactory().create( reader );
                return delimiterFactory.create( tokenized );
            }
        };
    }

    /**
     * Demonstrates the offset problem: the query "1500" matches the catenated
     * token produced from "1,500", and the highlighter is expected to wrap the
     * original comma form exactly once.
     */
    @Test
    public void TestHighlighter() throws ParseException, IOException, InvalidTokenOffsetsException {
        String fieldName = "text";
        String text = "test 1,500 this";
        String queryString = "1500";
        String expected = "test " + PRE_TAG + "1,500" + POST_TAG + " this";
        QueryParser parser = new QueryParser( Version.LUCENE_29, fieldName, getAnalyzer() );
        Query q = parser.parse( queryString );
        String[] observed = highlightField( q, fieldName, text );
        for ( int index = 0; index < observed.length; index++ ) {
            System.out.println( "\t" + index + ": '" + observed[index] + "'" );
        }
        if ( observed.length == 0 ) {
            assertTrue( "No matches found", false );
        }
        else {
            System.out.println( "Expected: '" + expected + "'\n" + "Observed: '" + observed[0] + "'" );
            assertEquals( expected, observed[0] );
        }
    }
}
package test.lucene;
导入静态org.junit.Assert.assertEquals;
导入静态org.junit.Assert.assertTrue;
导入java.io.IOException;
导入java.io.Reader;
导入java.util.HashMap;
导入org.apache.lucene.analysis.Analyzer;
导入org.apache.lucene.analysis.TokenStream;
导入org.apache.lucene.queryParser.ParseException;
导入org.apache.lucene.queryParser.queryParser;
导入org.apache.lucene.search.Query;
导入org.apache.lucene.search.Highlighter;
导入org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
导入org.apache.lucene.search.highlight.QueryScorer;
导入org.apache.lucene.search.highlight.SimpleFragmenter;
导入org.apache.lucene.search.highlight.SimpleHTMLFormatter;
导入org.apache.lucene.util.Version;
导入org.apache.solr.analysis.StandardTokenizerFactory;
导入org.apache.solr.analysis.WordDelimiterFilterFactory;
导入org.junit.Test;
公共级HighlighterTester{
私有静态最终字符串PRE_TAG=“”;
私有静态最终字符串POST_TAG=“”;
私有静态字符串[]高亮字段(查询、字符串字段名、字符串文本)
引发IOException,InvalidTokenOffsetSexException{
SimpleHTMLFormatter formatter=新的SimpleHTMLFormatter(前标记、后标记);
Highlighter Highlighter=新的Highlighter(格式化程序,新的QueryScore(查询,字段名));
highlighter.setTextFragmenter(新的SimpleFragmenter(Integer.MAX_值));
返回highlighter.getBestFragments(getAnalyzer(),字段名,文本,10);
}
专用静态分析器getAnalyzer(){
返回新分析器(){
@凌驾
公共令牌流令牌流(字符串字段名、读卡器){
//从标准标记器开始
TokenStream=新建StandardTokenizerFactory()。创建(读卡器);
//WordDelimiterFilter上的链
WordDelimiterFilterFactory WordDelimiterFilterFactory=新的WordDelimiterFilterFactory();
HashMap参数=新的HashMap();
参数。put(“generateWordParts”、“1”);
参数。put(“1”);
论点。付诸表决(“catenateWords”,“1”);
参数。put(“catenateNumbers”,“1”);
参数。put(“catenateAll”,“0”);
wordDelimiterFilterFactory.init(参数);
返回wordDelimiterFilterFactory.create(流);
}
};
}
@试验
public void TestHighlighter()引发ParseException、IOException、InvalidTokenOffsetSexException{
字符串fieldName=“text”;
String text=“test 1500 this”;
字符串queryString=“1500”;
字符串expected=“test”+前标签+“1500”+后标签+“this”;
QueryParser parser=newQueryParser(Version.LUCENE_29,fieldName,getAnalyzer());
Query q=parser.parse(queryString);
String[]observed=highlightField(q,fieldName,text);
对于(int i=0;i0){
System.out.println(“预期的:”+预期的+“'\n”+“观察到的:”+观察到的[0]+“”);
资产质量(预期、观察[0]);
}
否则{
assertTrue(“未找到匹配项”,false);
}
}
}
有人有什么想法或建议吗?这里有一个可能的原因。
您的荧光灯需要使用与搜索相同的分析仪。IIUC,您的代码使用默认分析器进行突出显示,即使它使用专门的分析器来解析查询。我相信您需要更改Fragmenter以使用特定的令牌流。进一步调查后,这似乎是Lucene Highlighter代码中的一个bug。正如你在这里看到的:
public class TokenGroup {
...
// Treats the current token as the start of a new group when its start offset
// is at or past the end of the group accumulated so far. As the surrounding
// discussion shows, this heuristic fails when a later token restarts at an
// earlier offset (e.g. the catenated "1,500" token spanning 5-10 arriving
// after "500" at 7-10), so overlapping tokens get split into separate groups.
protected boolean isDistinct() {
return offsetAtt.startOffset() >= endOffset;
}
...
代码试图通过检查起始偏移量是否大于前一个结束偏移量来确定一组令牌是否不同。这个问题说明了这种方法的问题。如果您要逐步查看令牌,您将看到它们如下所示:
0-4: 'test', 'test'
5-6: '1', '1'
7-10: '500', '500'
5-10: '1500', '1,500'
11-15: 'this', 'this'
由此可以看出,第三个标记在第二个标记结束后开始,但第四个标记与第二个标记开始的位置相同。预期的结果是将令牌2、3和4分组,但根据此实现,令牌3被视为与2分开,因此2会自行显示,然后3和4分组,留下此结果:
Expected: 'test <b>1,500</b> this'
Observed: 'test 1<b>1,500</b> this'
应为:'test <b>1,500</b> this'
观察到:'test 1<b>1,500</b> this'
我不确定这是否可以在没有两个过程的情况下完成,一个是获取所有索引,另一个是合并它们。此外,我不确定在这个具体案例之外会有什么影响。这里有人有什么想法吗
编辑
这是我最终的源代码。它能正确地对各组令牌进行分组。它似乎也比 Lucene Highlighter 的实现简单得多,但不可否认,由于我的应用程序只需满足自身需求,它不会处理不同级别的评分。
/**
 * Walks the token stream, scoring each token against {@code scorer}, and
 * collects the (startOffset, endOffset, score) triples into merged
 * {@link TokenGroup}s, which are then turned into text fragments.
 *
 * @param tokenStream analyzed stream over {@code text}
 * @param text        the original text the offsets refer to
 * @param scorer      assigns a relevance score to each token
 * @return alternating normal/highlighted fragments covering all of text
 */
public TextFragments<E> getTextFragments( TokenStream tokenStream,
String text,
Scorer scorer )
throws IOException, InvalidTokenOffsetsException {
    // Register the attributes we read for every token before the scorer runs.
    OffsetAttribute offsets = (OffsetAttribute) tokenStream.addAttribute( OffsetAttribute.class );
    TermAttribute terms = (TermAttribute) tokenStream.addAttribute( TermAttribute.class );
    // The scorer may wrap the stream; if it does, iterate the wrapped one.
    TokenStream wrapped = scorer.init( tokenStream );
    if ( wrapped != null ) {
        tokenStream = wrapped;
    }
    TokenGroups groups = new TokenGroups();
    scorer.startFragment( null );
    while ( tokenStream.incrementToken() ) {
        groups.add( offsets.startOffset(), offsets.endOffset(), scorer.getTokenScore() );
        if ( log.isTraceEnabled() ) {
            // Trace line format: "<score> <start>-<end>: '<term>', '<raw text>'"
            log.trace( scorer.getTokenScore() + " "
                    + offsets.startOffset() + "-" + offsets.endOffset()
                    + ": '" + terms.term() + "', '"
                    + text.substring( offsets.startOffset(), offsets.endOffset() ) + "'" );
        }
    }
    return groups.fragment( text );
}
/**
 * One contiguous region of the source text: the character span
 * [startIndex, endIndex) plus the best score seen for any token in that span.
 */
private class TokenGroup {

    private int startIndex;
    private int endIndex;
    private float score;

    /**
     * @param spanStart inclusive start offset into the source text
     * @param spanEnd   exclusive end offset into the source text
     * @param spanScore score of the token(s) covering this span
     */
    public TokenGroup( int spanStart, int spanEnd, float spanScore ) {
        this.startIndex = spanStart;
        this.endIndex = spanEnd;
        this.score = spanScore;
    }
}
/**
 * Ordered, non-overlapping collection of {@link TokenGroup}s. Adding a group
 * that overlaps previously added groups merges them into one, which is what
 * makes split tokens ("1", "500") and their catenated form ("1,500") come out
 * as a single highlighted region.
 */
private class TokenGroups implements Iterable<TokenGroup> {

    private List<TokenGroup> tgs;

    public TokenGroups() {
        tgs = new ArrayList<TokenGroup>();
    }

    /** Convenience overload: wraps the raw offsets/score in a {@link TokenGroup}. */
    public void add( int startIndex, int endIndex, float score ) {
        add( new TokenGroup( startIndex, endIndex, score ) );
    }

    /**
     * Adds a group, merging it with every trailing group it overlaps.
     * Only the tail of the list is examined, which is sufficient when groups
     * arrive in non-decreasing start-offset order (as token streams emit them).
     */
    public void add( TokenGroup tg ) {
        for ( int i = tgs.size() - 1; i >= 0; i-- ) {
            if ( tg.startIndex < tgs.get( i ).endIndex ) {
                tg = merge( tg, tgs.remove( i ) );
            }
            else {
                break;
            }
        }
        tgs.add( tg );
    }

    /** Union of the two spans, keeping the higher of the two scores. */
    private TokenGroup merge( TokenGroup tg1, TokenGroup tg2 ) {
        return new TokenGroup( Math.min( tg1.startIndex, tg2.startIndex ),
                Math.max( tg1.endIndex, tg2.endIndex ),
                Math.max( tg1.score, tg2.score ) );
    }

    /**
     * Splits {@code text} into alternating normal/highlighted fragments:
     * unscored gaps between groups are normal, groups with score &gt; 0 are
     * highlighted, and any trailing text after the last group is normal.
     */
    private TextFragments<E> fragment( String text ) {
        TextFragments<E> fragments = new TextFragments<E>();
        int lastEndIndex = 0;
        for ( TokenGroup tg : this ) {
            if ( tg.startIndex > lastEndIndex ) {
                fragments.add( text.substring( lastEndIndex, tg.startIndex ), textModeNormal );
            }
            fragments.add(
                    text.substring( tg.startIndex, tg.endIndex ),
                    tg.score > 0 ? textModeHighlighted : textModeNormal );
            lastEndIndex = tg.endIndex;
        }
        // BUG FIX: compare against text.length(), not text.length() - 1.
        // The old check dropped the final character whenever the last group
        // ended exactly one character before the end of the text
        // (lastEndIndex == length - 1 made the condition false, so
        // substring(lastEndIndex) — the last char — was never appended).
        if ( lastEndIndex < text.length() ) {
            fragments.add( text.substring( lastEndIndex ), textModeNormal );
        }
        return fragments;
    }

    @Override
    public Iterator<TokenGroup> iterator() {
        return tgs.iterator();
    }
}