Java Solr8.6中的矢量评分插件
我正在尝试升级到Solr 8.6。在检查Lucene 8.x的版本后,我了解到插件viz.CustomScoreQuery和CustomScoreProvider中使用的类是不推荐的,相反,我们必须使用FunctionScoreQuery和DoubleValuesSource。我做了很多搜索,但没有找到任何使用上述类实现自定义记分器的示例。我在java lucene论坛上偶然发现了这两个线程,该论坛基本上讨论了相同的问题,提到的解决方案是实现一个具有自定义逻辑的自定义DoubleValuesSource类。以下是实施情况Java Solr8.6中的矢量评分插件,java,solr,lucene,Java,Solr,Lucene,我正在尝试升级到Solr 8.6。在检查Lucene 8.x的版本后,我了解到插件viz.CustomScoreQuery和CustomScoreProvider中使用的类是不推荐的,相反,我们必须使用FunctionScoreQuery和DoubleValuesSource。我做了很多搜索,但没有找到任何使用上述类实现自定义记分器的示例。我在java lucene论坛上偶然发现了这两个线程,该论坛基本上讨论了相同的问题,提到的解决方案是实现一个具有自定义逻辑的自定义DoubleValuesSo
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DoubleValues;
import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
public class CustomDoubleValueSource extends DoubleValuesSource {
List<Double> vector;
private String field;
private boolean cosine;
double queryVectorNorm = 0;
public CustomDoubleValueSource(String field, String Vector, boolean cosine) {
// TODO Auto-generated constructor stub
super();
this.field = field;
this.cosine = cosine;
this.vector = new ArrayList<Double>();
String[] vectorArray = Vector.split(",");
for (int i = 0; i < vectorArray.length; i++) {
double v = Double.parseDouble(vectorArray[i]);
vector.add(v);
if (cosine) {
queryVectorNorm += Math.pow(v, 2.0);
}
}
System.out.println("Vector size:"+this.vector.size());
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
// TODO Auto-generated method stub
return false;
}
@Override
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
Terms terms = ctx.reader().terms(field);
TermsEnum te = terms == null ? null : terms.iterator();
System.out.println("Term size:"+terms.size());
if (vector.size() != terms.size()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"indexed and input vector array must have same length");
}
final PostingsEnum pe = te.postings(null);
// TODO Auto-generated method stub
return new DoubleValues() {
@Override
public double doubleValue() throws IOException {
// TODO Auto-generated method stub
float score = 0;
double docVectorNorm = 0;
BytesRef text;
while ((text = te.next()) != null) {
String term = text.utf8ToString();
float payloadValue = 0f;
PostingsEnum postings = te.postings(null, PostingsEnum.ALL);
while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = postings.freq();
while (freq-- > 0)
postings.nextPosition();
BytesRef payload = postings.getPayload();
payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
if (cosine)
docVectorNorm += Math.pow(payloadValue, 2.0);
}
score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
}
if (cosine) {
if ((docVectorNorm == 0) || (queryVectorNorm == 0))
return 0f;
return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
}
return score;
}
@Override
public boolean advanceExact(int doc) throws IOException {
// TODO Auto-generated method stub
if (pe.docID() > doc)
return false;
return pe.docID() == doc || pe.advance(doc) == doc;
}
};
}
@Override
public boolean needsScores() {
// TODO Auto-generated method stub
return true;
}
@Override
public DoubleValuesSource rewrite(IndexSearcher reader) throws IOException {
// TODO Auto-generated method stub
return null;
}
@Override
public int hashCode() {
// TODO Auto-generated method stub
return 0;
}
@Override
public boolean equals(Object obj) {
// TODO Auto-generated method stub
return false;
}
@Override
public String toString() {
// TODO Auto-generated method stub
return null;
}
}
另外,我将自定义查询实现升级到8.6特定版本,以避免查询未实现createWieght错误
下面是向量的实现
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
public class VectorQuery extends Query {
String queryStr = "";
Query q;
public VectorQuery(Query subQuery) {
this.q = subQuery;
}
public void setQueryString(String queryString){
this.queryStr = queryString;
}
public Weight createWeight(IndexSearcher searcher, ScoreMode needsScores, float boost) throws IOException {
Weight w;
if(q == null){
w = new ConstantScoreWeight(this, boost) {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
return new ConstantScoreScorer(this, score(), needsScores, DocIdSetIterator.all(context.reader().maxDoc()));
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
// TODO Auto-generated method stub
return false;
}
};
}else{
w = searcher.createWeight(q, needsScores, boost);
}
return w;
}
@Override
public String toString(String field) {
return queryStr;
}
@Override
public boolean equals(Object other) {
return sameClassAs(other) &&
queryStr.equals(other.toString());
}
@Override
public int hashCode() {
return classHash() ^ queryStr.hashCode();
}
}
我添加了print语句来检查执行流,并调用了CustomDoubleValueSource类。下面是日志的屏幕截图
但是执行流没有到达getValues方法。我得到以下错误
2020-10-21 16:55:09.578 ERROR (qtp1962826816-19) [ x:example_vector] o.a.s.s.HttpSolrCall null:java.lang.NullPointerException
at org.apache.lucene.queries.function.FunctionScoreQuery$MultiplicativeBoostValuesSource.getValues(FunctionScoreQuery.java:261)
at org.apache.lucene.queries.function.FunctionScoreQuery$FunctionScoreWeight.scorer(FunctionScoreQuery.java:224)
at org.apache.lucene.search.Weight.bulkScorer(Weight.java:181)
at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:658)
at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:445)
at org.apache.solr.search.SolrIndexSearcher.buildAndRunCollectorChain(SolrIndexSearcher.java:208)
at org.apache.solr.search.SolrIndexSearcher.getDocListNC(SolrIndexSearcher.java:1593)
at org.apache.solr.search.SolrIndexSearcher.getDocListC(SolrIndexSearcher.java:1410)
at org.apache.solr.search.SolrIndexSearcher.search(SolrIndexSearcher.java:593)
at org.apache.solr.handler.component.QueryComponent.doProcessUngroupedSearch(QueryComponent.java:1513)
at org.apache.solr.handler.component.QueryComponent.process(QueryComponent.java:403)
at org.apache.solr.handler.component.SearchHandler.handleRequestBody(SearchHandler.java:331)
at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:214)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:2606)
at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:812)
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:588)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:415)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1596)
at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:545)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:590)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1300)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:485)
at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1215)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221)
at org.eclipse.jetty.server.handler.InetAccessHandler.handle(InetAccessHandler.java:177)
at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:322)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.Server.handle(Server.java:500)
at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)
at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:547)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)
at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)
at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)
at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)
at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)
at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)
at java.lang.Thread.run(Thread.java:748)
我强烈怀疑这与CustomDoubleValuesSource中未实现的方法有关。我阅读了DoubleValuesSource的说明,但它们不是描述性的或不包含任何示例
非常感谢任何能帮助我继续前进的帮助:)我不熟悉这些代码,但可能会尝试帮助调试。。您是否尝试在
乘法BostValuesSource.getValues(FunctionScoreQuery.java:261)
处附加调试器以查看哪些参数为空?由于scores
参数在那里可以为null,因此它可能会给您一个提示-如果scoreser.createWeight
函数以某种方式返回null。。由于这是一个nullpointer异常,我不认为未实现的方法应该是一个问题(因为它们不是抽象的,它们应该返回一个工作默认值…)@MatsLindh Hey感谢您的评论。非常感谢,但是FunctionScoreQuery是Lucene依赖的一部分。所以不确定我们是否能做到这一点。您可以从github将签出的源代码加载到Solr+Lucene并编译并运行它(这很简单,我从头开始做了几次来创建补丁),或者您可以在您最喜欢的IDE中使用反编译器/调试器(至少eclipse和IntelliJ支持这样做)。大多数调试器还可以设置“异常断点”,如果抛出特定异常,您可以自动使调试器中断-它至少可以让您检查本地堆栈框架以查看设置了哪些变量。@MatsLindh我将研究它。另一个选项是,如果您可以添加文件名以及如何编译它们,可能有人(不确定我是否有时间)尝试在完整的设置中自己编译它们。
2020-10-21 16:55:09.578 ERROR (qtp1962826816-19) [ x:example_vector] o.a.s.s.HttpSolrCall null:java.lang.NullPointerException
at org.apache.lucene.queries.function.FunctionScoreQuery$MultiplicativeBoostValuesSource.getValues(FunctionScoreQuery.java:261)
at org.apache.lucene.queries.function.FunctionScoreQuery$FunctionScoreWeight.scorer(FunctionScoreQuery.java:224)
at org.apache.lucene.search.Weight.bulkScorer(Weight.java:181)
at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:658)
at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:445)
at org.apache.solr.search.SolrIndexSearcher.buildAndRunCollectorChain(SolrIndexSearcher.java:208)
at org.apache.solr.search.SolrIndexSearcher.getDocListNC(SolrIndexSearcher.java:1593)
at org.apache.solr.search.SolrIndexSearcher.getDocListC(SolrIndexSearcher.java:1410)
at org.apache.solr.search.SolrIndexSearcher.search(SolrIndexSearcher.java:593)
at org.apache.solr.handler.component.QueryComponent.doProcessUngroupedSearch(QueryComponent.java:1513)
at org.apache.solr.handler.component.QueryComponent.process(QueryComponent.java:403)
at org.apache.solr.handler.component.SearchHandler.handleRequestBody(SearchHandler.java:331)
at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:214)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:2606)
at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:812)
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:588)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:415)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1596)
at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:545)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:590)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1300)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:485)
at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1215)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221)
at org.eclipse.jetty.server.handler.InetAccessHandler.handle(InetAccessHandler.java:177)
at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:322)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.Server.handle(Server.java:500)
at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)
at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:547)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)
at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)
at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)
at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)
at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)
at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)
at java.lang.Thread.run(Thread.java:748)