How to create your own TokenFilter in PyLucene by inheriting from PythonTokenFilter

Tags: java, python, solr, lucene, pylucene

Hi all,

I am writing my own analyzer in PyLucene 4.9.0 and have created a compound-word TokenFilter for it, because the performance of DictionaryCompoundWordTokenFilter is not very good.

DictionaryCompoundWordTokenFilter uses a brute-force algorithm, whereas I only want to split a compound word when all of its subwords are in the dictionary, e.g. split "breastcancer" only when both "breast" and "cancer" are present in the given dictionary.

But when I run the program it reports "attribute 'length' of 'CharTermAttribute' objects is not readable", and I cannot find what is wrong. Thanks.
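
To be concrete, the splitting behaviour I am after is roughly this (a plain-Python sketch, independent of Lucene; the helper name, the dictionary and the word are just made-up test values):

def split_compound(word, dictionary, min_subword=2, max_subword=15):
    # Return the subwords only if dictionary entries cover the whole word;
    # otherwise return an empty list, i.e. do not split at all.
    if word in dictionary:
        return [word]
    for i in xrange(min_subword, min(max_subword, len(word) - min_subword) + 1):
        head, tail = word[:i], word[i:]
        if head in dictionary:
            rest = split_compound(tail, dictionary, min_subword, max_subword)
            if rest:
                return [head] + rest
    return []

# split_compound("breastcancer", {"breast", "cancer"}) -> ["breast", "cancer"]
# split_compound("breastcancer", {"cancer"})           -> []  (no split)

My full filter is below.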

from __future__ import division
import lucene, math, itertools

from java.lang import CharSequence
from java.io import IOException
from java.util import LinkedList
from org.apache.pylucene.analysis import PythonTokenStream
from org.apache.lucene.analysis import TokenFilter
from org.apache.pylucene.analysis import PythonTokenFilter
from org.apache.lucene.analysis import TokenStream
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.analysis.tokenattributes import OffsetAttribute
from org.apache.lucene.analysis.tokenattributes import PositionIncrementAttribute
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.util import AttributeSource
from org.apache.lucene.util import Version

class CompoundTokenFilter(PythonTokenFilter):

    def __init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE):
        super(CompoundTokenFilter,self).__init__(input)
        self.matchVersion=matchVersion
        self.dictionary=dictionary
        self.tokens=LinkedList()
        self.minWordSize=DEFAULT_MIN_WORD_SIZE
        self.minSubwordSize=DEFAULT_MIN_SUBWORD_SIZE
        self.maxSubwordSize=DEFAULT_MAX_SUBWORD_SIZE
        self.current=None  # no captured state yet
        self.termAtt=input.addAttribute(CharTermAttribute.class_)
        self.offsetAtt=input.addAttribute(OffsetAttribute.class_)
        self.posIncAtt=input.addAttribute(PositionIncrementAttribute.class_)
        self.input=input

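    # decompose() looks at the current term: if the whole term is in the
    # dictionary it is queued as a single CompoundToken; otherwise it collects
    # dictionary entries that occur in the term, searches for combinations of
    # them that exactly spell the term, and queues one CompoundToken per
    # matching subword.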
    def decompose(self):
        l=self.termAtt.length()
        s=self.termAtt.subSequence(0,l)
        if s in self.dictionary:
            self.tokens.add(CompoundTokenFilter.CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,0,l))
        else:

            d=filter(lambda x:len(x)>=self.minSubwordSize and len(x)<=self.maxSubwordSize and x in s,self.dictionary)
            if len(d)>0:
                start=int(math.floor(l/self.maxSubwordSize))
                end=int(math.ceil(l/self.minSubwordSize))
                subwords_combinations=[]
                for i in xrange(start,end+1):
                    subwords_combinations.extend(itertools.permutations(d,i))
                subwords_combinations=filter(lambda x:''.join(x)==s,subwords_combinations)
                subwords=sorted(set(reduce(lambda x,y:x+y,subwords_combinations)),key=lambda x:-1*len(x))
                for subword in subwords:
                    self.tokens.add(CompoundTokenFilter.CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,s.find(subword),s.find(subword)+len(subword)))

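    # incrementToken() first drains any queued subword tokens, restoring the
    # attribute state captured from the original compound token and emitting
    # each subword at position increment 0; only then does it pull the next
    # token from the wrapped stream and, if it is long enough, decompose it.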
    def incrementToken(self):
        if (not self.tokens.isEmpty()):
            assert self.current is not None
            token=self.tokens.removeFirst()
            self.input.restoreState(self.current)
            self.termAtt.setEmpty().append(token.txt)
            self.offsetAtt.setOffset(token.startOffset, token.endOffset)
            self.posIncAtt.setPositionIncrement(0)
            return True

        self.current=None

        if(self.input.incrementToken()):
            if self.termAtt.length()>=self.minWordSize:
                self.decompose()
                if not self.tokens.isEmpty():
                    self.current=self.input.captureState()
            return True
        else:
            return False

    def reset(self):
        super(CompoundTokenFilter,self).reset()
        self.tokens.clear()
        self.current=None

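    # CompoundToken holds the text and the start/end offsets of one subword;
    # the offset logic mirrors CompoundWordTokenFilterBase.CompoundToken in
    # Lucene's Java implementation.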
    class CompoundToken:
        def __init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE,offset,length):
            compoundTokenFilter=CompoundTokenFilter(matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE)
            self.txt=compoundTokenFilter.termAtt.subSequence(offset, offset + length)

            startOff = compoundTokenFilter.offsetAtt.startOffset()
            endOff = compoundTokenFilter.offsetAtt.endOffset()

            if matchVersion.onOrAfter(Version.LUCENE_4_4) or endOff - startOff != compoundTokenFilter.termAtt.length():
                self.startOffset = startOff
                self.endOffset = endOff
            else:
                newStart = startOff + offset
                self.startOffset = newStart
                self.endOffset = newStart + length
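
For reference, this is roughly how I exercise the filter (a sketch; the tokenizer choice, the input text, the dictionary contents and the size parameters are just test values I made up):

import lucene
lucene.initVM()

from java.io import StringReader
from org.apache.lucene.analysis.standard import StandardTokenizer
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.util import Version

version = Version.LUCENE_4_9
dictionary = CharArraySet(version, 2, True)  # small test dictionary, case-insensitive
dictionary.add("breast")
dictionary.add("cancer")

tokenizer = StandardTokenizer(version, StringReader("breastcancer"))
stream = CompoundTokenFilter(version, tokenizer, dictionary, 5, 2, 15)
stream.reset()
while stream.incrementToken():
    print stream.termAtt.toString()
stream.close()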