Search 搜索索引:“twelve”= 12
我想知道,在Lucene(或其他任何搜索引擎)中,对词项进行分词/索引的最佳方法是什么,才能让下列搜索匹配到相应的词项:“twelve”=“12”,“mx1”=“mx one”。Search 搜索索引:“twelve”= 12,search,lucene,full-text-search,pylucene,Search,Lucene,Full Text Search,Pylucene,我想知道,在Lucene(或其他任何搜索引擎)中,对词项进行分词/索引的最佳方法是什么,才能让下列搜索匹配到相应的词项:“twelve”=“12”,“mx1”=“mx one”。 是否有我忽略的内置功能?您看过Lucene中的同义词过滤器吗?Lucene中最简单的方法是创建两个单独的词元过滤器(token filter),在初始字符串完成分词之后使用。第一个过滤器需要在数字序列和非数字序列的边界处进行切分。第二个过滤器则将数字(数字串)转换为拼写出来的数词(英文单词)。 以下是PyLucene的示例(不包括偏移量和位置属性的处理逻辑): 使用同义词过滤器做同义词映射,难道不需要事先列出所有可能的映射吗?是的。但是
是否有我忽略的内置功能?您看过Lucene中的同义词过滤器吗?Lucene中最简单的方法是创建两个单独的词元过滤器(token filter),在初始字符串完成分词之后使用。第一个过滤器需要在数字序列和非数字序列的边界处进行切分。第二个过滤器则将数字(数字串)转换为拼写出来的数词(英文单词)。以下是PyLucene的示例(不包括偏移量和位置属性的处理逻辑):
使用同义词过滤器做同义词映射,难道不需要事先列出所有可能的映射吗?是的。但是在处理同义词时,考虑到IDF(逆文档频率)的影响,您最好还是事先做好这些工作。参见
class AlphaNumberBoundaryFilter(lucene.PythonTokenFilter):
    """Token filter that splits tokens at digit/non-digit boundaries.

    Every incoming token is cut into maximal runs of digits and maximal
    runs of non-digits, e.g. ``"mx1"`` becomes ``"mx"`` and ``"1"``.

    The upstream stream is consumed completely in the constructor and the
    split terms are replayed one at a time by ``incrementToken``.  Only the
    term text is set on the term attribute; no other attribute is touched.
    """

    # A maximal run of digits OR a maximal run of non-digits.  With a single
    # capturing group, ``findall`` returns the matched runs as plain strings.
    seq = re.compile(r"((?:\d+)|(?:\D+))")

    def __init__(self, in_stream):
        """Drain ``in_stream`` and prepare the split terms for replay.

        in_stream -- the upstream token stream whose terms are to be split.
        """
        lucene.PythonTokenFilter.__init__(self, in_stream)
        # The term attribute is read after each upstream incrementToken()
        # call to obtain the text of the current upstream token.
        term = self.term = self.addAttribute(lucene.TermAttribute.class_)
        # Get tokens.
        tokens = []
        while in_stream.incrementToken():
            tokens.append(term.term())
        # Filter tokens.
        self.tokens = self.filter(tokens)
        # Setup iterator.
        self.iter = iter(self.tokens)

    def filter(self, tokens):
        """Return the flat list of digit / non-digit runs of all ``tokens``.

        The order of the tokens, and of the runs inside each token, is
        preserved.
        """
        seq = self.seq
        return [split for token in tokens for split in seq.findall(token)]

    def incrementToken(self):
        """Advance to the next split term.

        Returns True after storing the next term in the term attribute, or
        False once every term has been emitted.
        """
        try:
            self.term.setTermBuffer(next(self.iter))
        except StopIteration:
            return False
        return True
class NumberToWordFilter(lucene.PythonTokenFilter):
    """Token filter that spells out all-digit tokens as English words.

    A token made only of digits is replaced by one term per word, e.g.
    ``"12"`` becomes ``"twelve"`` and ``"123"`` becomes ``"one"``,
    ``"hundred"``, ``"twenty"``, ``"three"``.  Any other token is passed
    through unchanged.

    The upstream stream is consumed completely in the constructor and the
    resulting terms are replayed one at a time by ``incrementToken``.  Only
    the term text is set on the term attribute; no other attribute is
    touched.
    """

    # Words for the values the conversion needs: 0-19, the tens, and the
    # scale words "hundred", "thousand" and "million".
    num_map = {0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty", 60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand", 1000000: "million"}
    # Matches tokens that consist of digits only.
    is_num = re.compile(r"^\d+$")

    def __init__(self, in_stream):
        """Drain ``in_stream`` and prepare the converted terms for replay.

        in_stream -- the upstream token stream whose numeric terms are to
                     be spelled out.
        """
        lucene.PythonTokenFilter.__init__(self, in_stream)
        # The term attribute is read after each upstream incrementToken()
        # call to obtain the text of the current upstream token.
        term = self.term = self.addAttribute(lucene.TermAttribute.class_)
        # Get tokens.
        tokens = []
        while in_stream.incrementToken():
            tokens.append(term.term())
        # Filter tokens.
        self.tokens = self.filter(tokens)
        # Setup iterator.
        self.iter = iter(self.tokens)

    def _group_words(self, group):
        """Return the words for one group of up to three digits.

        ``group`` holds the digits in REVERSED order: index 0 is the ones
        digit, index 1 the tens digit and index 2 the hundreds digit.  An
        all-zero group yields an empty list.
        """
        num_map = self.num_map
        size = len(group)
        words = []
        if size == 3 and group[2] != '0':
            words.append(num_map[int(group[2])])
            words.append(num_map[100])
        if size >= 2 and group[1] == '1':
            # 10-19 have dedicated words; un-reverse tens + ones to look
            # the value up directly.
            words.append(num_map[int(group[1::-1])])
        else:
            if size >= 2 and group[1] != '0':
                words.append(num_map[int(group[1]) * 10])
            if group[0] != '0':
                words.append(num_map[int(group[0])])
        return words

    def filter(self, tokens):
        """Return ``tokens`` with every all-digit token spelled out.

        Non-numeric tokens are copied unchanged.  A numeric token is
        replaced by its words, in reading order; a token consisting only of
        zeros becomes ``"zero"``.  Leading zeros are ignored.

        Scale words are built from "thousand" and "million" only: the
        group at scale ``s`` (0 = ones, 1 = thousands, ...) is followed by
        "thousand" when ``s`` is odd and by ``s // 2`` repetitions of
        "million".
        """
        num_map = self.num_map
        is_num = self.is_num
        final = []
        for token in tokens:
            if not is_num.match(token):
                final.append(token)
                continue
            # Reverse the digits so that slicing from index 0 groups them
            # from the least significant end.
            digits = token.lstrip('0')[::-1]
            if not digits:
                # We have a zero.
                final.append(num_map[0])
                continue
            # Group every 3 digits, then reverse the list of groups so the
            # most significant group comes first.  Each group itself stays
            # reversed (see _group_words).
            groups = [digits[i:i + 3] for i in range(0, len(digits), 3)][::-1]
            top_scale = len(groups) - 1
            for index, group in enumerate(groups):
                words = self._group_words(group)
                if not words:
                    # An all-zero group contributes nothing -- in particular
                    # no scale word, otherwise "1000000" would come out as
                    # "one million thousand".
                    continue
                final.extend(words)
                # Add scale modifier.
                scale = top_scale - index
                if scale % 2:
                    final.append(num_map[1000])
                final.extend([num_map[1000000]] * (scale // 2))
        return final

    def incrementToken(self):
        """Advance to the next converted term.

        Returns True after storing the next term in the term attribute, or
        False once every term has been emitted.
        """
        try:
            self.term.setTermBuffer(next(self.iter))
        except StopIteration:
            return False
        return True