如何在Python中创建trie
我对tries和DAWG(有向无环单词图,directed acyclic word graph)很感兴趣,我读了很多关于它们的资料,但我不明白输出的trie或DAWG文件应该是什么样子。(标签:python、trie、dawg)我想了解最好的输出结构,以便弄清楚如何创建和使用它;我还希望同时看到DAWG和trie的输出。我不想……(原文在此处被截断)具体问题如下:
- trie应该是嵌套字典的对象吗?每一个字母都被划分为多个字母,以此类推
- 如果有100k或500k个条目,那么在这样的字典上执行查找会很快吗
- 如何实现由多个单词组成的单词块,这些单词之间用 - 或空格分隔
- 如何将单词的前缀或后缀链接到结构中的另一部分?(适用于DAWG)
trie的一个简单实现如下。首先,是构造trie的函数:
# Sentinel key stored in a node to mark that a complete word ends there.
_end = '_end_'


def make_trie(*words):
    """Build a trie of nested dictionaries from ``words``.

    Every letter is a key whose value is the dictionary of letters that may
    follow it.  A node that completes a word additionally holds the ``_end``
    key.

    Args:
        *words: The words (iterables of letters) to store.

    Returns:
        The root dictionary of the trie.

    >>> make_trie('foo', 'bar', 'baz', 'barz') == {
    ...     'b': {'a': {'r': {'_end_': '_end_', 'z': {'_end_': '_end_'}},
    ...                 'z': {'_end_': '_end_'}}},
    ...     'f': {'o': {'o': {'_end_': '_end_'}}}}
    True
    """
    root = dict()
    for word in words:
        current_dict = root
        for letter in word:
            # setdefault returns the existing child node, or creates,
            # stores and returns a fresh one when the letter is new.
            current_dict = current_dict.setdefault(letter, {})
        current_dict[_end] = _end
    return root


def in_trie(trie, word):
    """Return True if ``word`` is stored in ``trie`` as a complete word.

    A mere prefix of a stored word does not count: the node reached after
    the last letter must carry the ``_end`` marker.

    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'baz')
    True
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'barz')
    True
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'barzz')
    False
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'bart')
    False
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'ba')
    False
    """
    current_dict = trie
    for letter in word:
        if letter not in current_dict:
            return False
        current_dict = current_dict[letter]
    return _end in current_dict
如果您不熟悉 setdefault:它只是在字典中查找一个键(在这里是 letter 或 _end)。如果键存在,则返回对应的值;如果不存在,它会为该键设置一个默认值并返回该值(这里是 {} 或 _end)。(它就像是一个同时会更新字典的 get。)
接下来,一个测试单词是否在trie中的函数:
# Sentinel key stored in a node to mark that a complete word ends there.
_end = '_end_'


def make_trie(*words):
    """Build a trie of nested dictionaries from ``words``.

    Every letter is a key whose value is the dictionary of letters that may
    follow it.  A node that completes a word additionally holds the ``_end``
    key.

    Args:
        *words: The words (iterables of letters) to store.

    Returns:
        The root dictionary of the trie.

    >>> make_trie('foo', 'bar', 'baz', 'barz') == {
    ...     'b': {'a': {'r': {'_end_': '_end_', 'z': {'_end_': '_end_'}},
    ...                 'z': {'_end_': '_end_'}}},
    ...     'f': {'o': {'o': {'_end_': '_end_'}}}}
    True
    """
    root = dict()
    for word in words:
        current_dict = root
        for letter in word:
            # Descend into the child for this letter, creating it if needed.
            current_dict = current_dict.setdefault(letter, {})
        current_dict[_end] = _end
    return root


def in_trie(trie, word):
    """Return True if ``word`` is stored in ``trie`` as a complete word.

    Walks the nested dictionaries one letter at a time.  The walk fails as
    soon as a letter is missing; otherwise the word is present only when the
    final node carries the ``_end`` marker, so bare prefixes are rejected.

    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'baz')
    True
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'barz')
    True
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'barzz')
    False
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'bart')
    False
    >>> in_trie(make_trie('foo', 'bar', 'baz', 'barz'), 'ba')
    False
    """
    current_dict = trie
    for letter in word:
        if letter not in current_dict:
            return False
        current_dict = current_dict[letter]
    return _end in current_dict
我将把插入和删除留给您作为练习
当然,Unwind的建议也不会难太多。查找正确的子节点需要进行线性搜索,这可能带来轻微的速度劣势。但是搜索范围仅限于可能的字符数——如果算上 _end,则为27个。而且,正如他所建议的那样,创建一个庞大的节点列表并通过索引访问它们也没有什么好处;你还不如直接把列表嵌套起来
最后,我要补充一点,创建一个有向无环单词图(DAWG)会有点复杂,因为您必须检测当前单词与结构中另一个单词共享后缀的情况。事实上,这可能会变得相当复杂,这取决于您想要如何构造DAWG!你可能需要学习一些相关的知识才能把事情做好 看看这个:
适用于Python(2.x和3.x)的静态、内存高效的Trie结构。
MARISA-trie中的字符串数据占用的内存可能比标准Python dict少得多(最多可少50到100倍);原始查找速度相当;trie还提供前缀搜索等快速的高级方法。
基于 marisa-trie C++ 库。
以下是一家成功使用marisa trie的公司的博客帖子:在Repustate,我们在文本分析中使用的许多数据模型可以表示为简单的键值对,或者Python术语中的字典。在我们的特殊情况下,我们的字典是巨大的,每个都有几百MB,需要不断地访问它们。事实上,对于一个给定的HTTP请求,可能会访问4或5个模型,每个模型进行20-30次查找。因此,我们面临的问题是如何使客户端的速度尽可能快,而服务器的速度尽可能轻
我找到了 marisa-trie 这个包,它是围绕 marisa-trie 的 C++ 实现的一个Python包装器。“MARISA”是 Matching Algorithm with Recursively Implemented StorAge(递归实现存储的匹配算法)的首字母缩写。marisa-trie 最棒的地方是它的存储机制确实减少了您需要的内存量。Python插件的作者声称大小减少了50-100倍——我们的经验与此类似
marisa-trie 包的优点在于,底层trie结构可以写入磁盘,然后通过内存映射对象读入。使用内存映射的 marisa-trie,我们的所有需求现在都得到了满足。我们的服务器的内存使用量急剧下降,下降了40%左右,而我们的性能与使用Python的字典实现时相比没有变化。还有一些纯Python实现,但是除非您在一个受限的平台上,否则您应该使用上面由C++支持的实现以获得最佳性能:
class Trie:
    """Trie node keyed by arbitrary hashable items.

    Every node stores a ``final`` flag (a stored key ends here) and a dict
    of child nodes.  Keys are sequences such as strings or lists; ``read``
    reports them back as lists of items.

    Truthiness of a node is its ``final`` flag, ``len`` is its number of
    direct children, and iterating a node yields the node itself followed by
    all of its descendants.
    """

    def __init__(self):
        self.__final = False
        self.__nodes = {}

    def __repr__(self):
        return 'Trie<len={}, final={}>'.format(len(self), self.__final)

    def __getstate__(self):
        # Pickle support: the whole state is the flag plus the child dict.
        return self.__final, self.__nodes

    def __setstate__(self, state):
        self.__final, self.__nodes = state

    def __len__(self):
        """Return the number of direct children of this node."""
        return len(self.__nodes)

    def __bool__(self):
        """Return True if a stored key ends at this node."""
        return self.__final

    def __contains__(self, array):
        """Return True if ``array`` is stored as a complete key."""
        try:
            return self[array].__final
        except KeyError:
            # Some item along the path has no node at all.
            return False

    def __iter__(self):
        """Yield this node, then every descendant node, depth first."""
        yield self
        for node in self.__nodes.values():
            yield from node

    def __getitem__(self, array):
        """Return the node reached by ``array``; raise KeyError if absent."""
        return self.__get(array, False)

    def create(self, array):
        """Store ``array`` as a key, creating any missing nodes."""
        self.__get(array, True).__final = True

    def read(self):
        """Yield every stored key as a list of items."""
        yield from self.__read([])

    def update(self, array):
        """Mark the existing node for ``array`` as a key (KeyError if absent)."""
        self[array].__final = True

    def delete(self, array):
        """Unmark ``array`` as a key (KeyError if its node is absent).

        The nodes themselves are kept; call ``prune`` to drop dead branches.
        """
        self[array].__final = False

    def prune(self):
        """Remove every subtree that no longer contains a stored key.

        A child is dropped only when, after pruning it recursively, it is
        neither final nor has any children left.  Stored keys are never
        altered.  (The earlier version cleared the flag of every leaf and
        deleted every non-final child, which erased all stored keys.)

        Returns:
            This node, to allow chaining.
        """
        for key, value in tuple(self.__nodes.items()):
            child = value.prune()
            if not child.__final and not child.__nodes:
                del self.__nodes[key]
        return self

    def __get(self, array, create):
        """Walk ``array`` from this node and return the node reached.

        Iterative, so long keys neither copy their tail at every level nor
        hit the recursion limit.  With ``create`` false a missing item raises
        KeyError; with ``create`` true missing nodes are added on the way.
        """
        node = self
        for head in array:
            if create and head not in node.__nodes:
                node.__nodes[head] = Trie()
            node = node.__nodes[head]
        return node

    def __read(self, name):
        """Yield the keys stored at or below this node, prefixed by ``name``."""
        if self.__final:
            yield name
        for key, value in self.__nodes.items():
            yield from value.__read(name + [key])
class-Trie:
定义初始化(自):
self.\uuu final=False
self._节点={}
定义报告(自我):
返回“Trie”。格式(len(self),self.\uu final)
定义获取状态(自身):
返回self.\u final,self.\u节点
定义设置状态(自身、状态):
self.\u final,self.\u节点=状态
定义(自我):
返回len(self.\u节点)
定义(自我):
返回自我。\u最终
def___;包含____;(自身,数组):
尝试:
返回自我[数组]
除KeyError外:
返回错误
定义(自我):
屈服于自己
对于self中的节点。\uuuu nodes.values():
节点屈服
定义uu获取项目uu uu(自我,
from collections import defaultdict


def _trie():
    """Return a trie node: a defaultdict whose missing keys become new nodes."""
    return defaultdict(_trie)


# Build a demo trie.  Indexing a node with a new character creates the child
# on the fly, so inserting a word is just a chain of lookups.
trie = _trie()
for s in ["cat", "bat", "rat", "cam"]:
    curr = trie
    for c in s:
        curr = curr[c]
    # Mark the end of a word; the stored value (None) is irrelevant, only
    # the presence of the "_end" key matters.
    curr.setdefault("_end")


def word_exist(trie, word):
    """Return True if ``word`` is stored in ``trie`` as a complete word.

    Membership is tested with ``in`` before indexing, so a failed lookup
    never creates nodes in the defaultdict.
    """
    curr = trie
    for w in word:
        if w not in curr:
            return False
        curr = curr[w]
    return '_end' in curr


print(word_exist(trie, 'cam'))
class Trie:
    """Trie stored as nested dictionaries; the key '*' marks the end of a word."""

    def __init__(self):
        # Per-instance root.  As a class attribute the dictionary was shared,
        # so every Trie object saw the words added to any other one.
        self.head = {}

    def add(self, word):
        """Insert ``word``, creating one nested dictionary per new character."""
        cur = self.head
        for ch in word:
            if ch not in cur:
                cur[ch] = {}
            cur = cur[ch]
        cur['*'] = True

    def search(self, word):
        """Return True if ``word`` was added as a complete word, else False."""
        cur = self.head
        for ch in word:
            if ch not in cur:
                return False
            cur = cur[ch]
        # A prefix of a stored word reaches a node without the '*' marker.
        return '*' in cur

    def printf(self):
        """Print the raw nested-dictionary representation."""
        print(self.head)
# Demo: store a single word, then probe a few lookups and dump the structure.
dictionary = Trie()
dictionary.add("hi")
# Extra sample words, intentionally left disabled so only "hi" is stored:
# dictionary.add("hello")
# dictionary.add("eye")
# dictionary.add("hey")
for query in ("hi", "hello", "hel", "he"):
    print(dictionary.search(query))
dictionary.printf()
True
False
False
False
{'h': {'i': {'*': True}}}
class Node:
    """Trie node with one child slot per lowercase letter 'a'..'z'."""

    def __init__(self):
        self.children = [None]*26
        self.isend = False


class trie:
    """Trie over the lowercase letters 'a'..'z' backed by fixed 26-slot nodes."""

    def __init__(self,):
        self.__root = Node()

    def __len__(self,):
        """Return the number of stored keywords."""
        return len(self.search_byprefix(''))

    def __str__(self):
        """Return every stored keyword, each followed by a newline."""
        return ''.join(key + '\n' for key in self.search_byprefix(''))

    def chartoint(self, character):
        """Return the offset of ``character`` from 'a' (no range check)."""
        return ord(character) - ord('a')

    def _slot(self, character):
        """Return the child-slot index of ``character``, or None if unsupported.

        The range check matters: a negative offset (e.g. from an uppercase
        letter) would otherwise wrap around and address the wrong child.
        """
        index = self.chartoint(character)
        if 0 <= index < 26:
            return index
        return None

    def _find(self, string):
        """Return the node reached by ``string``, or None if there is none."""
        ptr = self.__root
        for character in string:
            index = self._slot(character)
            if index is None:
                return None
            ptr = ptr.children[index]
            if ptr is None:
                return None
        return ptr

    def remove(self, string):
        """Unmark ``string`` as a keyword; its nodes are left in place.

        Raises:
            ValueError: if ``string`` is not a stored keyword.
        """
        ptr = self._find(string)
        if ptr is None or ptr.isend is not True:
            raise ValueError("Keyword doesn't exist in trie")
        ptr.isend = False
        return

    def insert(self, string):
        """Store ``string`` as a keyword.

        Raises:
            ValueError: if ``string`` contains a character outside 'a'..'z'.
        """
        # Validate first so that a rejected keyword leaves no partial path.
        slots = [self._slot(character) for character in string]
        if None in slots:
            raise ValueError("Keyword must contain only the letters a-z")
        ptr = self.__root
        for index in slots:
            if ptr.children[index] is None:
                ptr.children[index] = Node()
            ptr = ptr.children[index]
        ptr.isend = True

    def search(self, string):
        """Return True if ``string`` is a stored keyword, else False."""
        ptr = self._find(string)
        return ptr is not None and ptr.isend is True

    def __getall(self, ptr, key, key_list):
        """Append to ``key_list`` every keyword at or below ``ptr``.

        ``key`` is the text spelled by the path to ``ptr``.  Children are
        visited in slot order, so results come out alphabetically.
        """
        if ptr.isend == True:
            key_list.append(key)
        for i in range(26):
            if ptr.children[i] is not None:
                self.__getall(ptr.children[i], key + chr(ord('a') + i), key_list)

    def search_byprefix(self, key):
        """Return the stored keywords starting with ``key``, alphabetically.

        Returns None (not an empty list) when no node exists for ``key``.
        """
        ptr = self._find(key)
        if ptr is None:
            return None
        key_list = []
        self.__getall(ptr, key, key_list)
        return key_list
# Demo: fill the trie, then exercise lookup, prefix search, size and removal.
t = trie()
for keyword in ("shubham", "shubhi", "minhaj", "parikshit",
                "pari", "shubh", "minakshi"):
    t.insert(keyword)

print(t.search("minhaj"))
print(t.search("shubhk"))
print(t.search_byprefix('m'))
print(len(t))
# remove() has no return value, so this prints None.
print(t.remove("minhaj"))
print(t)
def make_trie(words):
    """Build a nested-dictionary trie from an iterable of words.

    Each character maps to the dictionary of characters that can follow it;
    the key "_end_" in a node marks the end of a stored word.
    """
    trie = {}
    for word in words:
        node = trie
        for char in word:
            # Reuse the child for this character, or create it when missing.
            node = node.setdefault(char, {})
        node["_end_"] = "_end_"
    return trie
from functools import reduce
from collections import defaultdict

# Recursive factory: every missing key yields another defaultdict of the
# same kind, so whole paths come into existence just by indexing.
T = lambda: defaultdict(T)
trie = T()
# Fold the letters of 'how' into successive lookups (creating the nodes on
# the way) and flag the node that is finally reached as the end of a word.
reduce(lambda node, letter: node[letter], 'how', trie)['isEnd'] = True
defaultdict(<function __main__.<lambda>()>,
{'h': defaultdict(<function __main__.<lambda>()>,
{'o': defaultdict(<function __main__.<lambda>()>,
{'w': defaultdict(<function __main__.<lambda>()>,
{'isEnd': True})})})})
# Look up the word 'how' by walking the trie one letter at a time.
curr = trie
for w in 'how':
    if w in curr:
        curr = curr[w]
    else:
        print("Not Found")
        break
else:
    # Reached only when every letter was found (no break), so a failed walk
    # is never also tested for the end marker.
    # .get() is used instead of curr['isEnd'] because indexing a defaultdict
    # with a missing key inserts that key, i.e. the lookup would modify the
    # trie.
    if curr.get('isEnd'):
        print('Found')
    else:
        # All letters exist but only as a prefix of a longer word.
        print("Not Found")
class TrieNode:
    """Single trie node."""

    def __init__(self):
        # Maps a letter to the TrieNode that follows it.
        self.children = {}
        # True when a stored word ends at this node.
        self.end = False


class Trie:
    """Trie of TrieNode objects with exact search and prefix auto-completion."""

    def __init__(self):
        self.root = TrieNode()

    def build_trie(self, words):
        """Insert every word from ``words``."""
        for entry in words:
            self.insert(entry)

    def insert(self, word):
        """Add ``word``, creating nodes for letters not seen on this path."""
        current = self.root
        for letter in word:
            try:
                current = current.children[letter]
            except KeyError:
                fresh = TrieNode()
                current.children[letter] = fresh
                current = fresh
        current.end = True

    def search(self, word):
        """Return True if ``word`` is stored as a complete word, else False."""
        current = self.root
        for letter in word:
            current = current.children.get(letter)
            if current is None:
                return False
        return current.end

    def _walk_trie(self, node, word, word_list):
        """Append to ``word_list`` every stored word strictly below ``node``.

        ``word`` is the text spelled by the path to ``node``.  Children are
        visited in insertion order, each word before its own extensions.
        """
        for letter, child in node.children.items():
            extended = word + letter
            if child.end:
                word_list.append(extended)
            self._walk_trie(child, extended, word_list)

    def auto_complete(self, partial_word):
        """Return the stored words that start with ``partial_word``.

        The list is empty when no stored word has that prefix; the prefix
        itself comes first when it is a stored word.
        """
        word_list = []
        current = self.root
        # Descend to the node for the last letter of the prefix.
        for letter in partial_word:
            if letter not in current.children:
                return word_list
            current = current.children[letter]
        if current.end:
            word_list.append(partial_word)
        self._walk_trie(current, partial_word, word_list)
        return word_list
# Demo: load a small vocabulary, then report whether two probes are stored.
t = Trie()
words = ['hi', 'hieght', 'rat', 'ram', 'rattle', 'hill']
t.build_trie(words)

words = ['hi', 'hello']
for word in words:
    found = t.search(word)
    print(word, found)
hi True
hello False
# Ask for every stored word that begins with the prefix 'ra'.
partial_word = 'ra'
# The result is not assigned or printed here; an interactive session would
# display the returned list.
t.auto_complete(partial_word)
['rat', 'rattle', 'ram']