golang实现全文检索的有效途径
我试图在golang中实现一个简单的全文搜索,但我的所有实现都太慢,无法克服阈值 任务如下:golang实现全文检索的有效途径,go,binary-search-tree,trie,Go,Binary Search Tree,Trie,我试图在golang中实现一个简单的全文搜索,但我的所有实现都太慢,无法克服阈值 任务如下: 文档是由小写单词除以空格组成的非空字符串 每个文档都有一个隐式标识符,该标识符等于其在输入数组中的索引 New()构造索引 Search():接受一个查询,该查询也是一个由小写单词除以空格组成的字符串,并返回文档唯一标识符的排序数组,该数组包含来自查询的所有单词,而不考虑它们的顺序 例如: index := New([]string{ "this is the house that jack bui
- 文档是由以空格分隔的小写单词组成的非空字符串
- 每个文档都有一个隐式标识符,该标识符等于其在输入数组中的索引
- New()构造索引
- Search():接受一个查询(同样是由以空格分隔的小写单词组成的字符串),并返回已排序的文档唯一标识符数组;数组中的每个文档都包含查询中的所有单词,不考虑单词的顺序
index := New([]string{
"this is the house that jack built", //: 0
"this is the rat that ate the malt", //: 1
})
index.Search("") // -> []
index.Search("in the house that jack built") // -> []
index.Search("malt rat") // -> [1]
index.Search("is this the") // -> [0, 1]
我已经尝试实施:
- 每个文档和所有文档的二叉搜索树
- 每个文档和所有文档的trie(前缀树)
- 倒排索引搜索
// Index is an inverted index: it maps every word to the ids of the documents
// that contain it. Each id list is strictly increasing, which lets Search
// intersect lists with a linear merge and return results that are already
// sorted.
type Index struct {
	m map[string][]int
}

// New creates a fulltext search index for the given documents.
//
// A document's id is its position in docs. Documents are split into words on
// whitespace.
func New(docs []string) *Index {
	m := make(map[string][]int)
	for id, doc := range docs {
		for _, word := range strings.Fields(doc) {
			ids := m[word]
			// Documents are visited in increasing id order, so if this word was
			// already seen in the current document its id is the last entry.
			if n := len(ids); n > 0 && ids[n-1] == id {
				continue
			}
			m[word] = append(ids, id)
		}
	}
	return &Index{m: m}
}

// Search returns a slice of unique ids of documents that contain all words
// from the query.
//
// The ids are returned in increasing order. The result is never nil: an empty
// or whitespace-only query, or a query containing a word that appears in no
// document, yields an empty slice. The returned slice is a fresh copy and may
// be modified by the caller.
func (idx *Index) Search(query string) []int {
	words := strings.Fields(query)
	if len(words) == 0 {
		return []int{}
	}

	lists := make([][]int, 0, len(words))
	for _, word := range words {
		ids := idx.m[word]
		if len(ids) == 0 {
			return []int{}
		}
		lists = append(lists, ids)
	}

	// Intersect the shortest lists first so the candidate set shrinks as
	// quickly as possible.
	sort.Slice(lists, func(a, b int) bool { return len(lists[a]) < len(lists[b]) })

	// Copy the seed list: the intersection below overwrites its input, and the
	// index's own posting lists must stay untouched.
	result := make([]int, len(lists[0]))
	copy(result, lists[0])
	for _, ids := range lists[1:] {
		result = intersectSorted(result, ids)
		if len(result) == 0 {
			return []int{}
		}
	}
	return result
}

// intersectSorted returns the values present in both dst and other, which must
// both be strictly increasing. The result reuses dst's backing array.
func intersectSorted(dst, other []int) []int {
	out := dst[:0]
	i, j := 0, 0
	for i < len(dst) && j < len(other) {
		switch {
		case dst[i] < other[j]:
			i++
		case dst[i] > other[j]:
			j++
		default:
			// len(out) <= i here, so this write never clobbers an unread value.
			out = append(out, dst[i])
			i++
			j++
		}
	}
	return out
}
倒排索引的 New 和 Search 实现:
// Index is an inverted index: it maps every word to the ids of the documents
// that contain it. Each id list is strictly increasing, which lets Search
// intersect lists with a linear merge and return results that are already
// sorted.
type Index struct {
	m map[string][]int
}

// New creates a fulltext search index for the given documents.
//
// A document's id is its position in docs. Documents are split into words on
// whitespace.
func New(docs []string) *Index {
	m := make(map[string][]int)
	for id, doc := range docs {
		for _, word := range strings.Fields(doc) {
			ids := m[word]
			// Documents are visited in increasing id order, so if this word was
			// already seen in the current document its id is the last entry.
			if n := len(ids); n > 0 && ids[n-1] == id {
				continue
			}
			m[word] = append(ids, id)
		}
	}
	return &Index{m: m}
}

// Search returns a slice of unique ids of documents that contain all words
// from the query.
//
// The ids are returned in increasing order. The result is never nil: an empty
// or whitespace-only query, or a query containing a word that appears in no
// document, yields an empty slice. The returned slice is a fresh copy and may
// be modified by the caller.
func (idx *Index) Search(query string) []int {
	words := strings.Fields(query)
	if len(words) == 0 {
		return []int{}
	}

	lists := make([][]int, 0, len(words))
	for _, word := range words {
		ids := idx.m[word]
		if len(ids) == 0 {
			return []int{}
		}
		lists = append(lists, ids)
	}

	// Intersect the shortest lists first so the candidate set shrinks as
	// quickly as possible.
	sort.Slice(lists, func(a, b int) bool { return len(lists[a]) < len(lists[b]) })

	// Copy the seed list: the intersection below overwrites its input, and the
	// index's own posting lists must stay untouched.
	result := make([]int, len(lists[0]))
	copy(result, lists[0])
	for _, ids := range lists[1:] {
		result = intersectSorted(result, ids)
		if len(result) == 0 {
			return []int{}
		}
	}
	return result
}

// intersectSorted returns the values present in both dst and other, which must
// both be strictly increasing. The result reuses dst's backing array.
func intersectSorted(dst, other []int) []int {
	out := dst[:0]
	i, j := 0, 0
	for i < len(dst) && j < len(other) {
		switch {
		case dst[i] < other[j]:
			i++
		case dst[i] > other[j]:
			j++
		default:
			// len(out) <= i here, so this write never clobbers an unread value.
			out = append(out, dst[i])
			i++
			j++
		}
	}
	return out
}
//新建为给定文档创建全文搜索索引
func新建(文档[]字符串)*索引{
m:=make(map[string]map[int]bool)
对于i:=0;i
我是做错了什么,还是在golang中有更好的搜索算法?
非常感谢您的帮助。对于特定于语言的部分,我无法真正帮助您,但如果有任何帮助,下面是一个伪代码,它描述了一个Trie实现以及一个函数,可以以相当有效的方式解决您当前的问题
// A node of the trie (prefix tree) built over every word of every document.
struct TrieNode{
map[char] children // maps the next character of a word to the child node reached by it
set[int] contains // ids of all documents that contain the word ending at this node
}
// Classic trie lookup, except that it returns a set of document ids instead of
// a simple boolean.
//
// Starting at `node`, follows the characters w[depth], w[depth+1], ... one
// level at a time. Returns the set of ids of the documents that contain the
// word w, or an empty set when the trie has no path for w.
// Call it as get_doc_ids(root, w, 0).
function get_doc_ids(TrieNode node, string w, int depth){
    if (depth == length(w)){
        // every character has been consumed: this node represents the word w
        return node.contains
    }
    if (!node.hasChild(w[depth])){
        // no word in the trie continues with this character
        return empty_set()
    }
    return get_doc_ids(node.getChild(w[depth]), w, depth+1)
}
// Answers a query: returns the set of ids of the documents that contain every
// word of L. An empty list of words yields an empty set.
function answer_query(TrieNode root, list_of_words L){
    n = length(L)
    if (n == 0){
        return empty_set() // nothing to look up; also avoids reading L[0]
    }
    result = get_doc_ids(root, L[0], 0)
    for i from 1 to n-1 do {
        result = intersection(result, get_doc_ids(root, L[i], 0)) // set intersection
        if (result.is_empty()){
            break // no document contains them all, no need to check further
        }
    }
    return result
}
算法方面我帮不上忙,但是你看过 Bleve 吗?谢谢你的评论。事实上,按要求我不能使用现有的解决方案来实现全文搜索。而且我的问题肯定比 Bleve 所解决的问题简单得多。
// New builds an inverted index over docs and returns it.
//
// Every document is split into words on whitespace, and each word is mapped
// to the set of documents it occurs in. A document is recorded under its
// position in docs plus one.
func New(docs []string) *Index {
	postings := make(map[string]map[int]bool)
	for pos, doc := range docs {
		for _, word := range strings.Fields(doc) {
			set, seen := postings[word]
			if !seen {
				set = make(map[int]bool)
				postings[word] = set
			}
			set[pos+1] = true
		}
	}
	return &Index{m: postings}
}
// Search returns a slice of unique ids of documents that contain all words
// from the query.
//
// The query is split into words on whitespace. The ids are returned in
// increasing order. The result is never nil: an empty or whitespace-only
// query, or a query containing a word that is not in the index, yields an
// empty slice.
func (idx *Index) Search(query string) []int {
	words := strings.Fields(query)
	if len(words) == 0 {
		return []int{}
	}

	// Collect the posting set of every query word and remember the smallest
	// one: no result can contain more documents than the rarest word has.
	postings := make([]map[int]bool, len(words))
	rarest := 0
	for i, word := range words {
		postings[i] = idx.m[word]
		if len(postings[i]) == 0 {
			return []int{}
		}
		if len(postings[i]) < len(postings[rarest]) {
			rarest = i
		}
	}

	ids := make([]int, 0, len(postings[rarest]))
	for id := range postings[rarest] {
		ids = append(ids, id)
	}

	// Filter the candidates in place against every other word's posting set.
	for i, set := range postings {
		if i == rarest {
			continue
		}
		kept := ids[:0]
		for _, id := range ids {
			if set[id] {
				kept = append(kept, id)
			}
		}
		ids = kept
		if len(ids) == 0 {
			return []int{}
		}
	}

	// The index stores position+1; convert back to document positions.
	for i := range ids {
		ids[i]--
	}
	sort.Ints(ids)
	return ids
}
// A node of the trie (prefix tree) built over every word of every document.
struct TrieNode{
map[char] children // maps the next character of a word to the child node reached by it
set[int] contains // ids of all documents that contain the word ending at this node
}
// Classic trie lookup, except that it returns a set of document ids instead of
// a simple boolean.
//
// Starting at `node`, follows the characters w[depth], w[depth+1], ... one
// level at a time. Returns the set of ids of the documents that contain the
// word w, or an empty set when the trie has no path for w.
// Call it as get_doc_ids(root, w, 0).
function get_doc_ids(TrieNode node, string w, int depth){
    if (depth == length(w)){
        // every character has been consumed: this node represents the word w
        return node.contains
    }
    if (!node.hasChild(w[depth])){
        // no word in the trie continues with this character
        return empty_set()
    }
    return get_doc_ids(node.getChild(w[depth]), w, depth+1)
}
// Answers a query: returns the set of ids of the documents that contain every
// word of L. An empty list of words yields an empty set.
function answer_query(TrieNode root, list_of_words L){
    n = length(L)
    if (n == 0){
        return empty_set() // nothing to look up; also avoids reading L[0]
    }
    result = get_doc_ids(root, L[0], 0)
    for i from 1 to n-1 do {
        result = intersection(result, get_doc_ids(root, L[i], 0)) // set intersection
        if (result.is_empty()){
            break // no document contains them all, no need to check further
        }
    }
    return result
}