Python 使用布尔检索提取文件+;最大似然算法
我试图创建一个程序,从一个目录中提取文件,而不知道基于文档中写入的文本的文件名。使用布尔检索+ML算法(稍后我将添加该算法,因为现在正在处理布尔检索),在输入查询时,我接收到错误,但文档中存在单词。请帮我渡过难关Python 使用布尔检索提取文件+;最大似然算法,python,machine-learning,artificial-intelligence,Python,Machine Learning,Artificial Intelligence,我试图创建一个程序,从一个目录中提取文件,而不知道基于文档中写入的文本的文件名。使用布尔检索+ML算法(稍后我将添加该算法,因为现在正在处理布尔检索),在输入查询时,我接收到错误,但文档中存在单词。请帮我渡过难关 Enter your query: profit [] profit not found [] [] --------------------------------------------------------------------------- IndexError
Enter your query: profit
[]
profit not found
[]
[]
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-100-d08da3ab4dc8> in <module>()
55 files = []
56 print(zeroes_and_ones_of_all_words)
---> 57 lis = zeroes_and_ones_of_all_words[0]
58 cnt = 1
59 for index in lis:
IndexError: list index out of range
除此之外,如果有任何想法建议如何使用ML和AI方法通过关键字提取文件,那将是非常棒的。
谢谢请阅读标签的说明。
import nltk
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
import glob
import re
import os
import numpy as np
import sys
Stopwords = set(stopwords.words('english'))
all_words = []
dict_global = {}
file_folder = 'data/*'
idx =1
files_with_index = {}
for file in glob.glob(file_folder):
print(file)
fname = file
file = open(file,"r")
text = file.read()
text = remove_special_characters(text)
text = re.sub(re.compile('\d'),'',text)
sentences = sent_tokenize(text)
words = word_tokenize(text)
words = [word for word in words if len(words)>1]
words = [word.lower() for word in words]
words = [word for word in words if word not in Stopwords]
dict_global.update(finding_all_unique_words_and_freq(words))
files_with_index[idx] = os.path.basename(fname)
idx = idx + 1
unique_words_all = set(dict_global.keys())
def finding_all_unique_words_and_freq(words):
words_unique = []
word_freq = {}
for word in words:
if word not in words_unique:
words_unique.append(word)`enter code here`
for word in words_unique:
word_freq[word] = words.count(word)
return word_freq
def finding_freq_of_word_in_doc(word,words):
freq = words.count(word)
def remove_special_characters(text):
regex = re.compile('[^a-zA-Z0-9\s]')
text_returned = re.sub(regex,'',text)
return text_returned
class Node:
def __init__(self ,docId, freq = None):
self.freq = freq
self.doc = docId
self.nextval = None
class SlinkedList:
def __init__(self ,head = None):
self.head = head
linked_list_data = {}
for word in unique_words_all:
linked_list_data[word] = SlinkedList()
linked_list_data[word].head = Node(1,Node)
word_freq_in_doc = {}
idx = 1
for file in glob.glob(file_folder):
file = open(file, "r")
text = file.read()
text = remove_special_characters(text)
text = re.sub(re.compile('\d'),'',text)
sentences = sent_tokenize(text)
words = word_tokenize(text)
words = [word for word in words if len(words)>1]
words = [word.lower() for word in words]
words = [word for word in words if word not in Stopwords]
word_freq_in_doc = finding_all_unique_words_and_freq(words)
for word in word_freq_in_doc.keys():
linked_list = linked_list_data[word].head
while linked_list.nextval is not None:
linked_list = linked_list.nextval
linked_list.nextval = Node(idx ,word_freq_in_doc[word])
idx = idx + 1
query = input('Enter your query:')
query = word_tokenize(query)
connecting_words = []
cnt = 1
different_words = []
for word in query:
if word.lower() != "and" and word.lower() != "or" and word.lower() != "not":
different_words.append(word.lower())
else:
connecting_words.append(word.lower())
print(connecting_words)
total_files = len(files_with_index)
zeroes_and_ones = []
zeroes_and_ones_of_all_words = []
for word in (different_words):
if word.lower() in unique_words_all:
zeroes_and_ones = [0] * total_files
linkedlist = linked_list_data[word].head
print(word)
while linkedlist.nextval is not None:
zeroes_and_ones[linkedlist.nextval.doc - 1] = 1
linkedlist = linkedlist.nextval
zeroes_and_ones_of_all_words.append(zeroes_and_ones)
else:
print(word," not found")
print(zeroes_and_ones_of_all_words)
for word in connecting_words:
word_list1 = zeroes_and_ones_of_all_words[0]
word_list2 = zeroes_and_ones_of_all_words[1]
if word == "and":
bitwise_op = [w1 & w2 for (w1,w2) in zip(word_list1,word_list2)]
zeroes_and_ones_of_all_words.remove(word_list1)
zeroes_and_ones_of_all_words.remove(word_list2)
zeroes_and_ones_of_all_words.insert(0, bitwise_op);
elif word == "or":
bitwise_op = [w1 | w2 for (w1,w2) in zip(word_list1,word_list2)]
zeroes_and_ones_of_all_words.remove(word_list1)
zeroes_and_ones_of_all_words.remove(word_list2)
zeroes_and_ones_of_all_words.insert(0, bitwise_op);
elif word == "not":
bitwise_op = [not w1 for w1 in word_list2]
bitwise_op = [int(b == True) for b in bitwise_op]
zeroes_and_ones_of_all_words.remove(word_list2)
zeroes_and_ones_of_all_words.remove(word_list1)
bitwise_op = [w1 & w2 for (w1,w2) in zip(word_list1,bitwise_op)]
zeroes_and_ones_of_all_words.insert(0, bitwise_op);
files = []
print(zeroes_and_ones_of_all_words)
lis = zeroes_and_ones_of_all_words[0]
cnt = 1
for index in lis:
if index == 1:
files.append(files_with_index[cnt])
cnt = cnt+1
print(files)