Python 3.x Keras 独热编码(one-hot encoding)的内存管理——最佳解决方案
我知道这个问题在过去有不同的解决方法。但我无法理解并适应我的代码,需要帮助。我正在使用作为我的数据集。最终的期望是尝试为chatbot训练LSTM模型。但我被最初的一个热编码所困扰,并且内存不足。请注意,我正在培训的虚拟机内存为86GB,但仍然存在问题。在nmt_special_utils_mod.py中,一个热编码超出了分配的内存,我无法通过该阶段。在不丧失功能的情况下,任何替代方法都会很有帮助Python 3.x Keras One热编码内存管理-最佳解决方案,python-3.x,keras,lstm,chatbot,one-hot-encoding,Python 3.x,Keras,Lstm,Chatbot,One Hot Encoding,我知道这个问题在过去有不同的解决方法。但我无法理解并适应我的代码,需要帮助。我正在使用作为我的数据集。最终的期望是尝试为chatbot训练LSTM模型。但我被最初的一个热编码所困扰,并且内存不足。请注意,我正在培训的虚拟机内存为86GB,但仍然存在问题。在nmt_special_utils_mod.py中,一个热编码超出了分配的内存,我无法通过该阶段。在不丧失功能的情况下,任何替代方法都会很有帮助 Xoh = np.array(list(map(lambda x: to_categorical(
# One-hot encode each index sequence against its vocabulary size and stack the
# results into a single dense array (this is the step reported to exhaust memory).
Xoh = np.array([to_categorical(seq, num_classes=len(human_vocab)) for seq in X])
Yoh = np.array([to_categorical(seq, num_classes=len(machine_vocab)) for seq in Y])
所有代码如下,以明确问题
import_corpus_mod.py:
更改1:更新频率较低的单词删除
def data_load(data_dir='D:\\Script\\Python\\NLP\\chatbotSeq2SeqWithAtt\\ChatBot\\', threshold=5):
    """Load the movie-dialogue corpus and return cleaned question/answer pairs.

    Reads ``movie_lines.txt`` and ``movie_conversations_short.txt`` from
    *data_dir*, pairs every utterance of a conversation with the utterance
    that follows it, normalises the text and replaces every word that occurs
    fewer than *threshold* times (questions and answers counted together)
    with the token ``'<unk>'``.

    Args:
        data_dir: Directory containing the two corpus files.
        threshold: Minimum number of occurrences a word needs to be kept.

    Returns:
        A tuple ``(cleanest_questions, cleanest_answers)`` of two equally long
        lists of strings; item ``i`` of the second list answers item ``i`` of
        the first.

    Raises:
        OSError: If a corpus file cannot be opened.
        KeyError: If a conversation references a line ID that is not present
            (with all five fields) in ``movie_lines.txt``.
    """
    import os
    import re
    from collections import Counter

    field_separator = ' +++$+++ '

    # Applied strictly in this order: contractions first, then whitespace
    # collapsing, then punctuation removal.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"let's", "let us"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"howz", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"don't", "do not"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"wouldn't", "would not"),
        (r"wasn't", "was not"),
        (r"haven't", "have not"),
        (r"\s+", " "),
        (r"[-()\"#/@;:<>+=~|{}.?,]", ""),
        # Add further (pattern, replacement) pairs here.
    )

    def clean_text(text):
        """Lower-case *text*, expand known contractions and strip punctuation."""
        text = text.lower()
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text

    def read_lines(filename):
        """Return the content of a corpus file split on newlines."""
        # The context manager closes the handle; undecodable bytes are dropped.
        path = os.path.join(data_dir, filename)
        with open(path, encoding='utf-8', errors='ignore') as handle:
            return handle.read().split('\n')

    lines = read_lines('movie_lines.txt')
    conversations = read_lines('movie_conversations_short.txt')

    # Map each line ID to its text; records without exactly five fields are skipped.
    id2line = {}
    for line in lines:
        fields = line.split(field_separator)
        if len(fields) == 5:
            id2line[fields[0]] = fields[4]

    # Each conversation record ends with a list literal such as "['L1', 'L2']".
    # The last element of `conversations` is the blank tail after the final
    # newline, hence the [:-1].
    conversations_ids = []
    for conversation in conversations[:-1]:
        id_list = conversation.split(field_separator)[-1][1:-1]
        id_list = id_list.replace("'", "").replace(" ", "")
        conversations_ids.append(id_list.split(","))

    # Every utterance is treated as the question for the utterance after it.
    questions = []
    answers = []
    for conversation in conversations_ids:
        for question_id, answer_id in zip(conversation, conversation[1:]):
            questions.append(id2line[question_id])
            answers.append(id2line[answer_id])

    clean_questions = [clean_text(question) for question in questions]
    clean_answers = [clean_text(answer) for answer in answers]

    # Count word occurrences over questions and answers together.
    word2count = Counter()
    for sentence in clean_questions:
        word2count.update(sentence.split())
    for sentence in clean_answers:
        word2count.update(sentence.split())

    # A set gives O(1) membership tests while filtering every word below.
    frequent_words = {word for word, count in word2count.items() if count >= threshold}

    def replace_rare(sentence):
        """Replace every word outside the frequent vocabulary with '<unk>'."""
        return ' '.join(
            word if word in frequent_words else '<unk>'
            for word in sentence.split()
        )

    cleanest_answers = [replace_rare(answer) for answer in clean_answers]
    cleanest_questions = [replace_rare(question) for question in clean_questions]
    return cleanest_questions, cleanest_answers
from tqdm import tqdm
from import_corpus_mod import data_load
def load_dataset(clean_questions, clean_answers):
    """Pair up questions and answers and build the word vocabularies.

    :clean_questions: sequence of question strings (a ``None`` entry is skipped)
    :clean_answers: sequence of answer strings, indexed in parallel
    :returns: ``(dataset, human, machine, inv_machine, inv_human)`` where
        ``dataset`` is a list of ``(question, answer)`` tuples, ``human`` and
        ``machine`` map each whitespace-separated token (plus ``'<pad>'``) to
        an integer index, and the ``inv_*`` dicts map the indices back.
    """
    dataset = []
    human_vocab, machine_vocab = set(), set()

    for idx in tqdm(range(len(clean_questions))):
        hu = clean_questions[idx]
        mc = clean_answers[idx]
        if hu is None:
            continue
        dataset.append((hu, mc))
        human_vocab |= set(hu.split())
        machine_vocab |= set(mc.split())

    # Tokens are numbered in sorted order; '<pad>' takes the last index.
    human = {token: index
             for index, token in enumerate(sorted(human_vocab) + ['<pad>'])}
    machine = {token: index
               for index, token in enumerate(sorted(machine_vocab) + ['<pad>'])}

    inv_machine = {index: token for token, index in machine.items()}
    inv_human = {index: token for token, index in human.items()}

    return dataset, human, machine, inv_machine, inv_human
# Load the cleaned question/answer lists, then derive the paired dataset and
# the vocabularies from them.
clean_questions, clean_answers = data_load()
# human_vocab / machine_vocab map token -> index; the inv_* dicts map index -> token.
dataset, human_vocab, machine_vocab, inv_machine_vocab, inv_human_vocab = load_dataset(clean_questions, clean_answers)
然后,我按照您的建议修改了输入部分:
# Input of length Tx per sample; it feeds the Embedding layer below, so it is
# presumably a sequence of integer word indices rather than one-hot vectors.
Xi = Input(shape=(Tx,))
# Trainable, uniformly initialised embedding with 100 output dimensions, applied to Xi.
# NOTE(review): X is the Embedding output, not an Input tensor. The traceback
# reported with this snippet shows Model(inputs=[X, s0, c0]) being rejected for
# exactly that reason -- presumably Xi must be passed as the model input instead.
X = Embedding( human_vocab_size, 100, embeddings_initializer='uniform', input_length=Tx , trainable=True )(Xi)
# Two further inputs of width n_s named 's0' and 'c0'
# (presumably the initial LSTM hidden and cell state -- confirm against the model code).
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
# Working aliases that start out as the supplied s0 / c0.
s = s0
c = c0
结果出现了很多错误:
runfile('D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py', wdir='D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot')
Reloaded modules: nmt_data_load_asmain_words, import_corpus_mod, nmt_special_utils_mod
100%|██████████| 384/384 [00:00<00:00, 24615.06it/s]
100%|██████████| 384/384 [00:00<?, ?it/s]
X.shape: (384, 8)
Y.shape: (384, 8)
D:\Python\Anaconda3\lib\site-packages\keras\engine\topology.py:1592: UserWarning: Model inputs must come from a Keras Input layer, they cannot be the output of a previous non-Input layer. Here, a tensor specified as input to "model_2" was not an Input tensor, it was generated by layer embedding_1.
Note that input tensors are instantiated via `tensor = Input(shape)`.
The tensor that caused the issue was: embedding_1/Gather:0
str(x.name))
Traceback (most recent call last):
File "<ipython-input-44-addb6f9e6bc1>", line 1, in <module>
runfile('D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py', wdir='D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot')
File "D:\Python\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "D:\Python\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py", line 138, in <module>
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
File "D:/Script/Python/NLP/chatbotSeq2SeqWithAtt/ChatBot/nmt_code_mod.py", line 132, in model
model = Model(inputs=[X,s0,c0],outputs=outputs)
File "D:\Python\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "D:\Python\Anaconda3\lib\site-packages\keras\engine\topology.py", line 1652, in __init__
layer.__class__.__name__))
TypeError: Input layers to a `Model` must be `InputLayer` objects. Received inputs: [<tf.Tensor 'embedding_1/Gather:0' shape=(?, 8, 100) dtype=float32>, <tf.Tensor 's0_1:0' shape=(?, 64) dtype=float32>, <tf.Tensor 'c0_1:0' shape=(?, 64) dtype=float32>]. Input 0 (0-based) originates from layer type `Embedding`
runfile('D:/Script/Python/NLP/chatbotseq2seq with att/ChatBot/nmt_code_mod.py',wdir='D:/Script/Python/NLP/chatbotseq2seq with att/ChatBot')
Reloaded modules: nmt_data_load_asmain_words, import_corpus_mod, nmt_special_utils_mod
100%|██████████| 384/384 [00:00
我不建议使用独热编码和稠密矩阵。
如果您的词汇表有 100,000 个单词,那么一个 100,000 × 100,000 的稠密矩阵就会占用超过 70 GB 的内存。
您可以尝试使用稀疏矩阵。但我想这会改变您的其余代码。您可以看看这个
您可以使用的是词嵌入(word embedding)表示法,它紧凑、节省内存,并且被所有最先进的 NLP 系统采用。
无论如何,我认为您必须修改模型,即使用嵌入层(Embedding)来处理输入。
该层存储嵌入矩阵一次,然后您可以构建训练样本,只给出一个表示词汇表中单词索引的整数
如果您想要独热编码,可以使用一个 N×N 的单位矩阵作为嵌入矩阵,其中 N 是词汇表的大小。然后您可以将单词的索引以整数形式作为输入传入。这会增大模型的大小,但会减小每个批次数据的大小。
如果您想要 word2vec 嵌入,可以加载一个 N×V 的嵌入矩阵,其中 N 是词汇表的大小,V 是嵌入的维度。您会注意到,V 通常设置为 100 或 200 维,远小于 N,因此可以节省大量内存。
编辑:要澄清在您的案例中嵌入的用法:
您现在的写法是:
# Input of shape (Tx, human_vocab_size) per sample, i.e. one vector of vocabulary
# width per time step (the pre-computed one-hot representation).
X = Input(shape=(Tx, human_vocab_size))
# Two further inputs of width n_s named 's0' and 'c0'
# (presumably the initial LSTM hidden and cell state -- confirm against the model code).
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
# Working aliases that start out as the supplied s0 / c0.
s = s0
c = c0
您可以改为通过以下方式实现独热编码:
# Input of length Tx per sample; it feeds the Embedding layer below, so it is
# presumably a sequence of integer word indices rather than one-hot vectors.
Xi = Input(shape=(Tx,))
# Square embedding (human_vocab_size x human_vocab_size) with an identity
# initializer, so each index looks up its own one-hot row inside the model.
# NOTE(review): the initializer is passed as a class, not an instance -- confirm
# the installed Keras version accepts that. No `trainable` argument is given, so
# the rows are presumably updated during training -- confirm that is intended.
X = Embedding( human_vocab_size, human_vocab_size, embeddings_initializer=keras.initializers.Identity, input_length=Tx )(Xi)
# Two further inputs of width n_s named 's0' and 'c0'
# (presumably the initial LSTM hidden and cell state -- confirm against the model code).
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
# Working aliases that start out as the supplied s0 / c0.
s = s0
c = c0
通过这样做,您可以仅使用单词索引而不是一个热向量来构建训练样本。这将使您在训练样本中节省一些空间,但您的模型将更大。
如果它仍然太大,您将别无选择,只能使用密集嵌入。为此,您可以执行以下操作:
# Input of length Tx per sample; it feeds the Embedding layer below, so it is
# presumably a sequence of integer word indices rather than one-hot vectors.
Xi = Input(shape=(Tx,))
# Trainable, uniformly initialised embedding with 100 output dimensions
# (instead of human_vocab_size), applied to Xi.
X = Embedding( human_vocab_size, 100, embeddings_initializer='uniform', input_length=Tx , trainable=True )(Xi)
# Two further inputs of width n_s named 's0' and 'c0'
# (presumably the initial LSTM hidden and cell state -- confirm against the model code).
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
# Working aliases that start out as the supplied s0 / c0.
s = s0
c = c0
这将使用紧凑的表示法(维度为 100,而不是 human_vocab_size)随机初始化嵌入。这将节省大量内存。
最后,你可以用小写字母或用特殊的标记“稀有”替换稀有词(在语料库中只出现一到两次)来减少词汇量问题不在于热编码,而在于将整个数据集存储在内存中。明智的选择是使用生成器,或者使用允许您动态加载和编码数据的生成器。例如,这通常适用于大型图像数据集
我建议您执行所有预处理并保存输入、输出对,而不将其编码为csv文件,然后您可以创建一个生成器,用于延迟加载和编码:
class MySequence(Sequence):
    """Batch provider that encodes one batch at a time instead of the whole dataset.

    Template class: subclasses supply the actual encoding by overriding
    ``encode_batch``.

    Args:
        data: The un-encoded training samples. Assumed to support ``len()``
            and slicing (e.g. a list of (input, output) pairs read from the
            pre-processed file) -- TODO confirm against the caller.
        batch_size: Number of samples per batch.
    """

    def __init__(self, data, batch_size):
        super().__init__()
        self.data_file = data
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        # Uses the stored data; the attribute `x` read previously was never assigned.
        return int(np.ceil(len(self.data_file) / float(self.batch_size)))

    def __getitem__(self, batch_id):
        """Return the encoded ``(X, Y)`` pair for batch number *batch_id*."""
        # Get corresponding batch data...
        start = batch_id * self.batch_size
        batch = self.data_file[start:start + self.batch_size]
        # one-hot encode (only this batch, so memory stays bounded by batch_size)
        X, Y = self.encode_batch(batch)
        return X, Y

    def encode_batch(self, batch):
        """Encode a slice of raw samples into model inputs and targets.

        Must be overridden; it should return a tuple ``(X, Y)``.

        Raises:
            NotImplementedError: Always, until a subclass provides the encoding.
        """
        raise NotImplementedError(
            "MySequence.encode_batch must be overridden to encode a batch")
注意生成器(或序列[i])返回单个批处理。你说得对。这是一个启用了池的虚拟机-我可以看到它在崩溃之前的内存使用率将达到120GB。使用稀疏矩阵将改变一切。精度也可能会下降?通过单词嵌入,你的意思是像Glove或gensim一样?这不是更需要内存吗?啊,是的,我为此介绍了,只是没有介绍代码yet-不确定它将如何运行?您的问题是,向量的维数太大。因此,当您加载数据集时,它不适合内存。因此,您可以按照nuric所说的加载较小的批(注意了解完整的词汇表大小)。或者你可以使用Keras的嵌入层。我更新了我的回复,详细介绍了如何使用该嵌入层。我尽了最大努力将其分类,但在挠头2天后我放弃了。看起来太难了,超出了我的能力范围-我在本地也没有人讨论。我正在po中添加所有代码st-我不确定在编辑完文章后,如何继续使用unk替换的罕见功能。对于模型件,我尝试使用您的建议-但我猜Xoh和Yoh用于model.fit语句输出=列表(Yoh.swapaxes(0,1))model.fit([Xoh,s0,c0],输出,年代=年代,批次大小=5)我正在考虑使用手套矢量编码-但不确定我是否能很好地绕过它。我认为我们没有太多的选择余地了谢谢-我从来没有使用过keras序列。我将尝试一下。只是想知道这不是seq2seq训练的一个很常见的问题吗?有没有一个选项可以进行强化学习?我在ba的哪里学习t检查并更新重量等?这是一件很常见的事情,这就是为什么生成器返回单个批次,训练然后重复。我已经在这里更新了整个代码。但是无法在内存中找到合适的代码。
# Input of length Tx per sample; it feeds the Embedding layer below, so it is
# presumably a sequence of integer word indices rather than one-hot vectors.
Xi = Input(shape=(Tx,))
# Square embedding (human_vocab_size x human_vocab_size) with an identity
# initializer, so each index looks up its own one-hot row inside the model.
# NOTE(review): the initializer is passed as a class, not an instance -- confirm
# the installed Keras version accepts that. No `trainable` argument is given, so
# the rows are presumably updated during training -- confirm that is intended.
X = Embedding( human_vocab_size, human_vocab_size, embeddings_initializer=keras.initializers.Identity, input_length=Tx )(Xi)
# Two further inputs of width n_s named 's0' and 'c0'
# (presumably the initial LSTM hidden and cell state -- confirm against the model code).
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
# Working aliases that start out as the supplied s0 / c0.
s = s0
c = c0
# Input of length Tx per sample; it feeds the Embedding layer below, so it is
# presumably a sequence of integer word indices rather than one-hot vectors.
Xi = Input(shape=(Tx,))
# Trainable, uniformly initialised embedding with 100 output dimensions
# (instead of human_vocab_size), applied to Xi.
X = Embedding( human_vocab_size, 100, embeddings_initializer='uniform', input_length=Tx , trainable=True )(Xi)
# Two further inputs of width n_s named 's0' and 'c0'
# (presumably the initial LSTM hidden and cell state -- confirm against the model code).
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
# Working aliases that start out as the supplied s0 / c0.
s = s0
c = c0
class MySequence(Sequence):
    """Batch provider that encodes one batch at a time instead of the whole dataset.

    Template class: subclasses supply the actual encoding by overriding
    ``encode_batch``.

    Args:
        data: The un-encoded training samples. Assumed to support ``len()``
            and slicing (e.g. a list of (input, output) pairs read from the
            pre-processed file) -- TODO confirm against the caller.
        batch_size: Number of samples per batch.
    """

    def __init__(self, data, batch_size):
        super().__init__()
        self.data_file = data
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        # Uses the stored data; the attribute `x` read previously was never assigned.
        return int(np.ceil(len(self.data_file) / float(self.batch_size)))

    def __getitem__(self, batch_id):
        """Return the encoded ``(X, Y)`` pair for batch number *batch_id*."""
        # Get corresponding batch data...
        start = batch_id * self.batch_size
        batch = self.data_file[start:start + self.batch_size]
        # one-hot encode (only this batch, so memory stays bounded by batch_size)
        X, Y = self.encode_batch(batch)
        return X, Y

    def encode_batch(self, batch):
        """Encode a slice of raw samples into model inputs and targets.

        Must be overridden; it should return a tuple ``(X, Y)``.

        Raises:
            NotImplementedError: Always, until a subclass provides the encoding.
        """
        raise NotImplementedError(
            "MySequence.encode_batch must be overridden to encode a batch")