KeyError in NLP sentence encoder

"corpus" is a list of strings (paragraphs). The error occurs on the line "sentence_embeddings = model.encode(dist_repr, is_pretokenized=True)". I first tried is_pretokenized=False, the default, and that gave a KeyError for an unknown word. So I added a vectorizer first and passed the transformed matrix to the encoder instead, which produced the error instead. I couldn't find a solution online; any help would be appreciated.

from sentence_transformers import SentenceTransformer, LoggingHandler
from sklearn.feature_extraction.text import CountVectorizer


# corpus is a list of strings (paragraphs)
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fixed_vocabulary_ = True
dist_repr = vectorizer.fit_transform(corpus)  # sparse document-term count matrix
dist_repr = dist_repr.todense()               # dense numpy matrix, one row per paragraph
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
sentence_embeddings = model.encode(dist_repr, is_pretokenized=True)  # raises the error below


The sentence-transformers documentation seems to prefer demonstrating operation on untokenized input strings (see its examples). So you probably don't want an early featurization step using a CountVectorizer representation, which yields a large sparse array of word counts rather than word tokens. I'd suggest editing your question to better describe the code/error/stack trace of the original KeyError you received; fixing that is likely a better path than further complicating the preprocessing. You should also show the exact minimal input that triggers the error, such as the actual offending sentence.
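
For example, a minimal sketch of the intended usage, passing the raw paragraph strings straight to encode() (the corpus contents here are made-up placeholders):

from sentence_transformers import SentenceTransformer

corpus = [
    "The first paragraph of text.",
    "Another paragraph to embed.",
]

model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# encode() runs the model's own tokenizer internally, so no
# CountVectorizer or manual tokenization step is needed
sentence_embeddings = model.encode(corpus)
print(sentence_embeddings.shape)  # (2, 768) for this model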

272 dist_repr = dist_repr.todense()
273 model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
--> 274 sentence_embeddings = model.encode(dist_repr, is_pretokenized = True)
275 dist_repr = sentence_embeddings
276

~/opt/anaconda3/lib/python3.7/site-packages/sentence_transformers/SentenceTransformer.py in encode(self, sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, is_pretokenized, device, num_workers)
173 iterator = tqdm(inp_dataloader, desc="Batches")
174
--> 175 for features in iterator:
176 for feature_name in features:
177 features[feature_name] = features[feature_name].to(device)

~/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
343
344 def __next__(self):
--> 345 data = self._next_data()
346 self._num_yielded += 1
347 if self._dataset_kind == _DatasetKind.Iterable and \

~/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
383 def _next_data(self):
384 index = self._next_index() # may raise StopIteration
--> 385 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
386 if self._pin_memory:
387 data = _utils.pin_memory.pin_memory(data)

~/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
45 else:
46 data = self.dataset[possibly_batched_index]
---> 47 return self.collate_fn(data)

~/opt/anaconda3/lib/python3.7/site-packages/sentence_transformers/SentenceTransformer.py in smart_batching_collate_text_only(self, batch)
421
422 for text in batch:
--> 423 sentence_features = self.get_sentence_features(text, max_seq_len)
424 for feature_name in sentence_features:
425 if feature_name not in feature_lists:

~/opt/anaconda3/lib/python3.7/site-packages/sentence_transformers/SentenceTransformer.py in get_sentence_features(self, *features)
320
321 def get_sentence_features(self, *features):
--> 322 return self._first_module().get_sentence_features(*features)
323
324 def get_sentence_embedding_dimension(self):

~/opt/anaconda3/lib/python3.7/site-packages/sentence_transformers/models/Transformer.py in get_sentence_features(self, tokens, pad_seq_length)
77 return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, padding='max_length', return_tensors='pt', truncation=True, prepend_batch_axis=True)
78 else:
---> 79 return self.tokenizer.prepare_for_model(tokens[0], tokens[1], max_length=pad_seq_length, padding='max_length', return_tensors='pt', truncation='longest_first', prepend_batch_axis=True)
80
81 def get_config_dict(self):

~/opt/anaconda3/lib/python3.7/site-packages/numpy/matrixlib/defmatrix.py in __getitem__(self, index)
191
192 try:
--> 193 out = N.ndarray.__getitem__(self, index)
194 finally:
195 self._getitem = False

IndexError: index 1 is out of bounds for axis 0 with size 1
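
As for why this specific IndexError appears: with is_pretokenized=True the library treats each batch item as a token sequence and, in the Transformer.get_sentence_features frame above, indexes tokens[0] and tokens[1]. But each item here is one row of the dense matrix from todense(), a numpy.matrix of shape (1, vocab_size), whose axis 0 has only a single entry, so tokens[1] fails. A minimal sketch reproducing just that indexing behavior (the vocabulary size of 5 is an arbitrary placeholder):

import numpy as np

row = np.matrix(np.zeros((1, 5)))  # one row of a dense CountVectorizer output
print(row.shape)    # (1, 5) -- numpy.matrix always keeps two dimensions
print(row[0].shape) # (1, 5) as well; indexing does not drop the row axis
row[1]  # IndexError: index 1 is out of bounds for axis 0 with size 1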