Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/337.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 在scikit学习管道中使用gensim word2vec_Python_Scikit Learn_Word2vec_Gensim - Fatal编程技术网

Python 在scikit学习管道中使用gensim word2vec

Python 在scikit学习管道中使用gensim word2vec,python,scikit-learn,word2vec,gensim,Python,Scikit Learn,Word2vec,Gensim,我试图在scikit学习管道中使用word2vec from sklearn.base import BaseEstimator, TransformerMixin import pandas as pd import numpy as np class ItemSelector(BaseEstimator, TransformerMixin): def __init__(self, key): self.key = key def fit(self, x,

我试图在scikit学习管道中使用
word2vec

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


from sklearn.pipeline import Pipeline
from gensim.sklearn_api import W2VTransformer
pipeline_word2vec = Pipeline([
                ('selector', ItemSelector(key='X')),
                ('w2v', W2VTransformer()),
            ])

pipeline_word2vec.fit(pd.DataFrame({'X':['hello world','is amazing']}), np.array([1,0]))
这给了我

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-11-9e2dd309d07c> in <module>()
     23                 ('w2v', W2VTransformer()),
     24             ])
---> 25 pipeline_word2vec.fit(pd.DataFrame({'X':['hello world','is amazing']}), np.array([1,0]))

/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    248         Xt, fit_params = self._fit(X, y, **fit_params)
    249         if self._final_estimator is not None:
--> 250             self._final_estimator.fit(Xt, y, **fit_params)
    251         return self
    252 

/usr/local/anaconda3/lib/python3.6/site-packages/gensim/sklearn_api/w2vmodel.py in fit(self, X, y)
     62             sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean,
     63             hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule,
---> 64             sorted_vocab=self.sorted_vocab, batch_words=self.batch_words
     65         )
     66         return self

/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/word2vec.py in __init__(self, sentences, size, alpha, window, min_count, max_vocab_size, sample, seed, workers, min_alpha, sg, hs, negative, cbow_mean, hashfxn, iter, null_word, trim_rule, sorted_vocab, batch_words, compute_loss, callbacks)
    525             batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
    526             hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
--> 527             fast_version=FAST_VERSION)
    528 
    529     def _do_train_job(self, sentences, alpha, inits):

/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base_any2vec.py in __init__(self, sentences, workers, vector_size, epochs, callbacks, batch_words, trim_rule, sg, alpha, window, seed, hs, negative, cbow_mean, min_alpha, compute_loss, fast_version, **kwargs)
    336             self.train(
    337                 sentences, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
--> 338                 end_alpha=self.min_alpha, compute_loss=compute_loss)
    339         else:
    340             if trim_rule is not None:

/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/word2vec.py in train(self, sentences, total_examples, total_words, epochs, start_alpha, end_alpha, word_count, queue_factor, report_delay, compute_loss, callbacks)
    609             sentences, total_examples=total_examples, total_words=total_words,
    610             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
--> 611             queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)
    612 
    613     def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1):

/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base_any2vec.py in train(self, sentences, total_examples, total_words, epochs, start_alpha, end_alpha, word_count, queue_factor, report_delay, compute_loss, callbacks)
    567             sentences, total_examples=total_examples, total_words=total_words,
    568             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
--> 569             queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)
    570 
    571     def _get_job_params(self, cur_epoch):

/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base_any2vec.py in train(self, data_iterable, epochs, total_examples, total_words, queue_factor, report_delay, callbacks, **kwargs)
    239             epochs=epochs,
    240             total_examples=total_examples,
--> 241             total_words=total_words, **kwargs)
    242 
    243         for callback in self.callbacks:

/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base_any2vec.py in _check_training_sanity(self, epochs, total_examples, total_words, **kwargs)
    599 
    600         if not self.wv.vocab:  # should be set by `build_vocab`
--> 601             raise RuntimeError("you must first build vocabulary before training the model")
    602         if not len(self.wv.vectors):
    603             raise RuntimeError("you must initialize vectors before training the model")

RuntimeError: you must first build vocabulary before training the model
---------------------------------------------------------------------------
运行时错误回溯(上次最近调用)
在()
23('w2v',w2v变压器()),
24             ])
--->25 pipeline_word2vec.fit(pd.DataFrame({'X':['hello world','is sapping']}),np.array([1,0]))
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py-in-fit(self,X,y,**fit_参数)
248 Xt,拟合参数=自拟合(X,y,**拟合参数)
249如果self.\u final\u估计器不是无:
-->250自我最终估计值拟合(Xt,y,**拟合参数)
251返回自我
252
/usr/local/anaconda3/lib/python3.6/site-packages/gensim/sklearn_api/w2vmodel.py适合(self,X,y)
62 sg=self.sg,hs=self.hs,negative=self.negative,cbow_mean=self.cbow_mean,
63 hashfxn=self.hashfxn,iter=self.iter,null\u word=self.null\u word,trim\u rule=self.trim\u rule,
--->64排序的单词=self.sorted的单词,批处理的单词=self.batch的单词
65         )
66回归自我
/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/word2vec.py in_uuu__init_uu(self、句子、大小、字母、窗口、最小计数、最大词汇量、样本、种子、工作者、最小字母、sg、hs、否定、cbow_均值、hashfxn、iter、空字、修剪规则、排序词汇量、批字、计算损失、回调)
525批字=批字,修剪规则=修剪规则,sg=sg,alpha=alpha,window=window,seed=seed,
526 hs=hs,负值=负值,cbow_平均值=cbow_平均值,min_alpha=min_alpha,compute_损耗=compute_损耗,
-->527快速版本=快速版本)
528
529定义做培训工作(自我、句子、字母、初始):
/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base_any2vec.py in_____init____(自我、句子、工作者、向量大小、时代、回调、批处理词、修剪规则、sg、alpha、窗口、种子、hs、负数、cbow_均值、min_alpha、计算损失、快速版、**kwargs)
336自动列车(
337个句子,总例数=self.corpus\u计数,epochs=self.epochs,start\u alpha=self.alpha,
-->338 end_alpha=self.min_alpha,compute_损耗=compute_损耗)
339其他:
340如果修剪规则不是无:
/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/word2vec.py in-train(self、句子、示例总数、单词总数、年代、起始字母、结束字母、单词计数、队列系数、报告延迟、计算丢失、回调)
609个句子,共例=共例,共词=共词,
610时代=时代,开始字母=开始字母,结束字母=结束字母,单词计数=单词计数,
-->611队列系数=队列系数,报告延迟=报告延迟,计算损失=计算损失,回调=回调)
612
613 def分数(自我、句子、句子总数=int(1e6)、chunksize=100、队列系数=2、报告延迟=1):
/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base_any2vec.py in train(self、句子、示例总数、单词总数、年代、起始字母、结束字母、单词计数、队列系数、报告延迟、计算丢失、回调)
567个句子,共例=共例,共词=共词,
568时代=时代,开始字母=开始字母,结束字母=结束字母,单词计数=单词计数,
-->569队列系数=队列系数,报告延迟=报告延迟,计算损失=计算损失,回调=回调)
570
571定义获得工作参数(自身、当前):
/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base_any2vec.py in train(self、data_iterable、epochs、total_示例、total_words、queue_factor、report_delay、callbacks、**kwargs)
239个时代=时代,
240个总示例=总示例,
-->241总字数=总字数,**kwargs)
242
243对于self.callbacks中的回调:
/usr/local/anaconda3/lib/python3.6/site-packages/gensim/models/base\u any2vec.py in\u check\u training\u santy(自我、时代、总示例、总单词、**kwargs)
599
600如果不是self.wv.vocab:#应该由'build_vocab'设置`
-->601 raise RUNTIMERROR(“在训练模型之前必须首先构建词汇表”)
602如果不是len(自矢量):
603 raise RUNTIMERROR(“训练模型之前必须初始化向量”)
RuntimeError:在训练模型之前,必须首先构建词汇表

在jupyter笔记本中。相反,我寻求一个训练有素的模特。如何解决此问题?

W2V变压器的
min\u count
参数默认为5。因此,这个错误仅仅是因为您只提供了2个文档,但要求词汇表中的每个单词至少出现在5个文档中

可能的解决办法:

  • 减少
    min\u计数

  • 为模型提供更多文档