Api 类型错误:'<';在';str';和';int';Doc2Vec

Api 类型错误:'<';在';str';和';int';Doc2Vec,api,flask,doc2vec,Api,Flask,Doc2vec,你知道为什么会抛出这个错误吗 “TypeError:”如果您试图查找模型中不存在的字符串文档标记,不幸的是,您会得到这个令人困惑的错误,而不是更清晰的错误。(请参阅gensim的公开问题:) data['doc1']中的任何内容都不是模型中的标记 在尝试most_similable()操作之前,您可以通过查看data['doc1']是否在model.docvecs中,isTrueTypeError:“你的意思是标记'doc1'不在我的模型中吗?我的数据文件包含我训练过的.txt文档。在model

你知道为什么会抛出这个错误吗
“TypeError:”如果您试图查找模型中不存在的字符串文档标记,不幸的是,您会得到这个令人困惑的错误,而不是更清晰的错误。(请参阅gensim的公开问题:)

data['doc1']
中的任何内容都不是模型中的标记


在尝试
most_similable()
操作之前,您可以通过查看
data['doc1']是否在model.docvecs
中,is
True
TypeError:“你的意思是标记'doc1'不在我的模型中吗?我的数据文件包含我训练过的.txt文档。在model.py中使用我的jupyter笔记本中的相同代码,它在笔记本中工作正常。不,不管是什么,
数据['doc1']
正在返回,因此作为单个位置参数传递给
最相似()
,是或包含不在模型中的doctag。什么是
数据['doc1']
以及模型中的
数据['doc1']返回什么。docvecs
返回什么?
File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
2463, in __call__
return self.wsgi_app(environ, start_response)

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
2449, in wsgi_app
response = self.handle_exception(e)

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
1866, in handle_exception
reraise(exc_type, exc_value, tb)

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\_compat.py", line 
39, in reraise
raise value

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
2446, in wsgi_app
response = self.full_dispatch_request()

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
1951, in full_dispatch_request
rv = self.handle_user_exception(e)

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
1820, 
in handle_user_exception
reraise(exc_type, exc_value, tb)

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\_compat.py", line 
39, in reraise
raise value

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
1949, 
in full_dispatch_request
rv = self.dispatch_request()

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site-packages\flask\app.py", line 
1935, 
in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)

File "C:\Users\ibrahimm\Desktop\doc2vec-compare-doc-demo\app.py", line 56, in api_compare_2
vec1 = d2v_model.docvecs.most_similar(data['doc1'])

File "C:\Users\ibrahimm\AppData\Local\Continuum\anaconda3\lib\site- 
packages\gensim\models\keyedvectors.py", line 1715, in most_similar

elif doc in self.doctags or doc < self.count:
TypeError: '<' not supported between instances of 'str' and 'int'\
@app.route('/api/compare_2', methods=['POST'])
def api_compare_2():
    data = request.get_json()
    if not 'doc1' in data or not 'doc2' in data:
        return 'ERROR'

    vec1 = d2v_model.docvecs.most_similar(data['doc1'])
    vec2 = d2v_model.docvecs.most_similar(data['doc2'])

    vec1 = gensim.matutils.full2sparse(vec1)
    vec2 = gensim.matutils.full2sparse(vec2)

    print (data)
    print (vec2)
    print (vec1)

    return jsonify(sim=gensim.matutils.cossim(vec1, vec2))


@app.route('/api/compare_all', methods=['POST'])
def api_compare_all():
    data = request.get_json()
    if not 'doc' in data:
        return 'ERROR'

    vec = d2v_model.docvecs.most_similar(data['doc'])
    res = d2v_model.docvecs.most_similar([vec], topn=5)

    return jsonify(list=res)
def load_model():
    try:
        return gensim.models.doc2vec.Doc2Vec.load("doc2vec.model2")
    except:
        print ('Model not found!')
        return None

def train_model():
    #path to the input corpus files
    data="data"

    #tagging the text files
    class DocIterator(object):
        def __init__(self, doc_list, labels_list):
            self.labels_list = labels_list
            self.doc_list = doc_list

        def __iter__(self):
            for idx, doc in enumerate(self.doc_list):
                yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])

    docLabels = [f for f in listdir(data) if f.endswith('.txt')]
    print(docLabels)
    data = []
    for doc in docLabels:
        data.append(open(r'C:\Users\ibrahimm\Desktop\doc2vec-compare-doc-demo\data\\' + doc, 
    encoding='cp437').read())

    tokenizer = RegexpTokenizer(r'\w+')
    stopword_set = set(stopwords.words('english'))

    #This function does all cleaning of data using two objects above

def nlp_clean(data):
    new_data = []
    for d in data:
        new_str = d.lower()
        dlist = tokenizer.tokenize(new_str)
        dlist = list(set(dlist).difference(stopword_set))
        new_data.append(dlist)
        return new_data

        data = nlp_clean(data)
        it = DocIterator(data, docLabels)


    #train doc2vec model
    model = gensim.models.Doc2Vec(size=300, window=15, min_count=4, workers=10,alpha=0.025, min_alpha=0.025, iter=20) # use fixed learning rate
    model.build_vocab(it)
    model.train(it, epochs=model.iter, total_examples=model.corpus_count)


    model.save("doc2vec.model2")