Python 为实体使用多个令牌时Spacy自定义NER模型训练错误
我正在训练Spacy自定义NER(命名实体识别)模型。我遵循了链接中的步骤。根据提供的示例,每个单词都有一个实体标签。当每个实体标签只对应一个单词时,我成功地完成了训练;但在我的场景中,我需要用多个单词甚至一个句子来训练实体,有时可以是7个或更多单词,例如:“部分损失30桶”或“总损失=50桶”……等等。但是,当我这样做时,代码出现以下错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-45-a9c9ec92bab3> in <module>
110 n_iter= 30 #("Number of training iterations", "option", "n", int))
111
--> 112 train_test(model, new_model_name, output_dir, n_iter)
<ipython-input-45-a9c9ec92bab3> in train_test(model, new_model_name, output_dir, n_iter)
74 texts, annotations = zip(*batch)
75 nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
---> 76 losses=losses)
77 print('Losses', losses)
78
~\AppData\Local\Continuum\anaconda3\lib\site-packages\spacy\language.py in update(self, docs, golds, drop, sgd, losses, component_cfg)
513 kwargs = component_cfg.get(name, {})
514 kwargs.setdefault("drop", drop)
--> 515 proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs)
516 for key, (W, dW) in grads.items():
517 sgd(W, dW, key=key)
nn_parser.pyx in spacy.syntax.nn_parser.Parser.update()
nn_parser.pyx in spacy.syntax.nn_parser.Parser._init_gold_batch()
ner.pyx in spacy.syntax.ner.BiluoPushDown.preprocess_gold()
ner.pyx in spacy.syntax.ner.BiluoPushDown.has_gold()
TypeError: object of type 'NoneType' has no len()
感谢您的帮助,因为我在这一点上陷入困境,也无法在互联网上找到解决方案。我认为问题与数据(json文件)有关。因此,我在遍历注释的循环(即 for ent in annotations.get('entities'))内添加了一个条件:当实体值为 None 或对象长度不大于 0 时打印该行。它确实打印出了很多这样的记录,但我仍然得到这个错误。所以我不确定怎样才能解决这个错误。有什么想法吗?
#!/usr/bin/env python
# coding: utf8
# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
# New entity labels
# Specify the new entity labels which you want to add here
LABELSS = ["MY_CUSTOM_ENTITY", "U-Tag"]
# Loading training data.
# BUG FIX: the original literal contained "NER\\\ner_corpus..." — the third
# backslash pairs with the following 'n' to form the escape "\n", embedding a
# literal newline in the middle of the path.  A raw string keeps every
# backslash literal so the intended file is opened.
with open(r'C:\Users\NER\ner_corpus_spacy_format_data.json', 'rb') as fp:
    # NOTE(review): unpickling is only safe on trusted files — this corpus is
    # assumed to be locally produced; verify before loading third-party data.
    TRAIN_DATA = pickle.load(fp)
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def train_test(model=None, new_model_name='AnyName43', output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entities.

    Reads the module-level ``TRAIN_DATA`` list of ``(text, annotations)``
    pairs, trains (or fine-tunes) a spaCy v2 NER component on it, runs a
    quick smoke test, and optionally saves the model.

    Parameters
    ----------
    model : str or None
        Name of an existing spaCy model to fine-tune; a blank 'en'
        pipeline is created when None.
    new_model_name : str
        Name written into the saved model's meta.
    output_dir : str or Path, optional
        Directory the trained model is saved to; skipped when None.
    n_iter : int
        Number of training iterations over the (shuffled) data.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add the NER pipe if it is missing, otherwise reuse the existing one.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')
    # BUG FIX: annotations.get('entities') can be None for some records;
    # feeding those to nlp.update() is what raises
    # "TypeError: object of type 'NoneType' has no len()" inside spaCy's
    # gold preprocessing.  Keep only records that actually carry entities,
    # and work on a local copy so the global TRAIN_DATA is never mutated
    # by random.shuffle below.
    train_data = [
        (text, annotations)
        for text, annotations in TRAIN_DATA
        if annotations.get('entities')
    ]
    # Register every label observed in the (filtered) training data.
    for _, annotations in train_data:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Fine-tuning an existing model: fresh optimizer for the entity pipe.
        optimizer = nlp.entity.create_optimizer()
    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # Batch size grows from 4 to 32 by a factor of 1.001 per batch.
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)
    # Test the trained model
    test_text = 'This is the text that has the instance of my custom entity. I am not using actual data since it is confidential, it can be something like: Had total loss of 60 BBLs or total losses = 85 BBLs. I have dataframe which consists of thousands of records.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # Test the saved model by reloading it from disk.
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
if __name__ == '__main__':
    # plac command-line parsing is bypassed; parameters are passed directly.
    # (To restore CLI parsing, call plac.call(train_test) instead.)
    model = None  # e.g. "en_core_web_sm"; None -> blank 'en' model
    new_model_name = "MyModelName"
    output_dir = 'C:\\Users\\NER\\TRAIN_TEST_OUTPUT'
    n_iter = 30
    train_test(model, new_model_name, output_dir, n_iter)