Python 为实体使用多个令牌时Spacy自定义NER模型训练错误
我正在训练Spacy自定义NER(命名实体识别)模型。我遵循了链接中的步骤。根据提供的示例,每个单词都有一个实体标签。当每个实体标签只对应一个单词时,我成功地完成了训练;但在我的场景中,我需要用多个单词甚至一个句子来训练实体,有时可以是7个或更多单词,例如:“部分损失30桶”或“总损失=50桶”……等等。但是,当我这样做时,代码出现以下错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-45-a9c9ec92bab3> in <module>
110 n_iter= 30 #("Number of training iterations", "option", "n", int))
111
--> 112 train_test(model, new_model_name, output_dir, n_iter)
<ipython-input-45-a9c9ec92bab3> in train_test(model, new_model_name, output_dir, n_iter)
74 texts, annotations = zip(*batch)
75 nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
---> 76 losses=losses)
77 print('Losses', losses)
78
~\AppData\Local\Continuum\anaconda3\lib\site-packages\spacy\language.py in update(self, docs, golds, drop, sgd, losses, component_cfg)
513 kwargs = component_cfg.get(name, {})
514 kwargs.setdefault("drop", drop)
--> 515 proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs)
516 for key, (W, dW) in grads.items():
517 sgd(W, dW, key=key)
nn_parser.pyx in spacy.syntax.nn_parser.Parser.update()
nn_parser.pyx in spacy.syntax.nn_parser.Parser._init_gold_batch()
ner.pyx in spacy.syntax.ner.BiluoPushDown.preprocess_gold()
ner.pyx in spacy.syntax.ner.BiluoPushDown.has_gold()
TypeError: object of type 'NoneType' has no len()
感谢您的帮助,因为我在这一点上陷入困境,也无法在互联网上找到解决方案。我认为问题与数据(json文件)有关。因此,我在遍历注释的循环(即 for ent in annotations.get('entities'))内添加了一个条件:当实体值为 None 或对象长度不大于 0 时打印该行。它确实打印出了很多这样的记录,但我仍然得到这个错误。所以我不确定怎样才能解决这个错误。有什么想法吗?
#!/usr/bin/env python
# coding: utf8
# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
# New entity labels
# Specify the new entity labels which you want to add here
LABELSS = ["MY_CUSTOM_ENTITY", "U-Tag"]
# Loading training data.
# BUG FIX: the original literal contained "NER\\\ner_corpus..." — the third
# backslash pairs with the following 'n' to form the escape "\n", embedding a
# literal newline in the middle of the path.  A raw string keeps every
# backslash literal so the intended file is opened.
with open(r'C:\Users\NER\ner_corpus_spacy_format_data.json', 'rb') as fp:
    # NOTE(review): unpickling is only safe on trusted files — this corpus is
    # assumed to be locally produced; verify before loading third-party data.
    TRAIN_DATA = pickle.load(fp)
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def train_test(model=None, new_model_name='AnyName43', output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entities.

    Reads the module-level ``TRAIN_DATA`` list of ``(text, annotations)``
    pairs, trains (or fine-tunes) a spaCy v2 NER component on it, runs a
    quick smoke test, and optionally saves the model.

    Parameters
    ----------
    model : str or None
        Name of an existing spaCy model to fine-tune; a blank 'en'
        pipeline is created when None.
    new_model_name : str
        Name written into the saved model's meta.
    output_dir : str or Path, optional
        Directory the trained model is saved to; skipped when None.
    n_iter : int
        Number of training iterations over the (shuffled) data.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add the NER pipe if it is missing, otherwise reuse the existing one.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')
    # BUG FIX: annotations.get('entities') can be None for some records;
    # feeding those to nlp.update() is what raises
    # "TypeError: object of type 'NoneType' has no len()" inside spaCy's
    # gold preprocessing.  Keep only records that actually carry entities,
    # and work on a local copy so the global TRAIN_DATA is never mutated
    # by random.shuffle below.
    train_data = [
        (text, annotations)
        for text, annotations in TRAIN_DATA
        if annotations.get('entities')
    ]
    # Register every label observed in the (filtered) training data.
    for _, annotations in train_data:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Fine-tuning an existing model: fresh optimizer for the entity pipe.
        optimizer = nlp.entity.create_optimizer()
    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # Batch size grows from 4 to 32 by a factor of 1.001 per batch.
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)
    # Test the trained model
    test_text = 'This is the text that has the instance of my custom entity. I am not using actual data since it is confidential, it can be something like: Had total loss of 60 BBLs or total losses = 85 BBLs. I have dataframe which consists of thousands of records.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # Test the saved model by reloading it from disk.
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
if __name__ == '__main__':
    # plac command-line parsing is bypassed; parameters are passed directly.
    # (To restore CLI parsing, call plac.call(train_test) instead.)
    model = None  # e.g. "en_core_web_sm"; None -> blank 'en' model
    new_model_name = "MyModelName"
    output_dir = 'C:\\Users\\NER\\TRAIN_TEST_OUTPUT'
    n_iter = 30
    train_test(model, new_model_name, output_dir, n_iter)