Python 3.x: why spaCy forgets old training data, and how to fix it


I am trying to train a spaCy model for NER. I trained a base model, call it current_model, on a dataset of 2,940 rows. I then received 10 more datasets, each with between 200 and 530 rows, so I loaded my current model with spacy.load("current_model") and trained it further on each of these datasets in turn. When I predict entities on test data, the model recognizes the entities from the new datasets but seems to have forgotten the ones from the old dataset. I trained incrementally like this to reduce training time. Please see my code below for what I tried.

Base model training code

import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse  # needed for GoldParse below (spaCy 2.x location)
import random
import time  # needed for the timing calls below
from pathlib import Path
from spacy import displacy
import re
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from cytoolz import partition_all
import os
from os import path
import shutil
import json


df = pd.read_csv("new_annotations/dataset_transfer_learning1.csv")

def populate_train_data(df):
    """Convert annotated HTML-style rows into spaCy's (text, {"entities": [...]}) format."""
    train_data = []
    i = 0
    for d_index, row in df.iterrows():
        print(row["annotations"])
        content = row["annotations"].replace("\\n", "\n").replace("\n", " ")
        content = re.sub(r"(?<=[:])(?=[^\s])", r" ", content)

        # Find tags and entities and store the character offsets in an entity list
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text()
        entities = []
        for tag in soup.find_all():
            if tag.string is None:
                # fail silently for an invalid tag
                print(f'Tagging is invalid: {row["_id"], tag.name}, on row {i+2}, skipping..')
                continue

            tag_index = content.split(str(tag))[0].count(tag.string)
            try:
                for index, match in enumerate(re.finditer(tag.string.replace("*", " "), text)):
                    if index == tag_index:
                        entities.append((match.start(), match.end(), tag.name))
            except Exception as e:
                print(e, f"at line no {i+2}")
                continue
        i += 1
        if entities:
            train_data.append((text, {"entities": entities}))
    return train_data


def train(training_data, old_training_data=None, model_name=None):
    pretrained_weights = Path('weights/model999.bin')
    if model_name is not None:
        # note: spacy.load() takes no `weights` argument; the pretrained weights
        # are only used by the commented-out tok2vec loading further down
        nlp = spacy.load(model_name)
    else:
        print("no model specified, using default model")
        nlp = spacy.load("en_core_web_sm")

    if "ner" not in nlp.pipe_names:
        print("there is no ner, creating ner")
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        print("there is ner")
        ner = nlp.get_pipe("ner")

    for _, annotations in training_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    start_time = time.time()
    if model_name is not None:
        # nlp.resume_training()
        # TRAINING_DATA = populate_train_data(pd.read_csv(old_training_data))
        TRAINING_DATA = old_training_data

        # "revision" data: re-annotate the old texts with the current model so the
        # old entities are rehearsed alongside the new annotations
        revision_data = []
        for doc in nlp.pipe(list(zip(*TRAINING_DATA))[0]):
            tags = [w.tag_ for w in doc]     # unused, kept from the original
            heads = [w.head.i for w in doc]  # unused, kept from the original
            deps = [w.dep_ for w in doc]     # unused, kept from the original
            entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
            revision_data.append((doc, GoldParse(doc, entities=entities)))

        fine_tune_data = []
        for raw_text, entity_offsets in training_data:
            doc = nlp.make_doc(raw_text)
            try:
                gold = GoldParse(doc, entities=entity_offsets['entities'])
            except ValueError:
                continue  # skip misaligned examples instead of appending a stale gold
            fine_tune_data.append((doc, gold))

        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        optimizer = nlp.entity.create_optimizer()
        with nlp.disable_pipes(*other_pipes):
            # pretrained_weights = Path('weights/model999.bin')
            # with pretrained_weights.open("rb") as file_:
            #     ner.model.tok2vec.from_bytes(file_.read())
            for i in range(20):
                example_data = revision_data + fine_tune_data
                # example_data = training_data
                losses = {}
                random.shuffle(example_data)
                for batch in partition_all(2, example_data):
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(docs, golds, sgd=optimizer, losses=losses)
                    except ValueError:
                        pass
                # print(losses)
    else:
        for i in range(20):
            random.shuffle(training_data)
            correct = 1
            for text, annotations in training_data:
                try:
                    nlp.update([text], [annotations])
                    print(correct)
                    correct += 1
                except ValueError:
                    pass
                    # print("skipping..")
            no_of_stars = i
            print("*" * no_of_stars)

    end_time = time.time()
    print("this code took {}".format(end_time - start_time))
    return nlp


def save_to_directory(nlp, directory_names):
    for directory in directory_names:
        if directory is not None:
            directory_full_path = Path(directory + "_" + datetime.today().strftime('%Y_%m_%d'))
            if path.exists(directory_full_path):
                shutil.rmtree(directory_full_path)
                print("folder already existed so removed")
            if not directory_full_path.exists():
                directory_full_path.mkdir()
            nlp.to_disk(directory_full_path)
            print("Saved model to output directory", directory)


if __name__ == "__main__":
    training_data = populate_train_data(df)
    # training_data = [
    #     ("I Like Today and Evening", {"entities": [(7, 12, "DAY"), (17, 24, "DAY")]}),
    #     ("Today is my lucky day", {"entities": [(1, 5, "DAY")]}),
    #     ("Yesterday and Today are two same days of a month", {"entities": [(14, 19, "DAY")]}),
    #     ("May Today is Best Day", {"entities": [(4, 9, "DAY")]}),
    #     ("Have a Nice Today and Every Day", {"entities": [(12, 17, "DAY")]}),
    #     ("Hey How are feeling Today", {"entities": [(20, 25, "DAY")]}),
    # ]
    # print(training_data)
    nlp = train(training_data)
    save_to_directory(nlp, ["trained_model_with_transfer_learning"])

# to do: train using batches
# add drop rate
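For reference, the two TODOs at the end of the script would, under spaCy 2.x, amount to roughly the sketch below for the inner training loop. It uses the minibatch and compounding helpers already imported at the top; the batch-size schedule and drop rate are illustrative values, not taken from the original post.

for i in range(20):
    example_data = revision_data + fine_tune_data
    random.shuffle(example_data)
    losses = {}
    # growing minibatches instead of fixed pairs of examples, plus dropout
    for batch in minibatch(example_data, size=compounding(4.0, 32.0, 1.001)):
        docs, golds = zip(*batch)
        nlp.update(docs, golds, sgd=optimizer, drop=0.35, losses=losses)
    print(losses)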



As you can see, I even tried loading the old dataset annotations alongside the new ones, but I still have this problem.

I know others have faced this issue; if anyone has solved it, please help me.

Thanks in advance for your help, friends.
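One way to make the forgetting measurable, rather than relying on spot checks of predictions, is to score the model on held-out examples from each old dataset after every round of training. A minimal sketch using spaCy 2.x's Scorer; the evaluate_ner helper and the old_held_out variable are assumptions for illustration, not part of the original code:

from spacy.scorer import Scorer
from spacy.gold import GoldParse

def evaluate_ner(nlp, examples):
    # examples: list of (text, {"entities": [(start, end, label), ...]})
    scorer = Scorer()
    for text, annotations in examples:
        gold = GoldParse(nlp.make_doc(text), entities=annotations["entities"])
        scorer.score(nlp(text), gold)
    return scorer.scores  # includes ents_p, ents_r, ents_f

# e.g. after training on each new dataset:
# print(evaluate_ner(nlp, old_held_out)["ents_f"])

If ents_f on the old held-out set drops sharply after each new dataset, that confirms catastrophic forgetting rather than an annotation problem.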


Are you training a brand-new model, or adding on top of an existing spaCy model? If you are doing the latter, the network's previously learned weights and features get overwritten and misaligned, which costs you accuracy. I am speaking from my own experience of trying to train spaCy on Korean and Japanese person names it could not recognize out of the box. You could also try FastText, Flair, and Polyglot to see whether one of them serves your purpose. Try all of these tools and you should get good output; that is the solution I ended up using.
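If you stay with spaCy, the usual mitigations for this kind of catastrophic forgetting are to resume training instead of reinitializing, keep dropout on, and mix old annotated examples into every epoch. A minimal sketch, assuming spaCy 2.1+ (where nlp.resume_training() returns an optimizer without resetting the learned weights); old_data and new_data stand for lists of (text, {"entities": [...]}) pairs and are illustrative names:

import random
import spacy
from spacy.util import minibatch, compounding

nlp = spacy.load("current_model")
optimizer = nlp.resume_training()      # keep the existing weights

examples = old_data + new_data         # rehearse the old annotations every epoch
other_pipes = [p for p in nlp.pipe_names if p != "ner"]
with nlp.disable_pipes(*other_pipes):
    for epoch in range(20):
        random.shuffle(examples)
        losses = {}
        for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.001)):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print(epoch, losses)

This trades away some of the training-time savings, but rehearsing the old data is what keeps the old entities from being overwritten.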

Hi @Syenix, thanks for your time. First I load spaCy's "en_core_web_sm" model, and then I train on top of that model, which I cannot edit. This is an already-reported spaCy bug on GitHub :)