Python 3.x 如何合并多字标签?
我目前正在使用 allennlp 进行 NER 标记。代码:是否有任何解析器可以合并下面的输出,使其返回“Top Gun”并将其标记为“WORK_OF_ART”?(标签:python-3.x, ner, natural-language-processing, allennlp)
您可以把下面代码中的模型路径换成您自己的模型路径再试。
from allennlp.predictors.predictor import Predictor

# Load the tagger archive; replace the URL with your own model path if needed.
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz") # change model path
sentence = "Did Uriah honestly think he could beat The Legend of Zelda in under three hours?"
result = predictor.predict(sentence)

# Maps each merged multi-token entity to its bare type (e.g. "MISC") and every
# other token to its raw tag (e.g. "O", "U-PER").
lang = {}
completeWord = ""
for word, tag in zip(result["words"], result["tags"]):
    if tag.startswith(("B", "I")):
        # First ("B") and inner ("I") tokens only extend the pending entity.
        completeWord = completeWord + " " + word
    elif tag.startswith("L"):
        # Last token: finish the entity and store it under its type.
        completeWord = completeWord + " " + word
        # NOTE: every token is prefixed with " ", so the key starts with a
        # space (' The Legend of Zelda'); call .strip() if that is unwanted.
        lang[completeWord] = tag.split("-")[1]
        completeWord = ""
    else:
        # "O" and single-token ("U-...") tags are stored unchanged.
        lang[word] = tag
print(lang)
>>>{' The Legend of Zelda': 'MISC',
'?': 'O',
'Did': 'O',
'Uriah': 'U-PER',
'beat': 'O',
'could': 'O',
'he': 'O',
'honestly': 'O',
'hours': 'O',
'in': 'O',
'think': 'O',
'three': 'O',
'under': 'O'}
如果有用,请将其标记为已接受
import allennlp
from allennlp.predictors.predictor import Predictor
# Load an NER tagging archive. The code below reads a top-level "tags" list
# with B-/I-/L-/U- prefixed entity tags, which is what a sentence tagger
# returns; a semantic-role-labeling archive (e.g. "bert-base-srl") does not fit.
# NOTE(review): confirm this archive matches your installed allennlp version.
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")
# Sample text used by the examples that follow.
document = """The U.S. is a country of 50 states covering a vast swath of North America, with Alaska in the northwest and Hawaii extending the nation’s presence into the Pacific Ocean. Major Atlantic Coast cities are New York, a global finance and culture center, and capital Washington, DC. Midwestern metropolis Chicago is known for influential architecture and on the west coast, Los Angeles' Hollywood is famed for filmmaking"""
####### Convert Entities ##########
def convert_results(allen_results):
    """Merge per-token BILOU tags into whole entities.

    Args:
        allen_results: Mapping with parallel sequences under ``"words"`` and
            ``"tags"``. Tags are ``"O"`` or ``"<position>-<type>"`` where
            position is ``B`` (begin), ``I`` (inside), ``L`` (last) or
            ``U`` (single-token entity).

    Returns:
        A set of ``(entity_text, entity_type)`` tuples. Multi-token entities
        are joined with single spaces and are emitted only once their ``L``
        token is seen, so prefixes such as ``"Major"`` or ``"Major Atlantic"``
        are not reported for ``"Major Atlantic Coast"``.
    """
    ents = set()
    span = []  # tokens of the multi-token entity currently being assembled
    for word, tag in zip(allen_results["words"], allen_results["tags"]):
        if tag == "O":
            # A non-entity token abandons any unfinished span.
            span = []
            continue
        # Split on the first hyphen only so hyphenated type names stay intact.
        ent_position, _, ent_type = tag.partition("-")
        if ent_position == "U":
            ents.add((word, ent_type))
            span = []
        elif ent_position == "B":
            span = [word]
        elif ent_position == "I":
            # Also tolerates a stray "I" with no preceding "B".
            span.append(word)
        elif ent_position == "L":
            span.append(word)
            ents.add((" ".join(span), ent_type))
            span = []
    return ents
def allennlp_ner(document):
    """Tag *document* with the module-level ``predictor`` and return the
    result of passing that prediction through ``convert_results``."""
    prediction = predictor.predict(sentence=document)
    return convert_results(prediction)
# Run the predictor on the sample document and pair every token with its tag.
results = predictor.predict(sentence=document)
list(zip(results["words"], results["tags"]))
##Output##
[('The', 'O'),
('U.S.', 'U-LOC'),
('is', 'O'),
('a', 'O'),
('country', 'O'),
('of', 'O'),
('50', 'O'),
('states', 'O'),
('covering', 'O'),
('a', 'O'),
('vast', 'O'),
('swath', 'O'),
('of', 'O'),
('North', 'B-LOC'),
('America', 'L-LOC'),
(',', 'O'),
('with', 'O'),
('Alaska', 'U-LOC'),
('in', 'O'),
('the', 'O'),
('northwest', 'O'),
('and', 'O'),
('Hawaii', 'U-LOC'),
('extending', 'O'),
('the', 'O'),
('nation', 'O'),
('’s', 'O'),
('presence', 'O'),
('into', 'O'),
('the', 'O'),
('Pacific', 'B-LOC'),
('Ocean', 'L-LOC'),
('.', 'O'),
('Major', 'B-LOC'),
('Atlantic', 'I-LOC'),
('Coast', 'L-LOC'),
('cities', 'O'),
('are', 'O'),
('New', 'B-LOC'),
('York', 'L-LOC'),
(',', 'O'),
('a', 'O'),
('global', 'O'),
('finance', 'O'),
('and', 'O'),
('culture', 'O'),
('center', 'O'),
(',', 'O'),
('and', 'O'),
('capital', 'O'),
('Washington', 'U-LOC'),
(',', 'O'),
('DC', 'U-LOC'),
('.', 'O'),
('Midwestern', 'U-MISC'),
('metropolis', 'O'),
('Chicago', 'U-LOC'),
('is', 'O'),
('known', 'O'),
('for', 'O'),
('influential', 'O'),
('architecture', 'O'),
('and', 'O'),
('on', 'O'),
('the', 'O'),
('west', 'O'),
('coast', 'O'),
(',', 'O'),
('Los', 'B-LOC'),
('Angeles', 'L-LOC'),
("'", 'O'),
('Hollywood', 'U-LOC'),
('is', 'O'),
('famed', 'O'),
('for', 'O'),
('filmmaking', 'O')]
# Merge the multi-word NER tags of the sample document with convert_results
# (invoked through the allennlp_ner wrapper).
allennlp_ner(document)
# The output is printed like this:
{('Alaska', 'LOC'),
('Chicago', 'LOC'),
('DC', 'LOC'),
('Hawaii', 'LOC'),
('Hollywood', 'LOC'),
('Los', 'LOC'),
('Los Angeles', 'LOC'),
('Major', 'LOC'),
('Major Atlantic', 'LOC'),
('Major Atlantic Coast', 'LOC'),
('Midwestern', 'MISC'),
('New', 'LOC'),
('New York', 'LOC'),
('North', 'LOC'),
('North America', 'LOC'),
('Pacific', 'LOC'),
('Pacific Ocean', 'LOC'),
('U.S.', 'LOC'),
('Washington', 'LOC')}
我在下面给出了解决方案,请检查并告诉我结果:使用 convert_results 合并多词 NER 标记。
from allennlp.predictors.predictor import Predictor

# Load the tagger archive; replace the URL with your own model path if needed.
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz") # change model path
sentence = "Did Uriah honestly think he could beat The Legend of Zelda in under three hours?"
result = predictor.predict(sentence)

# Maps each merged multi-token entity to its bare type (e.g. "MISC") and every
# other token to its raw tag (e.g. "O", "U-PER").
lang = {}
completeWord = ""
for word, tag in zip(result["words"], result["tags"]):
    if tag.startswith(("B", "I")):
        # First ("B") and inner ("I") tokens only extend the pending entity.
        completeWord = completeWord + " " + word
    elif tag.startswith("L"):
        # Last token: finish the entity and store it under its type.
        completeWord = completeWord + " " + word
        # NOTE: every token is prefixed with " ", so the key starts with a
        # space (' The Legend of Zelda'); call .strip() if that is unwanted.
        lang[completeWord] = tag.split("-")[1]
        completeWord = ""
    else:
        # "O" and single-token ("U-...") tags are stored unchanged.
        lang[word] = tag
print(lang)
>>>{' The Legend of Zelda': 'MISC',
'?': 'O',
'Did': 'O',
'Uriah': 'U-PER',
'beat': 'O',
'could': 'O',
'he': 'O',
'honestly': 'O',
'hours': 'O',
'in': 'O',
'think': 'O',
'three': 'O',
'under': 'O'}
| Tag   | Description                             |
|-------|-----------------------------------------|
| BEGIN | The first token of a multi-token entity |
| IN    | An inner token of a multi-token entity  |
| LAST  | The final token of a multi-token entity |
| UNIT  | A single-token entity                   |
| OUT   | A non-entity token                      |
import allennlp
from allennlp.predictors.predictor import Predictor
# Load an NER tagging archive. The code below reads a top-level "tags" list
# with B-/I-/L-/U- prefixed entity tags, which is what a sentence tagger
# returns; a semantic-role-labeling archive (e.g. "bert-base-srl") does not fit.
# NOTE(review): confirm this archive matches your installed allennlp version.
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")
# Sample text used by the examples that follow.
document = """The U.S. is a country of 50 states covering a vast swath of North America, with Alaska in the northwest and Hawaii extending the nation’s presence into the Pacific Ocean. Major Atlantic Coast cities are New York, a global finance and culture center, and capital Washington, DC. Midwestern metropolis Chicago is known for influential architecture and on the west coast, Los Angeles' Hollywood is famed for filmmaking"""
####### Convert Entities ##########
def convert_results(allen_results):
    """Merge per-token BILOU tags into whole entities.

    Args:
        allen_results: Mapping with parallel sequences under ``"words"`` and
            ``"tags"``. Tags are ``"O"`` or ``"<position>-<type>"`` where
            position is ``B`` (begin), ``I`` (inside), ``L`` (last) or
            ``U`` (single-token entity).

    Returns:
        A set of ``(entity_text, entity_type)`` tuples. Multi-token entities
        are joined with single spaces and are emitted only once their ``L``
        token is seen, so prefixes such as ``"Major"`` or ``"Major Atlantic"``
        are not reported for ``"Major Atlantic Coast"``.
    """
    ents = set()
    span = []  # tokens of the multi-token entity currently being assembled
    for word, tag in zip(allen_results["words"], allen_results["tags"]):
        if tag == "O":
            # A non-entity token abandons any unfinished span.
            span = []
            continue
        # Split on the first hyphen only so hyphenated type names stay intact.
        ent_position, _, ent_type = tag.partition("-")
        if ent_position == "U":
            ents.add((word, ent_type))
            span = []
        elif ent_position == "B":
            span = [word]
        elif ent_position == "I":
            # Also tolerates a stray "I" with no preceding "B".
            span.append(word)
        elif ent_position == "L":
            span.append(word)
            ents.add((" ".join(span), ent_type))
            span = []
    return ents
def allennlp_ner(document):
    """Tag *document* with the module-level ``predictor`` and return the
    result of passing that prediction through ``convert_results``."""
    prediction = predictor.predict(sentence=document)
    return convert_results(prediction)
# Run the predictor on the sample document and pair every token with its tag.
results = predictor.predict(sentence=document)
list(zip(results["words"], results["tags"]))
##Output##
[('The', 'O'),
('U.S.', 'U-LOC'),
('is', 'O'),
('a', 'O'),
('country', 'O'),
('of', 'O'),
('50', 'O'),
('states', 'O'),
('covering', 'O'),
('a', 'O'),
('vast', 'O'),
('swath', 'O'),
('of', 'O'),
('North', 'B-LOC'),
('America', 'L-LOC'),
(',', 'O'),
('with', 'O'),
('Alaska', 'U-LOC'),
('in', 'O'),
('the', 'O'),
('northwest', 'O'),
('and', 'O'),
('Hawaii', 'U-LOC'),
('extending', 'O'),
('the', 'O'),
('nation', 'O'),
('’s', 'O'),
('presence', 'O'),
('into', 'O'),
('the', 'O'),
('Pacific', 'B-LOC'),
('Ocean', 'L-LOC'),
('.', 'O'),
('Major', 'B-LOC'),
('Atlantic', 'I-LOC'),
('Coast', 'L-LOC'),
('cities', 'O'),
('are', 'O'),
('New', 'B-LOC'),
('York', 'L-LOC'),
(',', 'O'),
('a', 'O'),
('global', 'O'),
('finance', 'O'),
('and', 'O'),
('culture', 'O'),
('center', 'O'),
(',', 'O'),
('and', 'O'),
('capital', 'O'),
('Washington', 'U-LOC'),
(',', 'O'),
('DC', 'U-LOC'),
('.', 'O'),
('Midwestern', 'U-MISC'),
('metropolis', 'O'),
('Chicago', 'U-LOC'),
('is', 'O'),
('known', 'O'),
('for', 'O'),
('influential', 'O'),
('architecture', 'O'),
('and', 'O'),
('on', 'O'),
('the', 'O'),
('west', 'O'),
('coast', 'O'),
(',', 'O'),
('Los', 'B-LOC'),
('Angeles', 'L-LOC'),
("'", 'O'),
('Hollywood', 'U-LOC'),
('is', 'O'),
('famed', 'O'),
('for', 'O'),
('filmmaking', 'O')]
# Merge the multi-word NER tags of the sample document with convert_results
# (invoked through the allennlp_ner wrapper).
allennlp_ner(document)
# The output is printed like this:
{('Alaska', 'LOC'),
('Chicago', 'LOC'),
('DC', 'LOC'),
('Hawaii', 'LOC'),
('Hollywood', 'LOC'),
('Los', 'LOC'),
('Los Angeles', 'LOC'),
('Major', 'LOC'),
('Major Atlantic', 'LOC'),
('Major Atlantic Coast', 'LOC'),
('Midwestern', 'MISC'),
('New', 'LOC'),
('New York', 'LOC'),
('North', 'LOC'),
('North America', 'LOC'),
('Pacific', 'LOC'),
('Pacific Ocean', 'LOC'),
('U.S.', 'LOC'),
('Washington', 'LOC')}