Python:如何使用 Stanza 获取动词的不定式形式?
如何用 Stanza 找出句子中的不定式动词?(标签:python, stanford-nlp, pos, stanza)例如:
# Question example: parse one English sentence and print, for every word,
# its id, text, POS tag, head id, head text and dependency relation.
# NOTE(review): `stanza` must already be imported; the import is not part of
# this snippet.
doc = "I need you to find the verbes in this sentence"
en_nlp = stanza.Pipeline(
    'en',
    processors='tokenize,lemma,mwt,pos,depparse',
    verbose=False,
    use_gpu=False,
)
processed = en_nlp(doc)

# One line per word; a head id of 0 is reported as 'root', otherwise the head
# text is looked up by its 1-based id within the same sentence.
report = '\n'.join(
    f"id: {word.id}\t word: {word.text}\t POS: {word.pos}\t head id: {word.head}\t head: {sent.words[word.head-1].text if word.head > 0 else 'root'} \t deprel: {word.deprel}"
    for sent in processed.sentences
    for word in sent.words
)
print(report)
输出:
id: 1 word: I POS: PRON head id: 2 head: need deprel: nsubj
id: 2 word: need POS: VERB head id: 0 head: root deprel: root
id: 3 word: you POS: PRON head id: 2 head: need deprel: obj
id: 4 word: to POS: PART head id: 5 head: find deprel: mark
id: 5 word: find POS: VERB head id: 2 head: need deprel: xcomp
id: 6 word: the POS: DET head id: 7 head: verbes deprel: det
id: 7 word: verbes POS: NOUN head id: 5 head: find deprel: obj
id: 8 word: in POS: ADP head id: 10 head: sentence deprel: case
id: 9 word: this POS: DET head id: 10 head: sentence deprel: det
id: 10 word: sentence POS: NOUN head id: 5 head: find deprel: obl
但是,对于这一行:
id: 5 word: find POS: VERB head id: 2 head: need deprel: xcomp
我需要它能标明这是一个不定式动词。

回答:我也遇到了同样的问题,又不想破坏分词器(tokenizer),最后选择修改 Stanza 解析出的句子。单词的 feats 字段会标明动词形式(如下面 id 7 中的 VerbForm=Inf),不过我还没有测试过它的可靠性。
# Example sentence containing the infinitive "to knock".
test_resp = "He was a little scared to knock on the door"
# NOTE(review): `nlp` is not defined in this snippet; presumably a stanza
# Pipeline built like `en_nlp` in the question -- confirm.
res = nlp(test_resp)
# Inspect the words with ids 5-8 of the first sentence (list indexes 4-7).
res.sentences[0].words[4:8]
输出如下:
[{
"id": 5,
"text": "scared",
"lemma": "scared",
"upos": "ADJ",
"xpos": "JJ",
"feats": "Degree=Pos",
"head": 0,
"deprel": "root",
"misc": "start_char=16|end_char=22"
},
{
"id": 6,
"text": "to",
"lemma": "to",
"upos": "PART",
"xpos": "TO",
"head": 7,
"deprel": "mark",
"misc": "start_char=23|end_char=25"
},
{
"id": 7,
"text": "knock",
"lemma": "knock",
"upos": "VERB",
"xpos": "VB",
"feats": "VerbForm=Inf",
"head": 5,
"deprel": "advcl",
"misc": "start_char=26|end_char=31"
},
{
"id": 8,
"text": "on",
"lemma": "on",
"upos": "ADP",
"xpos": "IN",
"head": 10,
"deprel": "case",
"misc": "start_char=32|end_char=34"
}]
就我的用途而言,更有用的做法是把字符串“to VERB”视为单个词项:将 word.text 更新为“to_VERB”,并相应更新动词的字符跨度。这样动词的 word.lemma 和 word.upos 保持不变,但需要把该动词及其后所有单词的 id 和 head 索引减 1,以反映“to”已被删除。
这里使用 deepcopy 只是为了在演示中保留原始解析结果;如果可以,最好避免使用它。
import re
import sys
from copy import deepcopy
def patch_inf_verb(processed):
    """Return a copy of a parse in which each "to VERB" pair is one word.

    A word is merged with its predecessor when the word's ``pos`` is
    ``"VERB"``, the predecessor's text is "to" (case-insensitive) and the
    predecessor's ``head`` equals the verb's ``id``.  For each such pair:

    * the verb's ``text`` becomes ``"to_" + text``;
    * the verb's ``misc`` span string is rewritten so that ``start_char``
      also covers the "to" (``end_char`` is kept);
    * every ``id`` and ``head`` greater than the id of "to" is decremented;
    * the "to" word is removed from the sentence's ``words`` list.

    Only the ``words`` lists are patched; nothing else in the copy is
    touched by this function.

    Args:
        processed: Parsed document exposing ``sentences``, each with a
            ``words`` list whose items have ``id``, ``text``, ``pos``,
            ``head`` and ``misc`` attributes (e.g. the result of calling a
            stanza pipeline).

    Returns:
        A deep copy of ``processed`` with the merges applied; the argument
        itself is not modified.

    Raises:
        ValueError: If a verb that should be merged has a ``misc`` value
            that does not start with ``start_char=<n>|end_char=<n>``.
    """
    # Work on a copy so the caller's parse stays intact.
    results = deepcopy(processed)
    # Captures the text and numerals of a span string,
    # e.g. 'start_char=11|end_char=13'.  Raw string: '\d' and '\|' are regex
    # escapes, not Python string escapes.
    misc_vals_re = re.compile(r"(start_char=)(\d+)(\|end_char=)(?P<end>\d+)")
    for result in results.sentences:
        words = result.words
        # Manual index instead of enumerate(): the list shrinks on each merge.
        wdx = 1
        while wdx < len(words):
            word = words[wdx]
            one_back = words[wdx - 1]
            is_to_infinitive = (
                word.pos == "VERB"
                and one_back.text.lower() == "to"
                and one_back.head == word.id
            )
            if not is_to_infinitive:
                wdx += 1
                continue

            verb_span = misc_vals_re.match(word.misc or "")
            if verb_span is None:
                raise ValueError(
                    f"word {word.id} ({word.text!r}) has no character span "
                    f"in misc: {word.misc!r}"
                )
            # Prefer the real start of "to"; fall back to assuming "to" plus
            # one space (3 characters) when "to" carries no span of its own.
            to_span = misc_vals_re.match(one_back.misc or "")
            if to_span is not None:
                start_char = int(to_span.group(2))
            else:
                start_char = int(verb_span.group(2)) - 3

            word.text = "to_" + word.text
            # word.upos = "VERB_INF"  # update upos tag or leave as is
            word.misc = (
                f"{verb_span.group(1)}{start_char}"
                f"{verb_span.group(3)}{int(verb_span.group('end'))}"
            )

            # Close the gap left by "to": shift every later id and every head
            # that points past it.  Heads equal to the id of "to" keep their
            # value and therefore point at the merged verb afterwards.  The
            # character spans of the other words don't change.
            removed_id = one_back.id
            for other in words:
                if other.id > removed_id:
                    other.id -= 1
                if other.head > removed_id:
                    other.head -= 1
            # Drop "to"; the word following the verb now sits at index wdx,
            # so the index is deliberately not advanced here.
            del words[wdx - 1]
    return results
def format_results(results):
    """Render a parse as text: one line per word, fields separated by tabs.

    Each word contributes the ``key: value`` pairs of its ``to_dict()``
    output, with keys left-aligned to a width of five characters; the
    "lemma" and "feats" entries are left out.  Words of all sentences are
    listed in order, one per line.

    Args:
        results: Parsed document exposing ``sentences``, each with a
            ``words`` list whose items provide ``to_dict()``.

    Returns:
        The table as a single newline-joined string.
    """
    skipped_keys = ["lemma", "feats"]
    lines = []
    for sent in results.sentences:
        for word in sent.words:
            cells = []
            for key, val in word.to_dict().items():
                if key in skipped_keys:
                    continue
                cells.append(f"{key:5s}: {val}")
            lines.append("\t".join(cells))
    return '\n'.join(lines)
我也遇到了同样的问题,又不想破坏分词器(tokenizer),最后选择修改 Stanza 解析出的句子。单词的 feats 字段会标明动词形式(如下面 id 7 中的 VerbForm=Inf),不过我还没有测试过它的可靠性。
# Example sentence containing the infinitive "to knock".
test_resp = "He was a little scared to knock on the door"
# NOTE(review): `nlp` is not defined in this snippet; presumably a stanza
# Pipeline built like `en_nlp` in the question -- confirm.
res = nlp(test_resp)
# Inspect the words with ids 5-8 of the first sentence (list indexes 4-7).
res.sentences[0].words[4:8]
输出如下:
[{
"id": 5,
"text": "scared",
"lemma": "scared",
"upos": "ADJ",
"xpos": "JJ",
"feats": "Degree=Pos",
"head": 0,
"deprel": "root",
"misc": "start_char=16|end_char=22"
},
{
"id": 6,
"text": "to",
"lemma": "to",
"upos": "PART",
"xpos": "TO",
"head": 7,
"deprel": "mark",
"misc": "start_char=23|end_char=25"
},
{
"id": 7,
"text": "knock",
"lemma": "knock",
"upos": "VERB",
"xpos": "VB",
"feats": "VerbForm=Inf",
"head": 5,
"deprel": "advcl",
"misc": "start_char=26|end_char=31"
},
{
"id": 8,
"text": "on",
"lemma": "on",
"upos": "ADP",
"xpos": "IN",
"head": 10,
"deprel": "case",
"misc": "start_char=32|end_char=34"
}]
就我的用途而言,更有用的做法是把字符串“to VERB”视为单个词项:将 word.text 更新为“to_VERB”,并相应更新动词的字符跨度。这样动词的 word.lemma 和 word.upos 保持不变,但需要把该动词及其后所有单词的 id 和 head 索引减 1,以反映“to”已被删除。
这里使用 deepcopy 只是为了在演示中保留原始解析结果;如果可以,最好避免使用它。
import re
import sys
from copy import deepcopy
def patch_inf_verb(processed):
    """Return a copy of a parse in which each "to VERB" pair is one word.

    A word is merged with its predecessor when the word's ``pos`` is
    ``"VERB"``, the predecessor's text is "to" (case-insensitive) and the
    predecessor's ``head`` equals the verb's ``id``.  For each such pair:

    * the verb's ``text`` becomes ``"to_" + text``;
    * the verb's ``misc`` span string is rewritten so that ``start_char``
      also covers the "to" (``end_char`` is kept);
    * every ``id`` and ``head`` greater than the id of "to" is decremented;
    * the "to" word is removed from the sentence's ``words`` list.

    Only the ``words`` lists are patched; nothing else in the copy is
    touched by this function.

    Args:
        processed: Parsed document exposing ``sentences``, each with a
            ``words`` list whose items have ``id``, ``text``, ``pos``,
            ``head`` and ``misc`` attributes (e.g. the result of calling a
            stanza pipeline).

    Returns:
        A deep copy of ``processed`` with the merges applied; the argument
        itself is not modified.

    Raises:
        ValueError: If a verb that should be merged has a ``misc`` value
            that does not start with ``start_char=<n>|end_char=<n>``.
    """
    # Work on a copy so the caller's parse stays intact.
    results = deepcopy(processed)
    # Captures the text and numerals of a span string,
    # e.g. 'start_char=11|end_char=13'.  Raw string: '\d' and '\|' are regex
    # escapes, not Python string escapes.
    misc_vals_re = re.compile(r"(start_char=)(\d+)(\|end_char=)(?P<end>\d+)")
    for result in results.sentences:
        words = result.words
        # Manual index instead of enumerate(): the list shrinks on each merge.
        wdx = 1
        while wdx < len(words):
            word = words[wdx]
            one_back = words[wdx - 1]
            is_to_infinitive = (
                word.pos == "VERB"
                and one_back.text.lower() == "to"
                and one_back.head == word.id
            )
            if not is_to_infinitive:
                wdx += 1
                continue

            verb_span = misc_vals_re.match(word.misc or "")
            if verb_span is None:
                raise ValueError(
                    f"word {word.id} ({word.text!r}) has no character span "
                    f"in misc: {word.misc!r}"
                )
            # Prefer the real start of "to"; fall back to assuming "to" plus
            # one space (3 characters) when "to" carries no span of its own.
            to_span = misc_vals_re.match(one_back.misc or "")
            if to_span is not None:
                start_char = int(to_span.group(2))
            else:
                start_char = int(verb_span.group(2)) - 3

            word.text = "to_" + word.text
            # word.upos = "VERB_INF"  # update upos tag or leave as is
            word.misc = (
                f"{verb_span.group(1)}{start_char}"
                f"{verb_span.group(3)}{int(verb_span.group('end'))}"
            )

            # Close the gap left by "to": shift every later id and every head
            # that points past it.  Heads equal to the id of "to" keep their
            # value and therefore point at the merged verb afterwards.  The
            # character spans of the other words don't change.
            removed_id = one_back.id
            for other in words:
                if other.id > removed_id:
                    other.id -= 1
                if other.head > removed_id:
                    other.head -= 1
            # Drop "to"; the word following the verb now sits at index wdx,
            # so the index is deliberately not advanced here.
            del words[wdx - 1]
    return results
def format_results(results):
    """Render a parse as text: one line per word, fields separated by tabs.

    Each word contributes the ``key: value`` pairs of its ``to_dict()``
    output, with keys left-aligned to a width of five characters; the
    "lemma" and "feats" entries are left out.  Words of all sentences are
    listed in order, one per line.

    Args:
        results: Parsed document exposing ``sentences``, each with a
            ``words`` list whose items provide ``to_dict()``.

    Returns:
        The table as a single newline-joined string.
    """
    skipped_keys = ["lemma", "feats"]
    lines = []
    for sent in results.sentences:
        for word in sent.words:
            cells = []
            for key, val in word.to_dict().items():
                if key in skipped_keys:
                    continue
                cells.append(f"{key:5s}: {val}")
            lines.append("\t".join(cells))
    return '\n'.join(lines)