Python 如何使用 Stanza 获得动词的不定式形式？_Python_Stanford Nlp_Pos_Stanza

Python 如何使用 Stanza 获得动词的不定式形式？

python stanford-nlp

Python 如何使用“不定式”获得动词的不定式形式；诗节；？,python,stanford-nlp,pos,stanza,Python,Stanford Nlp,Pos,Stanza,如何用节找出句子中的不定式动词例如： doc = "I need you to find the verbes in this sentence" en_nlp = stanza.Pipeline('en', processors='tokenize,lemma,mwt,pos,depparse', verbose=False, use_gpu=False) processed = en_nlp(doc) print(*[f"id: {word.id}\t wo

如何用 Stanza 找出句子中的不定式动词？

例如：

doc = "I need you to find the verbes in this sentence"
en_nlp = stanza.Pipeline(
    'en',
    processors='tokenize,lemma,mwt,pos,depparse',
    verbose=False,
    use_gpu=False,
)
processed = en_nlp(doc)

# Print one row per word: its id, text, POS tag, head id, the head's text
# (or 'root' when the head id is 0) and its dependency relation.
print(
    *(
        f"id: {word.id}\t word: {word.text}\t POS: {word.pos}\t head id: {word.head}\t head: {sent.words[word.head-1].text if word.head > 0 else 'root'} \t deprel: {word.deprel}"
        for sent in processed.sentences
        for word in sent.words
    ),
    sep='\n',
)

输出：

id: 1    word: I     POS: PRON   head id: 2  head: need      deprel: nsubj
id: 2    word: need  POS: VERB   head id: 0  head: root      deprel: root
id: 3    word: you   POS: PRON   head id: 2  head: need      deprel: obj
id: 4    word: to    POS: PART   head id: 5  head: find      deprel: mark
id: 5    word: find  POS: VERB   head id: 2  head: need      deprel: xcomp
id: 6    word: the   POS: DET    head id: 7  head: verbes    deprel: det
id: 7    word: verbes    POS: NOUN   head id: 5  head: find      deprel: obj
id: 8    word: in    POS: ADP    head id: 10     head: sentence      deprel: case
id: 9    word: this  POS: DET    head id: 10     head: sentence      deprel: det
id: 10   word: sentence  POS: NOUN   head id: 5  head: find      deprel: obl

然而，在这方面：

id: 5    word: find  POS: VERB   head id: 2  head: need      deprel: xcomp

我需要让程序能够判断出它是一个不定式动词。

我也遇到了同样的问题；我不想破坏标注器（tagger），最后选择直接调整 Stanza 的句子对象。

单词的 feats 字段会给出动词形式（如下面 id 为 7 的单词中的 VerbForm=Inf 所示），不过我还没有测试过它的可靠性。

# Sample sentence containing an infinitive ("to knock").
test_resp = "He was a little scared to knock on the door"
# NOTE(review): `nlp` is not defined in this snippet — presumably a stanza
# Pipeline created earlier; confirm before running.
res = nlp(test_resp)
# Bare expression: slice of the first sentence's words at list positions 4-7.
# It only shows a value in an interactive session; in a script it does nothing.
res.sentences[0].words[4:8]

输出如下：

[{
   "id": 5,
   "text": "scared",
   "lemma": "scared",
   "upos": "ADJ",
   "xpos": "JJ",
   "feats": "Degree=Pos",
   "head": 0,
   "deprel": "root",
   "misc": "start_char=16|end_char=22"
 },
 {
   "id": 6,
   "text": "to",
   "lemma": "to",
   "upos": "PART",
   "xpos": "TO",
   "head": 7,
   "deprel": "mark",
   "misc": "start_char=23|end_char=25"
 },
 {
   "id": 7,
   "text": "knock",
   "lemma": "knock",
   "upos": "VERB",
   "xpos": "VB",
   "feats": "VerbForm=Inf",
   "head": 5,
   "deprel": "advcl",
   "misc": "start_char=26|end_char=31"
 },
 {
   "id": 8,
   "text": "on",
   "lemma": "on",
   "upos": "ADP",
   "xpos": "IN",
   "head": 10,
   "deprel": "case",
   "misc": "start_char=32|end_char=34"
 }]

出于我的目的，把字符串“to verb”视为单个词项更为有用：将 word.text 更新为“to_verb”，并把动词的字符跨度相应向前扩展以覆盖“to”。这样动词的 word.lemma 和 word.upos 保持不变，但由于删除了“to”，需要把该动词及其后所有单词的 id（单词位置索引）和 head 索引各减 1。

这里使用 deepcopy 只是为了在演示中保留原始解析结果；如果可能，最好避免使用。

import re
import sys
from copy import deepcopy

def patch_inf_verb(processed):
    """Return a copy of the parse in which each 'to VERB' pair is one word.

    For every word tagged ``VERB`` whose immediate predecessor is the token
    "to" (case-insensitive) and whose id is that token's head, the verb's
    text becomes ``"to_" + text``, the ``start_char`` inside its ``misc``
    string is moved back to where the "to" starts, the "to" word is removed
    from the sentence's ``words`` list, and every ``id``/``head`` value
    greater than the removed word's id is decremented by one.

    Only ``sentence.words`` is edited; nothing else on the document is
    touched by this function.

    Args:
        processed: parsed document exposing ``sentences``; each sentence
            exposes a mutable ``words`` list whose items carry ``id``,
            ``text``, ``pos``, ``head`` and ``misc``. Presumably a stanza
            Document, but only these attributes are relied on.

    Returns:
        A deep copy of ``processed`` with the merge applied. The argument
        itself is left unmodified.

    Raises:
        ValueError: if a verb that should be merged has no
            ``start_char=<n>|end_char=<n>`` span in its ``misc`` string.
    """
    # Work on a copy so the caller's parse stays intact.
    results = deepcopy(processed)

    # Captures the text and numerals of the character span in word.misc,
    # e.g. 'start_char=11|end_char=13'. Raw string: "\d" in a plain string
    # literal is an invalid escape sequence.
    misc_vals_re = re.compile(r"(start_char=)(\d+)(\|end_char=)(?P<end>\d+)")

    for result in results.sentences:
        words = result.words

        # Index-driven loop instead of enumerate(): the list shrinks when a
        # "to" is removed, and deleting from a list while a for-loop iterates
        # it silently skips elements.
        wdx = 1
        while wdx < len(words):
            word = words[wdx]
            one_back = words[wdx - 1]

            is_inf_pair = (
                word.pos == "VERB"
                and one_back.text.lower() == "to"
                and one_back.head == word.id
            )
            if not is_inf_pair:
                wdx += 1
                continue

            # Validate before mutating anything, and fail with a clear
            # message rather than an AttributeError on a failed match.
            verb_span = misc_vals_re.search(word.misc or "")
            if verb_span is None:
                raise ValueError(
                    f"word {word.text!r} has no "
                    f"'start_char=..|end_char=..' span in misc: {word.misc!r}"
                )

            # The merged word starts where the "to" starts. Prefer the "to"
            # token's own recorded start; fall back to assuming "to" plus a
            # single space (3 characters) when it has no parseable span.
            to_span = misc_vals_re.search(one_back.misc or "")
            if to_span is not None:
                new_start = int(to_span.group(2))
            else:
                new_start = int(verb_span.group(2)) - 3

            word.text = "to_" + word.text
            # word.upos = "VERB_INF"  # update upos tag or leave as is

            # Replace only the start_char digits so any other content of
            # misc (and the end_char) is preserved as-is.
            word.misc = (
                word.misc[:verb_span.start(2)]
                + str(new_start)
                + word.misc[verb_span.end(2):]
            )

            # Close the gap left by the removed "to": every id and head that
            # pointed past it moves down by one. Character spans of the
            # other words do not change.
            removed_id = one_back.id
            for other in words:
                if other.id > removed_id:
                    other.id -= 1
                if other.head > removed_id:
                    other.head -= 1

            # Drop the "to". The merged verb now sits at wdx - 1, so wdx
            # already points at the next unvisited word: no increment here.
            del words[wdx - 1]
    return results

def format_results(results):
    """Render every word of every sentence as one tab-separated text row.

    Each row lists the ``key: value`` pairs from ``word.to_dict()``, with the
    key left-aligned in a field at least five characters wide. The ``lemma``
    and ``feats`` entries are left out. Rows are joined with newlines.

    Args:
        results: object exposing ``sentences``; each sentence exposes
            ``words`` whose items provide ``to_dict()``.

    Returns:
        The table as a single string (empty when there are no words).
    """
    hidden_keys = ["lemma", "feats"]
    rows = []
    for sent in results.sentences:
        for word in sent.words:
            cells = []
            for key, val in word.to_dict().items():
                if key in hidden_keys:
                    continue
                cells.append(f"{key:5s}: {val}")
            rows.append("\t".join(cells))
    return "\n".join(rows)

我也遇到了同样的问题；我不想破坏标注器（tagger），最后选择直接修改 Stanza 的句子对象。

单词的 feats 字段会给出动词形式（如下面 id 为 7 的单词中的 VerbForm=Inf 所示），不过我还没有测试过它的可靠性。

# Sample sentence containing an infinitive ("to knock").
test_resp = "He was a little scared to knock on the door"
# NOTE(review): `nlp` is not defined in this snippet — presumably a stanza
# Pipeline created earlier; confirm before running.
res = nlp(test_resp)
# Bare expression: slice of the first sentence's words at list positions 4-7.
# It only shows a value in an interactive session; in a script it does nothing.
res.sentences[0].words[4:8]

输出如下：

[{
   "id": 5,
   "text": "scared",
   "lemma": "scared",
   "upos": "ADJ",
   "xpos": "JJ",
   "feats": "Degree=Pos",
   "head": 0,
   "deprel": "root",
   "misc": "start_char=16|end_char=22"
 },
 {
   "id": 6,
   "text": "to",
   "lemma": "to",
   "upos": "PART",
   "xpos": "TO",
   "head": 7,
   "deprel": "mark",
   "misc": "start_char=23|end_char=25"
 },
 {
   "id": 7,
   "text": "knock",
   "lemma": "knock",
   "upos": "VERB",
   "xpos": "VB",
   "feats": "VerbForm=Inf",
   "head": 5,
   "deprel": "advcl",
   "misc": "start_char=26|end_char=31"
 },
 {
   "id": 8,
   "text": "on",
   "lemma": "on",
   "upos": "ADP",
   "xpos": "IN",
   "head": 10,
   "deprel": "case",
   "misc": "start_char=32|end_char=34"
 }]

这里使用 deepcopy 只是为了在演示中保留原始解析结果；如果可能，最好避免使用。

import re
import sys
from copy import deepcopy

def patch_inf_verb(processed):
    """Return a copy of the parse in which each 'to VERB' pair is one word.

    For every word tagged ``VERB`` whose immediate predecessor is the token
    "to" (case-insensitive) and whose id is that token's head, the verb's
    text becomes ``"to_" + text``, the ``start_char`` inside its ``misc``
    string is moved back to where the "to" starts, the "to" word is removed
    from the sentence's ``words`` list, and every ``id``/``head`` value
    greater than the removed word's id is decremented by one.

    Only ``sentence.words`` is edited; nothing else on the document is
    touched by this function.

    Args:
        processed: parsed document exposing ``sentences``; each sentence
            exposes a mutable ``words`` list whose items carry ``id``,
            ``text``, ``pos``, ``head`` and ``misc``. Presumably a stanza
            Document, but only these attributes are relied on.

    Returns:
        A deep copy of ``processed`` with the merge applied. The argument
        itself is left unmodified.

    Raises:
        ValueError: if a verb that should be merged has no
            ``start_char=<n>|end_char=<n>`` span in its ``misc`` string.
    """
    # Work on a copy so the caller's parse stays intact.
    results = deepcopy(processed)

    # Captures the text and numerals of the character span in word.misc,
    # e.g. 'start_char=11|end_char=13'. Raw string: "\d" in a plain string
    # literal is an invalid escape sequence.
    misc_vals_re = re.compile(r"(start_char=)(\d+)(\|end_char=)(?P<end>\d+)")

    for result in results.sentences:
        words = result.words

        # Index-driven loop instead of enumerate(): the list shrinks when a
        # "to" is removed, and deleting from a list while a for-loop iterates
        # it silently skips elements.
        wdx = 1
        while wdx < len(words):
            word = words[wdx]
            one_back = words[wdx - 1]

            is_inf_pair = (
                word.pos == "VERB"
                and one_back.text.lower() == "to"
                and one_back.head == word.id
            )
            if not is_inf_pair:
                wdx += 1
                continue

            # Validate before mutating anything, and fail with a clear
            # message rather than an AttributeError on a failed match.
            verb_span = misc_vals_re.search(word.misc or "")
            if verb_span is None:
                raise ValueError(
                    f"word {word.text!r} has no "
                    f"'start_char=..|end_char=..' span in misc: {word.misc!r}"
                )

            # The merged word starts where the "to" starts. Prefer the "to"
            # token's own recorded start; fall back to assuming "to" plus a
            # single space (3 characters) when it has no parseable span.
            to_span = misc_vals_re.search(one_back.misc or "")
            if to_span is not None:
                new_start = int(to_span.group(2))
            else:
                new_start = int(verb_span.group(2)) - 3

            word.text = "to_" + word.text
            # word.upos = "VERB_INF"  # update upos tag or leave as is

            # Replace only the start_char digits so any other content of
            # misc (and the end_char) is preserved as-is.
            word.misc = (
                word.misc[:verb_span.start(2)]
                + str(new_start)
                + word.misc[verb_span.end(2):]
            )

            # Close the gap left by the removed "to": every id and head that
            # pointed past it moves down by one. Character spans of the
            # other words do not change.
            removed_id = one_back.id
            for other in words:
                if other.id > removed_id:
                    other.id -= 1
                if other.head > removed_id:
                    other.head -= 1

            # Drop the "to". The merged verb now sits at wdx - 1, so wdx
            # already points at the next unvisited word: no increment here.
            del words[wdx - 1]
    return results

def format_results(results):
    """Render every word of every sentence as one tab-separated text row.

    Each row lists the ``key: value`` pairs from ``word.to_dict()``, with the
    key left-aligned in a field at least five characters wide. The ``lemma``
    and ``feats`` entries are left out. Rows are joined with newlines.

    Args:
        results: object exposing ``sentences``; each sentence exposes
            ``words`` whose items provide ``to_dict()``.

    Returns:
        The table as a single string (empty when there are no words).
    """
    hidden_keys = ["lemma", "feats"]
    rows = []
    for sent in results.sentences:
        for word in sent.words:
            cells = []
            for key, val in word.to_dict().items():
                if key in hidden_keys:
                    continue
                cells.append(f"{key:5s}: {val}")
            rows.append("\t".join(cells))
    return "\n".join(rows)