为R中的特定单词标记词性

为R中的特定单词标记词性,r,nlp,R,Nlp,我有一个a栏有句子,B栏有一些单词。我想检查词性B栏中的单词是否属于A栏中的句子 目前,我可以使用以下代码获取单个句子的词性: 我试图得到文本文件中每个句子对应的词性。请建议此代码 s <- unlist(lapply(posText, function(x) { str_split(x, "\n") })) tagPOS <- function(x, ...) { s <- as.String(x) word_token_annotator <- Maxen

我有一个表,A列是句子,B列是一些单词。我想查出B列中的每个单词在A列对应句子中的词性。

目前,我可以使用以下代码获取单个句子的词性:

我想得到文本文件中每个句子对应的词性。请问这段代码应该如何修改?

# Split every element of posText at newlines and flatten into one vector.
s <- unlist(lapply(posText, str_split, pattern = "\n"))

# Tag the parts of speech of a piece of text.
#
# The input is converted with as.String(), annotated as one sentence that
# spans the whole string, tokenized into words, and each word is POS-tagged.
#
# Args:
#   x:   Text to tag.
#   ...: Unused; kept so existing calls that pass extra arguments still work.
#
# Returns:
#   A list with two elements:
#     POStagged - a single string of "word/TAG" pairs separated by spaces;
#     POStags   - character vector of the tags, in token order.
#   Input without any non-whitespace character yields
#   list(POStagged = "", POStags = character(0)).
tagPOS <- function(x, ...) {
  # Blank input has no word tokens to tag. Return an empty result up front so
  # that mapping this function over lines that include blank ones does not
  # depend on how the annotators handle a zero-word "sentence".
  if (!any(nzchar(trimws(x)))) {
    return(list(POStagged = "", POStags = character(0)))
  }

  s <- as.String(x)

  # A single "sentence" annotation covering the entire string.
  sentence_ann <- Annotation(1L, "sentence", 1L, nchar(s))
  word_ann <- annotate(s, Maxent_Word_Token_Annotator(), sentence_ann)
  pos_ann <- annotate(s, Maxent_POS_Tag_Annotator(), word_ann)

  # Keep only the word-level annotations.
  words <- pos_ann[pos_ann$type == "word"]

  # vapply() insists on exactly one tag per word, so a token without a POS
  # feature raises an error instead of silently shifting tags against words.
  POStags <- vapply(words$features, `[[`, character(1), "POS")
  POStagged <- paste(sprintf("%s/%s", s[words], POStags), collapse = " ")
  list(POStagged = POStagged, POStags = POStags)
}

# Tag the split text; the result is a list with elements POStagged and POStags.
tagged_str <-  tagPOS(s)

使用lapply可以标记多个句子。由于您没有提供可重现的数据,我自己创建了一份数据。

代码

#Reproducible data - Quotes from  Wuthering Heights by  Emily Bronte
posText<- "I gave him my heart, and he took and pinched it to death; and flung it back to me.
           People feel with their hearts, Ellen, and since he has destroyed mine, I have not power to feel for him."

library(stringr)
# Split the text into one element per line (the separator is "\n").
s <- unlist(lapply(posText, str_split, pattern = "\n"))

library(NLP)
library(openNLP)

# Tag the parts of speech of a piece of text.
#
# The input is converted with as.String(), annotated as one sentence that
# spans the whole string, tokenized into words, and each word is POS-tagged.
#
# Args:
#   x:   Text to tag.
#   ...: Unused; kept so existing calls that pass extra arguments still work.
#
# Returns:
#   A list with two elements:
#     POStagged - a single string of "word/TAG" pairs separated by spaces;
#     POStags   - character vector of the tags, in token order.
#   Input without any non-whitespace character yields
#   list(POStagged = "", POStags = character(0)).
tagPOS <- function(x, ...) {
  # Blank input has no word tokens to tag. Return an empty result up front so
  # that mapping this function over lines that include blank ones does not
  # depend on how the annotators handle a zero-word "sentence".
  if (!any(nzchar(trimws(x)))) {
    return(list(POStagged = "", POStags = character(0)))
  }

  s <- as.String(x)

  # A single "sentence" annotation covering the entire string.
  sentence_ann <- Annotation(1L, "sentence", 1L, nchar(s))
  word_ann <- annotate(s, Maxent_Word_Token_Annotator(), sentence_ann)
  pos_ann <- annotate(s, Maxent_POS_Tag_Annotator(), word_ann)

  # Keep only the word-level annotations.
  words <- pos_ann[pos_ann$type == "word"]

  # vapply() insists on exactly one tag per word, so a token without a POS
  # feature raises an error instead of silently shifting tags against words.
  POStags <- vapply(words$features, `[[`, character(1), "POS")
  POStagged <- paste(sprintf("%s/%s", s[words], POStags), collapse = " ")
  list(POStagged = POStagged, POStags = POStags)
}

# Tag each element of s separately, then bind the per-element lists row-wise
# and convert the outcome to a data frame.
result <- as.data.frame(do.call(rbind, lapply(s, tagPOS)))

使用lapply可以标记多个句子。由于您没有提供可重现的数据,我自己创建了一份数据。

代码

#Reproducible data - Quotes from  Wuthering Heights by  Emily Bronte
posText<- "I gave him my heart, and he took and pinched it to death; and flung it back to me.
           People feel with their hearts, Ellen, and since he has destroyed mine, I have not power to feel for him."

library(stringr)
# Split the text into one element per line (the separator is "\n").
s <- unlist(lapply(posText, str_split, pattern = "\n"))

library(NLP)
library(openNLP)

# Tag the parts of speech of a piece of text.
#
# The input is converted with as.String(), annotated as one sentence that
# spans the whole string, tokenized into words, and each word is POS-tagged.
#
# Args:
#   x:   Text to tag.
#   ...: Unused; kept so existing calls that pass extra arguments still work.
#
# Returns:
#   A list with two elements:
#     POStagged - a single string of "word/TAG" pairs separated by spaces;
#     POStags   - character vector of the tags, in token order.
#   Input without any non-whitespace character yields
#   list(POStagged = "", POStags = character(0)).
tagPOS <- function(x, ...) {
  # Blank input has no word tokens to tag. Return an empty result up front so
  # that mapping this function over lines that include blank ones does not
  # depend on how the annotators handle a zero-word "sentence".
  if (!any(nzchar(trimws(x)))) {
    return(list(POStagged = "", POStags = character(0)))
  }

  s <- as.String(x)

  # A single "sentence" annotation covering the entire string.
  sentence_ann <- Annotation(1L, "sentence", 1L, nchar(s))
  word_ann <- annotate(s, Maxent_Word_Token_Annotator(), sentence_ann)
  pos_ann <- annotate(s, Maxent_POS_Tag_Annotator(), word_ann)

  # Keep only the word-level annotations.
  words <- pos_ann[pos_ann$type == "word"]

  # vapply() insists on exactly one tag per word, so a token without a POS
  # feature raises an error instead of silently shifting tags against words.
  POStags <- vapply(words$features, `[[`, character(1), "POS")
  POStagged <- paste(sprintf("%s/%s", s[words], POStags), collapse = " ")
  list(POStagged = POStagged, POStags = POStags)
}

# Tag each element of s separately, then bind the per-element lists row-wise
# and convert the outcome to a data frame.
result <- as.data.frame(do.call(rbind, lapply(s, tagPOS)))

我维护的 tagger 包或许能让这件事变得更简单。它的一些行为类似于 Python,下面我将进行演示:

数据 标记 选择标记(regex也可用) 基本POS类型 获取POS计数 仅获取POS标签
我维护的 tagger 包或许能让这件事变得更简单。它的一些行为类似于 Python,下面我将进行演示:

数据 标记 选择标记(regex也可用) 基本POS类型 获取POS计数 仅获取POS标签
你能发布一些数据和期望的结果吗?你能发布一些数据和期望的结果吗?
posText<- "I gave him my heart, and he took and pinched it to death; and flung it back to me.
           People feel with their hearts, Ellen, and since he has destroyed mine, I have not power to feel for him."
# Install pacman only when it is missing. requireNamespace() tests for
# availability without attaching the package; pacman is only called through
# `pacman::` here, so it does not need to be on the search path.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# Load the GitHub-hosted packages used by the calls that follow.
pacman::p_load_gh(c(
  "trinker/termco",
  "trinker/tagger",
  "trinker/textshape"
))
# Split posText into sentences, take the first element of the result, and
# POS-tag it.
tagged <- tag_pos(split_sentence(posText)[[1]])

# Print the tagged sentences (output shown below).
tagged

## [1] "I/PRP gave/VBD him/PRP my/PRP$ heart/NN ,/, and/CC he/PRP took/VBD and/CC pinched/VBD it/PRP to/TO death/NN ;/: and/CC flung/VBD it/PRP back/RB to/TO me/PRP ./."
## [2] "People/NNS feel/VBP with/IN their/PRP$ hearts/NNS ,/, Ellen/NNP ,/, and/CC since/IN he/PRP has/VBZ destroyed/VBN mine/NN ,/, I/PRP have/VBP not/RB power/NN to/TO feel/VB for/IN him/PRP ./."
# c() on the tagged object shows, per sentence, the tokens named by their tags
# (see the output below).
c(tagged)

## [[1]]
##       PRP       VBD       PRP      PRP$        NN         ,        CC       PRP 
##       "I"    "gave"     "him"      "my"   "heart"       ","     "and"      "he" 
##       VBD        CC       VBD       PRP        TO        NN         :        CC 
##    "took"     "and" "pinched"      "it"      "to"   "death"       ";"     "and" 
##       VBD       PRP        RB        TO       PRP         . 
##   "flung"      "it"    "back"      "to"      "me"       "." 
## 
## [[2]]
##         NNS         VBP          IN        PRP$         NNS           , 
##    "People"      "feel"      "with"     "their"    "hearts"         "," 
##         NNP           ,          CC          IN         PRP         VBZ 
##     "Ellen"         ","       "and"     "since"        "he"       "has" 
##         VBN          NN           ,         PRP         VBP          RB 
## "destroyed"      "mine"         ","         "I"      "have"       "not" 
##          NN          TO          VB          IN         PRP           . 
##     "power"        "to"      "feel"       "for"       "him"         "." 
# Keep only the tokens whose tag is one of the listed noun tags.
select_tags(tagged, c("NN", "NNP", "NNPS", "NNS"))

## [1] "heart/NN death/NN"                               
## [2] "People/NNS hearts/NNS Ellen/NNP mine/NN power/NN"
# Replace the detailed tags with basic part-of-speech names (output below).
as_basic(tagged)

## [1] "I/pronoun gave/verb him/pronoun my/pronoun heart/noun ,/. and/conjunction he/pronoun took/verb and/conjunction pinched/verb it/pronoun to/preposition death/noun ;/. and/conjunction flung/verb it/pronoun back/adverb to/preposition me/pronoun ./."                     
## [2] "People/noun feel/verb with/preposition their/pronoun hearts/noun ,/. Ellen/noun ,/. and/conjunction since/preposition he/pronoun has/verb destroyed/verb mine/noun ,/. I/pronoun have/verb not/adverb power/noun to/preposition feel/verb for/preposition him/pronoun ./."
# Tabulate how often each tag occurs in each sentence (table printed below).
count_tags(tagged, pretty = FALSE)

##    n.tokens , . : CC IN NN NNP NNS PRP PRP$ RB TO VB VBD VBN VBP VBZ id
## 1:       22 1 1 1  3  0  2   0   0   6    1  1  2  0   4   0   0   0  1
## 2:       24 3 1 0  1  3  2   1   2   3    1  1  1  1   0   1   2   1  2
# The names of each tagged sentence are its POS tags; extract just those.
lapply(tagged, names)

## [[1]]
##  [1] "PRP"  "VBD"  "PRP"  "PRP$" "NN"   ","    "CC"   "PRP"  "VBD"  "CC"  
## [11] "VBD"  "PRP"  "TO"   "NN"   ":"    "CC"   "VBD"  "PRP"  "RB"   "TO"  
## [21] "PRP"  "."   
## 
## [[2]]
##  [1] "NNS"  "VBP"  "IN"   "PRP$" "NNS"  ","    "NNP"  ","    "CC"   "IN"  
## [11] "PRP"  "VBZ"  "VBN"  "NN"   ","    "PRP"  "VBP"  "RB"   "NN"   "TO"  
## [21] "VB"   "IN"   "PRP"  "."