使用 phrasemachine（R）后，从提取的短语列表创建文档特征矩阵_R_Dplyr_Nlp_Plyr_Nested Lists

使用 phrasemachine（R）后，从提取的短语列表创建文档特征矩阵

r nlp

使用短语机器（R）后从提取短语列表创建文档特征矩阵,r,dplyr,nlp,plyr,nested-lists,R,Dplyr,Nlp,Plyr,Nested Lists,应用phrasemachine（）后，我有一个包含短语的嵌套列表。现在，我想创建一个文档特征矩阵，第一列包含文档（用户），其余列包含所有特征，每个用户在单元格中的使用频率 library(rJava) library(phrasemachine) library(quanteda) #creating dummy data id <- c(1:2) text <- c("Election day is coming up and I am super excited. Electi

应用 `phrasemachine()` 后，我得到了一个包含短语的嵌套列表。现在，我想创建一个文档特征矩阵：第一列是文档（用户），其余各列是所有特征（短语），每个单元格是该用户使用该特征的频率。

library(rJava)
library(phrasemachine)
library(quanteda)

# Toy data set: one row per document (user), identified by `id`.
id <- 1:2
text <- c(
  "Election day is coming up and I am super excited. Election day. Wooho. I voted President Obama.",
  "School is boring. Partying is cool. Happy Birthday to me. When is Election Day?"
)
# Keep `text` as a character column from the start.
test <- data.frame(id = id, text = text, stringsAsFactors = FALSE)

# Build a quanteda corpus named by document id, then tokenize it.
corpus_test <- corpus(test$text, docnames = test$id)
tokens_test <- tokens(corpus_test)

# Extract 2- to 3-word phrases; request both the phrase vectors and their
# tag sequences for every document.
# NOTE(review): a tokens object is passed here -- confirm phrasemachine()
# accepts it in the installed version.
phrases_test <- phrasemachine(
  tokens_test,
  minimum_ngram_length = 2,
  maximum_ngram_length = 3,
  return_phrase_vectors = TRUE,
  return_tag_sequences = TRUE
)
phrases_test

# > phrases_test
# [[1]]
# [[1]]$phrases
# [1] "Election_day"    "Election_day"    "President_Obama"
# 
# [[1]]$tags
# [1] "NN" "NN" "NN"
# 
# 
# [[2]]
# [[2]]$phrases
# [1] "Happy_Birthday" "Election_Day"  
# 
# [[2]]$tags
# [1] "AN" "NN"

我试着使用 `laply`/`lapply`，但由于每个用户的短语数量不同（结果的维度不一致），所以这不起作用。

以下是我尝试过的：

# Attempt 1 (plyr): flatten the nested phrasemachine result with laply().
library(plyr)
# This call fails with the error quoted at the end of the line: the documents
# yield different numbers of phrases (3 vs. 2 in the sample output), so the
# per-document results do not share the same dimensions.
phrases_user <- laply(phrases_test, function(x) laply(x, identity)) #Error: Results must have the same dimensions.

# Attempt 2 (base lapply): keep only the `phrases` entry of every document.
# dplyr is attached here, but no dplyr function is called in this snippet.
library(dplyr)
# `[` keeps the list wrapper, so each element of the result is still a
# one-element list (named "phrases"), not a bare character vector.
phrases_user <- lapply(phrases_test, `[`, "phrases")

library(plyr)
phrases_user

使用 udpipe 和 phrasemachine 的示例：
library(udpipe)
# Two sample documents; each element of `text` is one document.
text <- c(
  "Election day is coming up and I am super excited. Election day. Wooho. I voted President Obama.",
  "School is boring. Partying is cool. Happy Birthday to me. When is Election Day?"
)

# Annotate the documents with udpipe using the "english" model.
x <- udpipe(text, "english")

# Convert the `upos` column into phrasemachine-style tags.
x[["tags"]] <- as_phrasemachine(x[["upos"]], type = "upos")

# Look up phrases whose tag sequence matches the regular expression below.
keyw <- keywords_phrases(
  x[["tags"]],
  term = x[["token"]],
  pattern = "(A|N)*N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  detailed = FALSE
)
head(keyw)

# Recode the tokens using the detected keywords and their n-gram lengths.
x[["term"]] <- txt_recode_ngram(
  x[["token"]],
  compound = keyw[["keyword"]],
  ngram = keyw[["ngram"]]
)

# Count the `term` and `token` columns per `doc_id`, then cast the
# frequencies into a document-term matrix.
dtm <- document_term_matrix(
  document_term_frequencies(x, document = "doc_id", term = c("term", "token"))
)

library(udpipe)
text
# `phrases_user` is an unnamed list with one entry per document, each entry
# being a one-element list that holds that document's `phrases` vector.
# Indexing the outer list with [["phrases"]] or [["id"]] therefore yields
# NULL, so the phrase vectors are pulled out per document instead and the
# document ids are taken from `test`.
phrase_list <- lapply(
  phrases_user,
  function(doc) as.character(doc[["phrases"]]) # NULL -> character(0)
)
doc_ids <- as.character(test[["id"]])
names(phrase_list) <- doc_ids

# One text per document: that document's phrases joined by single spaces
# (a document without phrases becomes an empty string).
corpus_test_2 <- corpus(
  vapply(phrase_list, paste, character(1), collapse = " "),
  docnames = doc_ids
)

# Build the document-feature matrix straight from the phrase vectors so that
# every extracted phrase is exactly one feature; rows are the documents.
dfm_test <- dfm(as.tokens(phrase_list))

library(udpipe)
# Two sample documents; each element of `text` is one document.
text <- c(
  "Election day is coming up and I am super excited. Election day. Wooho. I voted President Obama.",
  "School is boring. Partying is cool. Happy Birthday to me. When is Election Day?"
)

# Annotate the documents with udpipe using the "english" model.
x <- udpipe(text, "english")

# Convert the `upos` column into phrasemachine-style tags.
x[["tags"]] <- as_phrasemachine(x[["upos"]], type = "upos")

# Look up phrases whose tag sequence matches the regular expression below.
keyw <- keywords_phrases(
  x[["tags"]],
  term = x[["token"]],
  pattern = "(A|N)*N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  detailed = FALSE
)
head(keyw)

# Recode the tokens using the detected keywords and their n-gram lengths.
x[["term"]] <- txt_recode_ngram(
  x[["token"]],
  compound = keyw[["keyword"]],
  ngram = keyw[["ngram"]]
)

# Count the `term` and `token` columns per `doc_id`, then cast the
# frequencies into a document-term matrix.
dtm <- document_term_matrix(
  document_term_frequencies(x, document = "doc_id", term = c("term", "token"))
)