R 文本挖掘-统计短语的频率(多个单词)
标签:r, nlp, text-mining, n-gram
我熟悉使用 tm 库创建词项-文档矩阵(TDM)并计算词项频率,但这些词项都是单个词。如何统计多词短语在文档和/或语料库中出现的次数?
编辑:我添加了代码,以改进/澄清我的帖子。下面是构建词项-文档矩阵的标准代码:
library(tm)
# Build a term-document matrix from the text files in a directory and keep the
# most frequent single-word terms.
cname <- "C:/Users/George/Google Drive/R Templates/Gospels corpus"
corpus <- Corpus(DirSource(cname))

# Cleaning ----
# Base string functions such as tolower() are wrapped in content_transformer()
# so that tm_map() keeps each element a tm document instead of replacing it
# with a bare character vector.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, c("a", "the", "an", "that", "and"))
# NOTE(review): the former `tm_map(corpus, PlainTextDocument)` step was only a
# workaround for passing bare tolower() above, so it is dropped here -- confirm
# against the installed tm version.

# Create a term document matrix ----
tdm1 <- TermDocumentMatrix(corpus)
m1 <- as.matrix(tdm1)
word.freq <- sort(rowSums(m1), decreasing = TRUE)
# head() stops at the number of available terms, whereas `[1:100]` pads the
# result with NA when the corpus has fewer than 100 distinct terms.
word.freq <- head(word.freq, 100)
我希望能够在语料库中搜索多词短语。例如,把 “came from” 作为一个整体来统计,而不是分别统计 “came” 和 “from”。
谢谢。鉴于文本:
# Sample text used to demonstrate pattern counting.
text <- "This is my little R text example and I want to count the frequency of some pattern (and - is - my - of). This is my little R text example and I want to count the frequency of some patter."

# Frequency of a pattern:
# regexpr() reports only the FIRST match, and its "match.length" attribute is
# the length of that match (2 for "is"), not a count. gregexpr() locates every
# match, regmatches() extracts them, and lengths() counts them. The \\b word
# boundaries restrict the count to the standalone word "is", so the "is" inside
# "This" is not counted.
lengths(regmatches(text, gregexpr("\\bis\\b", text, perl = TRUE)))
[1] 3
我创建了以下函数来获取单词n-grams及其相应的频率
library(tau)
library(data.table)
# given a string vector and size of ngrams this function returns word ngrams with corresponding frequencies
# Count word n-grams in a character vector.
#
# stringVector: character vector of text to scan.
# ngramSize:    number of words per n-gram.
#
# Returns a data.table with the n-gram text (column `w1` when ngramSize is 1,
# `w1w2` otherwise), its count (`freq`) and its character length (`length`).
createNgram <- function(stringVector, ngramSize) {
  counts <- textcnt(stringVector, method = "string", n = ngramSize, tolower = FALSE)
  gram_col <- if (ngramSize == 1) "w1" else "w1w2"
  grams <- names(counts)
  # Assemble the columns as a named list so the only varying part, the name of
  # the n-gram column, is chosen in one place.
  cols <- list(grams, unclass(counts), nchar(grams))
  names(cols) <- c(gram_col, "freq", "length")
  do.call(data.table, cols)
}
下面是一个使用Tidytext的代码示例: 同样的技术可以扩展到更大的n值
# NOTE(review): assumes dplyr, tidytext and ggplot2 are attached and that
# `bigrams` has `year` and `bigram` columns -- neither is defined in this
# snippet, confirm against the surrounding script.

# tf-idf of every bigram within each year; bigrams counted 2 times or fewer in
# a year are dropped before scoring.
bigram_tf_idf <- bigrams %>%
  count(year, bigram) %>%
  filter(n > 2) %>%
  bind_tf_idf(bigram, year, n) %>%
  arrange(desc(tf_idf))

# Turn `bigram` into a factor whose levels follow the tf-idf ordering, so the
# plot axis uses that order instead of alphabetical order.
bigram_tf_idf.plot <- bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  filter(tf_idf > 0) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram))))

# Top 10 bigrams per year by tf-idf, one facet per year.
# `wt` is given explicitly: a bare top_n(10) ranks by whatever column happens
# to be last, which silently changes if a column is ever added.
bigram_tf_idf.plot %>%
  group_by(year) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  theme(text = element_text(size = 10)) +
  coord_flip()
# Restored from a machine-translated copy in which the R identifiers had been
# translated and the `<-`/`%>%` operators mangled, leaving it unparseable.
# NOTE(review): assumes dplyr, tidytext and ggplot2 are attached and that
# `bigrams` has `year` and `bigram` columns -- confirm against the caller.

# tf-idf of every bigram within each year (bigrams with n <= 2 are dropped).
bigram_tf_idf <- bigrams %>%
  count(year, bigram) %>%
  filter(n > 2) %>%
  bind_tf_idf(bigram, year, n) %>%
  arrange(desc(tf_idf))

# Order the bigram factor levels by tf-idf for plotting.
bigram_tf_idf.plot <- bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  filter(tf_idf > 0) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram))))

# Top 10 bigrams per year; `wt = tf_idf` is explicit rather than relying on
# top_n()'s "last column" default.
bigram_tf_idf.plot %>%
  group_by(year) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  theme(text = element_text(size = 10)) +
  coord_flip()
请阅读有关如何提出好问题以及如何给出可复现示例的说明,这将使其他人更容易帮助你。你可能还会发现 tokenizers 软件包中的 tokenize_ngrams() 很有用,请参阅该软件包的文档。
library(tau)
library(data.table)
# given a string vector and size of ngrams this function returns word ngrams with corresponding frequencies
# Tabulate word n-grams and how often each occurs.
#
# stringVector: character vector holding the text.
# ngramSize:    how many words make up one n-gram.
#
# Returns a data.table with three columns: the n-gram itself (named `w1` for
# single words, `w1w2` for any larger size), `freq` (occurrence count) and
# `length` (number of characters in the n-gram).
createNgram <- function(stringVector, ngramSize) {
  ng <- textcnt(stringVector, method = "string", n = ngramSize, tolower = FALSE)
  # Guard clause: everything except the single-word case uses the `w1w2` name.
  if (!(ngramSize == 1)) {
    return(data.table(w1w2 = names(ng), freq = unclass(ng), length = nchar(names(ng))))
  }
  data.table(w1 = names(ng), freq = unclass(ng), length = nchar(names(ng)))
}
# Example input: the same sentence twice, the first copy followed by a
# parenthesised list of words and the second ending in "patter".
text <- paste0(
  "This is my little R text example and I want to count the frequency of ",
  "some pattern (and - is - my - of). ",
  "This is my little R text example and I want to count the frequency of ",
  "some patter."
)
# Two-word n-grams of the example text with their frequencies.
res <- createNgram(stringVector = text, ngramSize = 2)
w1w2 freq length
1: I want 2 6
2: R text 2 6
3: This is 2 7
4: and I 2 5
5: and is 1 6
6: count the 2 9
7: example and 2 11
8: frequency of 2 12
9: is my 3 5
10: little R 2 8
11: my little 2 9
12: my of 1 5
13: of This 1 7
14: of some 2 7
15: pattern and 1 11
16: some patter 1 11
17: some pattern 1 12
18: text example 2 12
19: the frequency 2 13
20: to count 2 8
21: want to 2 7
# NOTE(review): `bigrams` and the dplyr / tidytext / ggplot2 functions used
# below are not defined in this snippet; presumably `bigrams` has `year` and
# `bigram` columns -- verify against the surrounding script.

# Score each bigram per year with tf-idf, ignoring bigrams that occur 2 times
# or fewer within a year.
bigram_tf_idf <- bigrams %>%
  count(year, bigram) %>%
  filter(n > 2) %>%
  bind_tf_idf(bigram, year, n) %>%
  arrange(desc(tf_idf))

# Plot-ready copy: keep positive scores and make `bigram` a factor ordered by
# tf-idf so the axis follows the score rather than the alphabet.
bigram_tf_idf.plot <- bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  filter(tf_idf > 0) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram))))

# Ten highest-scoring bigrams for each year, drawn as one facet per year.
# The ranking column is named explicitly; top_n(10) on its own falls back to
# the last column of the table, which is fragile.
bigram_tf_idf.plot %>%
  group_by(year) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  theme(text = element_text(size = 10)) +
  coord_flip()