R text mining: counting the frequency of phrases (multiple words)


I am familiar with using the tm library to create a term-document matrix and count term frequencies.

But those terms are all single words.

How can I count the number of times a multi-word phrase occurs in a document and/or in the corpus?

EDIT:

I am adding my code below to improve/clarify my post.

Here is the standard code for building a term-document matrix:

library(tm)


cname <- ("C:/Users/George/Google Drive/R Templates/Gospels corpus")   

corpus <- Corpus(DirSource(cname))

#Cleaning
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, c("a","the","an","that","and"))

#convert to a plain text file
corpus <- tm_map(corpus, PlainTextDocument)

#Create a term document matrix
tdm1 <- TermDocumentMatrix(corpus)

m1 <- as.matrix(tdm1)
word.freq <- sort(rowSums(m1), decreasing = TRUE)
word.freq <- word.freq[1:100]
I want to be able to search the corpus for multi-word terms, for example the phrase "come from" rather than "come" and "from" separately.

Thank you.
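For reference, one commonly used way to get two-word terms into a term-document matrix is to hand TermDocumentMatrix() a custom tokenizer. The sketch below is not part of the original question; it assumes the corpus object built above and uses the words() and ngrams() helpers from the NLP package:

library(tm)
library(NLP)

# Tokenize a document into two-word phrases ("bigrams") instead of single words.
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2L), paste, collapse = " "), use.names = FALSE)

# Rows of this matrix are bigrams rather than single terms.
tdm2 <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))

# Bigram counts across the whole corpus, most frequent first.
phrase.freq <- sort(rowSums(as.matrix(tdm2)), decreasing = TRUE)
head(phrase.freq, 20)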

Given the text:

text <- "This is my little R text example and I want to count the frequency of some pattern (and - is - my - of). This is my little R text example and I want to count the frequency of some patter."
For the frequency of a pattern, count the matches returned by gregexpr() (regexpr() only finds the first match); here, the whole word "is":

length(gregexpr("\\bis\\b", text)[[1]])

[1] 3
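The same idea extends directly to multi-word phrases, which is what the question asks about. A minimal sketch (the phrase "is my" is only an illustration); gregexpr() returns -1 when there is no match, so that case is treated as zero:

count_phrase <- function(phrase, text) {
  # Positions of every literal (fixed = TRUE) occurrence of the phrase
  m <- gregexpr(phrase, text, fixed = TRUE)[[1]]
  # gregexpr() signals "no match" with -1
  if (m[1] == -1) 0L else length(m)
}

count_phrase("is my", text)
# [1] 2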


Here is a code example using tidytext; the same technique can be extended to larger values of n:
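The code below assumes a bigrams data frame of two-word tokens that is not defined in the answer. A minimal, hypothetical sketch of how it could be created with tidytext's unnest_tokens() follows (the docs data frame and its year and text columns are illustrative assumptions; ggplot2 is loaded here because the plotting code further down uses it):

library(dplyr)
library(tidytext)
library(ggplot2)

# Hypothetical input: one row per document, with a year and the document text.
docs <- data.frame(
  year = c(2019, 2020),
  text = c("count the frequency of some pattern",
           "count the frequency of some other pattern"),
  stringsAsFactors = FALSE
)

# Split each text into overlapping two-word tokens ("bigrams").
bigrams <- docs %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)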

bigram_tf_idf <- bigrams %>%
  count(year, bigram) %>%
  filter(n > 2) %>%
  bind_tf_idf(bigram, year, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf.plot <- bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  filter(tf_idf > 0) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram))))

bigram_tf_idf.plot %>% 
  group_by(year) %>% 
  top_n(10) %>% 
  ungroup %>%
  ggplot(aes(bigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  theme(text = element_text(size = 10)) +
  coord_flip()
I created the following function to get word n-grams and their corresponding frequencies:
library(tau) 
library(data.table)
# Given a string vector and an n-gram size, return the word n-grams with their corresponding frequencies
createNgram <-function(stringVector, ngramSize){

  ngram <- data.table()

  ng <- textcnt(stringVector, method = "string", n=ngramSize, tolower = FALSE)

  if(ngramSize==1){
    ngram <- data.table(w1 = names(ng), freq = unclass(ng), length=nchar(names(ng)))  
  }
  else {
    ngram <- data.table(w1w2 = names(ng), freq = unclass(ng), length=nchar(names(ng)))
  }
  return(ngram)
}
text <- "This is my little R text example and I want to count the frequency of some pattern (and - is - my - of). This is my little R text example and I want to count the frequency of some patter."
res <- createNgram(text, 2)
           w1w2      freq   length
 1:        I want    2      6
 2:        R text    2      6
 3:       This is    2      7
 4:         and I    2      5
 5:        and is    1      6
 6:     count the    2      9
 7:   example and    2     11
 8:  frequency of    2     12
 9:         is my    3      5
10:      little R    2      8
11:     my little    2      9
12:         my of    1      5
13:       of This    1      7
14:       of some    2      7
15:   pattern and    1     11
16:   some patter    1     11
17:  some pattern    1     12
18:  text example    2     12
19: the frequency    2     13
20:      to count    2      8
21:       want to    2      7
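Because the result is a data.table, the count for a specific phrase can be looked up directly; a small usage sketch based on the output above:

# Rows for one specific phrase; the output above shows "is my" occurring 3 times.
res[w1w2 == "is my"]

The tokenizers package also offers tokenize_ngrams() as another way to generate word n-grams.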