R text mining: counting the frequency of phrases (multiple words)


I am familiar with using the tm library to create a term-document matrix and count term frequencies.

But those terms are all single words.

How can I count the number of times a multi-word phrase occurs in a document and/or in the corpus?

EDIT:

I am adding my code below to improve/clarify my post.

Here is the standard code for building a term-document matrix:

library(tm)


cname <- ("C:/Users/George/Google Drive/R Templates/Gospels corpus")   

corpus <- Corpus(DirSource(cname))

#Cleaning
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, c("a","the","an","that","and"))

#convert to a plain text file
corpus <- tm_map(corpus, PlainTextDocument)

#Create a term document matrix
tdm1 <- TermDocumentMatrix(corpus)

m1 <- as.matrix(tdm1)
word.freq <- sort(rowSums(m1), decreasing = TRUE)
word.freq <- word.freq[1:100]
I want to be able to search the corpus for multi-word terms, for example the phrase "come from" rather than "come" and "from" separately.

Thank you.
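For reference, one commonly used way to get two-word terms into a term-document matrix is to hand TermDocumentMatrix() a custom tokenizer. The sketch below is not part of the original question; it assumes the corpus object built above and uses the words() and ngrams() helpers from the NLP package:

library(tm)
library(NLP)

# Tokenize a document into two-word phrases ("bigrams") instead of single words.
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2L), paste, collapse = " "), use.names = FALSE)

# Rows of this matrix are bigrams rather than single terms.
tdm2 <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))

# Bigram counts across the whole corpus, most frequent first.
phrase.freq <- sort(rowSums(as.matrix(tdm2)), decreasing = TRUE)
head(phrase.freq, 20)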

Given the text:

text <- "This is my little R text example and I want to count the frequency of some pattern (and - is - my - of). This is my little R text example and I want to count the frequency of some patter."
For the frequency of a pattern, count the matches returned by gregexpr() (regexpr() only finds the first match); here, the whole word "is":

length(gregexpr("\\bis\\b", text)[[1]])

[1] 3
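The same idea extends directly to multi-word phrases, which is what the question asks about. A minimal sketch (the phrase "is my" is only an illustration); gregexpr() returns -1 when there is no match, so that case is treated as zero:

count_phrase <- function(phrase, text) {
  # Positions of every literal (fixed = TRUE) occurrence of the phrase
  m <- gregexpr(phrase, text, fixed = TRUE)[[1]]
  # gregexpr() signals "no match" with -1
  if (m[1] == -1) 0L else length(m)
}

count_phrase("is my", text)
# [1] 2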


Here is a code example using tidytext; the same technique can be extended to larger values of n:
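The code below assumes a bigrams data frame of two-word tokens that is not defined in the answer. A minimal, hypothetical sketch of how it could be created with tidytext's unnest_tokens() follows (the docs data frame and its year and text columns are illustrative assumptions; ggplot2 is loaded here because the plotting code further down uses it):

library(dplyr)
library(tidytext)
library(ggplot2)

# Hypothetical input: one row per document, with a year and the document text.
docs <- data.frame(
  year = c(2019, 2020),
  text = c("count the frequency of some pattern",
           "count the frequency of some other pattern"),
  stringsAsFactors = FALSE
)

# Split each text into overlapping two-word tokens ("bigrams").
bigrams <- docs %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)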

bigram_tf_idf <- bigrams %>%
  count(year, bigram) %>%
  filter(n > 2) %>%
  bind_tf_idf(bigram, year, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf.plot <- bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  filter(tf_idf > 0) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram))))

bigram_tf_idf.plot %>% 
  group_by(year) %>% 
  top_n(10) %>% 
  ungroup %>%
  ggplot(aes(bigram, tf_idf, fill = year)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  theme(text = element_text(size = 10)) +
  coord_flip()
I created the following function to get word n-grams and their corresponding frequencies:
library(tau) 
library(data.table)
# Given a string vector and an n-gram size, return the word n-grams with their corresponding frequencies
createNgram <-function(stringVector, ngramSize){

  ngram <- data.table()

  ng <- textcnt(stringVector, method = "string", n=ngramSize, tolower = FALSE)

  if(ngramSize==1){
    ngram <- data.table(w1 = names(ng), freq = unclass(ng), length=nchar(names(ng)))  
  }
  else {
    ngram <- data.table(w1w2 = names(ng), freq = unclass(ng), length=nchar(names(ng)))
  }
  return(ngram)
}
text <- "This is my little R text example and I want to count the frequency of some pattern (and - is - my - of). This is my little R text example and I want to count the frequency of some patter."
res <- createNgram(text, 2)
           w1w2      freq   length
 1:        I want    2      6
 2:        R text    2      6
 3:       This is    2      7
 4:         and I    2      5
 5:        and is    1      6
 6:     count the    2      9
 7:   example and    2     11
 8:  frequency of    2     12
 9:         is my    3      5
10:      little R    2      8
11:     my little    2      9
12:         my of    1      5
13:       of This    1      7
14:       of some    2      7
15:   pattern and    1     11
16:   some patter    1     11
17:  some pattern    1     12
18:  text example    2     12
19: the frequency    2     13
20:      to count    2      8
21:       want to    2      7
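Because the result is a data.table, the count for a specific phrase can be looked up directly; a small usage sketch based on the output above:

# Rows for one specific phrase; the output above shows "is my" occurring 3 times.
res[w1w2 == "is my"]

The tokenizers package also offers tokenize_ngrams() as another way to generate word n-grams.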