将语料库分成N个单词的块,每个单词在R中
我需要把一个语料库分成每块约 N 个单词的块。假设这是我的语料库:
corpus <- "I need to break this corpus into chunks of ~3 words each"
要将字符串拆分为每块 N 个单词的块,可以使用 tokenizers::chunk_text():
# Split `corpus` into groups of `chunk` words using tidytext tokenization.
# NOTE(review): `%>%` is not exported by tidytext — this snippet assumes
# magrittr/dplyr is attached elsewhere; verify before running standalone.
library(tidytext)

# Bug fix: as.data.frame(text = corpus) errors — as.data.frame()'s first
# argument is `x`, so `text` is not a valid way to name the column there.
# Build the one-column data frame explicitly instead.
corpus_df <- data.frame(text = corpus, stringsAsFactors = FALSE)

# One row per word, in document order.
tokens <- corpus_df %>% unnest_tokens(word, text)

chunk <- 3          # words per chunk
n <- nrow(tokens)   # total number of words

# Chunk id per word: 1 1 1 2 2 2 ... truncated to exactly n entries.
# seq_len() is safe when the corpus is empty (n == 0), unlike 1:n.
r <- rep(seq_len(ceiling(n / chunk)), each = chunk)[seq_len(n)]

# Named list of data frames, one element per chunk of `chunk` words.
d <- split(tokens, r)
# Example corpus to be split into ~3-word chunks.
corpus <- "I need to break this corpus into chunks of ~3 words each"

library(tokenizers)
library(tidytext)
library(tibble)

# chunk_text() returns a list of character strings, each holding
# chunk_size consecutive words from the input.
chunk_text(corpus, chunk_size = 3)
[[1]]
[1] "i need to"
[[2]]
[1] "break this corpus"
[[3]]
[1] "into chunks of"
[[4]]
[1] "3 words each"
# Step 1: split the corpus into a list of 3-word chunk strings.
chunks <- chunk_text(corpus, chunk_size = 3)

# Step 2: turn that list into a two-column tibble: `group` (chunk id)
# and `text` (the chunk's words).
chunk_tbl <- enframe(chunks, name = "group", value = "text")

# Step 3: expand to one row per word, keeping each word's chunk id.
unnest_tokens(chunk_tbl, word, text)
# A tibble: 12 x 2
group word
<int> <chr>
1 1 i
2 1 need
3 1 to
4 2 break
5 2 this
6 2 corpus
7 3 into
8 3 chunks
9 3 of
10 4 3
11 4 words
12 4 each
# Same pipeline as above, but split the word-level tibble back into a
# list of tibbles, one per chunk.
# Bug fix: group_split() lives in dplyr, which this snippet never
# attaches (only tokenizers/tidytext/tibble are loaded) — qualify the
# call so it resolves regardless of the search path.
corpus %>%
  chunk_text(3) %>%
  enframe(name = "group", value = "text") %>%
  unnest_tokens(word, text) %>%
  dplyr::group_split(group)