将字符串拆分为较小的字符串以在数据帧中创建新行(在R中)
我是一个R新手,目前正在努力解决如何拆分数据帧每行中的字符串,然后用拆分后的字符串创建新行(同时修改原始字符串)。下面是一个示例,但实际数据集要大得多。
标签:r, string, dplyr
library(dplyr)
library(stringr)
library(tidyverse)
library(utils)
# Example input: one row per sentence, identified by element_id and
# sentence_id, with the sentence text and a numeric sentence_wc column
# (presumably the word count of each sentence -- confirm against real data).
posts_sentences <- data.frame("element_id" = c(1, 1, 2, 2, 2), "sentence_id" = c(1, 2, 1, 2, 3),
"sentence" = c("You know, when I grew up, I grew up in a very religious family, I had the same sought of troubles people have, I was excelling in alot of ways, but because there was alot of trouble at home, we were always moving around", "Im at breaking point.I have no one to talk to about this and if I’m honest I think I’m too scared to tell anyone because if I do then it becomes real.I dont know what to do.", "I feel like I’m going to explode.", "I have so many thoughts and feelings inside and I don't know who to tell and I was going to tell my friend about it but I'm not sure.", "I keep saying omg!it's too much"),
"sentence_wc" = c(60, 30, 7, 20, 7), stringsAsFactors=FALSE)
库(dplyr)
图书馆(stringr)
图书馆(tidyverse)
图书馆(utils)
posts_sentences
编辑:我对整个答案进行了编辑,以更详细地解决具体问题。
这并不是完全通用的,因为它假设分组完全基于 element_id。
创建
# Cut `str` after its first `max.words` words.
#
# Word positions come from stringi::stri_locate_all_words(). The result is
# always a length-2 character vector:
#   * c(str, NA)      when `str` has at most `max.words` words;
#   * c(left, right)  otherwise, where `left` ends at the end of word number
#     `max.words` and `right` starts at the beginning of the following word.
# `...` is accepted and ignored so callers can forward extra arguments.
split_too_long <- function(str, max.words=15L, ...) {
  word_bounds <- stringi::stri_locate_all_words(str)[[1L]]

  # Guard clause: short enough already, nothing to cut off.
  if (nrow(word_bounds) <= max.words) {
    return(c(str, NA_character_))
  }

  head_end <- word_bounds[max.words, 2L]
  tail_start <- word_bounds[max.words + 1L, 1L]
  c(substr(str, 1L, head_end), substr(str, tail_start, nchar(str)))
}
# Repeatedly cut `not_done` with split_too_long() until no remainder is left.
#
# `done` holds pieces that were already produced (NULL to start fresh); each
# newly cut-off piece is appended to it. `...` is forwarded unchanged to
# split_too_long() (e.g. `max.words`). Returns the character vector of all
# pieces in order.
recursive_split <- function(not_done, done=NULL, ...) {
  remaining <- not_done
  pieces <- done
  repeat {
    halves <- split_too_long(remaining, ...)
    pieces <- c(pieces, halves[1L])
    # An NA second half means split_too_long() had nothing left to cut off.
    if (is.na(halves[2L])) {
      return(pieces)
    }
    remaining <- halves[2L]
  }
}
# Join all `sentences` into one string (separated by ". "), split that string
# wherever `regex` matches, and then break every resulting fragment into
# pieces with recursive_split(). `...` is forwarded to recursive_split().
# Returns a flat character vector of pieces.
collapse_split <- function(sentences, regex="[.;:] ?", ...) {
  joined <- paste(sentences, collapse=". ")
  fragments <- unlist(strsplit(joined, split=regex))
  pieces <- lapply(
    fragments,
    function(fragment) recursive_split(fragment, done=NULL, ...)
  )
  unlist(pieces)
}
# Rebuild the rows of one element_id group from re-split sentences.
#
# grouped_df: the rows of a single group; must have `sentence` and
#             `element_id` columns.
# ...:        forwarded to collapse_split() (e.g. `regex`, `max.words`).
#
# Returns a data frame with columns sentence, sentence_wc, sentence_id and
# element_id, one row per non-empty piece.
group_fun <- function(grouped_df, ...) {
  # initialize new data frame with new number of rows
  new_df <- data.frame(
    sentence = collapse_split(grouped_df$sentence, ...),
    stringsAsFactors = FALSE
  )
  # count words
  new_df$sentence_wc <- stringi::stri_count_words(new_df$sentence)
  # Drop empty pieces (created by adjacent separators) BEFORE numbering, so
  # that sentence_id is consecutive instead of having gaps.
  new_df <- dplyr::filter(new_df, sentence_wc > 0L)
  # seq_len() is safe when no rows are left (1L:0L would yield c(1, 0)).
  new_df$sentence_id <- seq_len(nrow(new_df))
  # element_id is the grouping variable, so every row of the group shares it;
  # rep() keeps the assignment valid for a zero-row result as well.
  new_df$element_id <- rep(grouped_df$element_id[1L], nrow(new_df))
  new_df
}
# Apply group_fun() to each element_id group: the group's sentences are
# re-split on ".", ";", ":" or "!" (plus an optional space) and max.words=5L
# is passed down as the word limit per piece.
out <- posts_sentences %>%
group_by(element_id) %>%
do(group_fun(., max.words=5L, regex="[.;:!] ?"))
这里有一个 tidyverse 方法,允许您指定自己的启发式规则,我认为这最适合您的情况。关键是使用 pmap 为每一行创建一个列表元素,然后在必要时使用 map_if 进行拆分。在我看来,这种情况很难单独用 dplyr 处理,因为操作过程中会增加行,所以 rowwise 很难使用。
split_too_long() 的结构基本上是:
使用dplyr::mutate
和tokenizers::count_words
获取每个句子的字数
使用 purrr::pmap,使每一行成为列表的一个元素(pmap 将数据帧作为列的列表接收)
使用purrr::map_if
检查字数是否大于所需限制
如果满足上述条件,则使用tidyr::separate_rows
将句子拆分为多行
然后用新的字数替换字数,并用过滤器
(由加倍分隔符创建)删除所有空行
当我们意识到元素需要进一步拆分时,我们可以将其应用于不同的分离器。在这里,我使用与您提到的启发式对应的这些模式:
"[\\.\\?\\!] ?"
匹配 .、? 或 ! 中的任意一个,以及一个可选空格
", ?(?=[:upper:])"
匹配大写字母之前的 , 和可选空格
"and ?(?=[:upper:])"
匹配大写字母之前的 and 和可选空格
它正确地返回了与预期输出相同的分句。sentence_id 可以很容易地在最后通过行号添加,多余的前导/尾随空格可以用 stringr::str_trim 删除。
注意事项:
- 我写这篇文章是为了在探索性分析中的可读性,因此每次都会分成列表并绑定在一起。如果您提前决定需要什么分隔符,您可以将它放在一个
map
步骤中,这可能会加快速度,尽管我没有在大型数据集上对此进行分析
- 根据评论,在这些拆分之后仍然有超过15个单词的句子。您必须决定要拆分的其他符号/正则表达式的长度
- 目前,列名被硬编码在 split_too_long 中。如果能够在函数调用中指定列名对您很重要,我建议您查阅 “Programming with dplyr” vignette(应该只需少量调整即可实现)
posts\u句子%
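pmap(function(...) tibble(...)) %>%
map_if(
.p = ~ .$wc > max_length,
.f = ~ separate_rows(., sentence, sep = regexp)
) %>%
bind_rows() %>%
mutate(wc = count_words(sentence)) %>%
filter(wc != 0)
}
posts_sentences %>%
group_by(element_id) %>%
summarise(sentence = str_c(sentence, collapse = ".")) %>%
ungroup() %>%
split_too_long("[\\.\\?\\!] ?", 15) %>%
split_too_long(", ?(?=[:upper:])", 15) %>%
split_too_long("and ?(?=[:upper:])", 15) %>%
group_by(element_id) %>%
mutate(
sentence = str_trim(sentence),
sentence_id = row_number()
) %>%
select(element_id, sentence_id, sentence, wc)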
#>#A tibble:13 x 4
#>#组:元素_id[2]
#>元素\u id语句\u id语句wc
#>
#>你知道,当我长大的时候
#>2 1 2我在一个非常虔诚的家庭中长大8
#>3 1 3我也遇到过同样的麻烦~9
#>4 1 4我在很多方面都很出色,但是贝卡
#>5 1 5 Im在断点4处
#>6 1 6关于这件事我没有人可以谈,我
#>我不知道该怎么办
#>8 2 1我觉得我要爆炸了7
#>9 2我的思想和感情太多了
#>我不知道该告诉谁
#>11 2 4我本来打算把这件事告诉我的朋友的
#>12 2 5我一直在说天哪4
#>太多了
由 reprex 包 (v0.2.0) 于 2018-05-21 创建。
备选的 tidyverse 解决方案:
library(dplyr)
library(tidyr)
library(stringr)
library(tidyverse)
library(utils)
# Split one sentence row with the next heuristic separator if it is too long.
#
# Designed to be called once per row through purrr::pmap_dfr(), which passes
# each column of the row as a named scalar argument.
#
# element_id, sentence_id, sentence, sentence_wc: the row's original columns.
# word_count: current number of words in `sentence`.
# attmpt:     number of split attempts already made on this row (starts at 0).
# max_words:  rows with at most this many words are returned unchanged
#             (defaults to 15, the previously hard-coded limit).
#
# Returns a data frame: either the single unchanged row, or one row per piece
# after splitting with separator number `attmpt + 1`, with `word_count`
# recomputed and `attmpt` incremented.
check_and_split <- function(element_id, sentence_id, sentence, sentence_wc,
                            word_count, attmpt, max_words = 15L) {
  # Separators tried in order: period, comma before "I", "and" before a
  # capital letter.
  methods <- c("\\.", ",\\s?(?=[I])", "and\\s?(?=[A-Z])")
  df <- data.frame(
    element_id = element_id,
    sentence_id = sentence_id,
    sentence = sentence,
    sentence_wc = sentence_wc,
    word_count = word_count,
    attmpt = attmpt,
    stringsAsFactors = FALSE
  )
  # Scalar condition, so use short-circuit `||`. A missing word count (e.g. an
  # NA sentence) cannot be split further and is returned as-is instead of
  # raising an error in `if`. The attempt cap follows the separator list.
  if (is.na(word_count) || word_count <= max_words ||
      attmpt >= length(methods)) {
    return(df) # early return
  }
  df %>%
    tidyr::separate_rows(sentence, sep = methods[attmpt + 1]) %>%
    mutate(
      word_count = str_count(sentence, "\\w+"),
      attmpt = attmpt + 1
    )
}
# Count the words of every sentence, start the attempt counter at 0, then run
# three passes of check_and_split() over the rows; pmap_dfr() calls it once
# per row and row-binds the returned data frames.
posts_sentences %>%
mutate(word_count=str_count(sentence,'\\w+'),
attmpt=0) %>%
pmap_dfr(check_and_split) %>%
pmap_dfr(check_and_split) %>%
pmap_dfr(check_and_split)
库(dplyr)
图书馆(tidyr)
图书馆(stringr)
图书馆(tidyverse)
图书馆(utils)
检查和拆分%
变异(单词计数=str计数(句子“\\w+”),
attmpt=0)%>%
pmap_dfr(检查_和_分割)%>%
pmap_dfr(检查_和_分割)%>%
pmap_dfr(检查_和_分割)
在这里,我们创建了一个辅助函数,它接收一行(按列拆开,由 purrr::pmap() 提供),我们将其重新组装成数据帧,检查单词数是否超过 15,以及此前对该句子的拆分尝试次数。然后我们使用 tidyr::separate_rows()
w
library(dplyr)
library(tidyr)
library(stringr)
library(tidyverse)
library(utils)
# Split one sentence row with the next heuristic separator if it is too long.
#
# Designed to be called once per row through purrr::pmap_dfr(), which passes
# each column of the row as a named scalar argument.
#
# element_id, sentence_id, sentence, sentence_wc: the row's original columns.
# word_count: current number of words in `sentence`.
# attmpt:     number of split attempts already made on this row (starts at 0).
# max_words:  rows with at most this many words are returned unchanged
#             (defaults to 15, the previously hard-coded limit).
#
# Returns a data frame: either the single unchanged row, or one row per piece
# after splitting with separator number `attmpt + 1`, with `word_count`
# recomputed and `attmpt` incremented.
check_and_split <- function(element_id, sentence_id, sentence, sentence_wc,
                            word_count, attmpt, max_words = 15L) {
  # Separators tried in order: period, comma before "I", "and" before a
  # capital letter.
  methods <- c("\\.", ",\\s?(?=[I])", "and\\s?(?=[A-Z])")
  df <- data.frame(
    element_id = element_id,
    sentence_id = sentence_id,
    sentence = sentence,
    sentence_wc = sentence_wc,
    word_count = word_count,
    attmpt = attmpt,
    stringsAsFactors = FALSE
  )
  # Scalar condition, so use short-circuit `||`. A missing word count (e.g. an
  # NA sentence) cannot be split further and is returned as-is instead of
  # raising an error in `if`. The attempt cap follows the separator list.
  if (is.na(word_count) || word_count <= max_words ||
      attmpt >= length(methods)) {
    return(df) # early return
  }
  df %>%
    tidyr::separate_rows(sentence, sep = methods[attmpt + 1]) %>%
    mutate(
      word_count = str_count(sentence, "\\w+"),
      attmpt = attmpt + 1
    )
}
# Count the words of every sentence, start the attempt counter at 0, then run
# three passes of check_and_split() over the rows; pmap_dfr() calls it once
# per row and row-binds the returned data frames.
posts_sentences %>%
mutate(word_count=str_count(sentence,'\\w+'),
attmpt=0) %>%
pmap_dfr(check_and_split) %>%
pmap_dfr(check_and_split) %>%
pmap_dfr(check_and_split)
# Example input: one row per sentence, identified by element_id and
# sentence_id, with the sentence text and a numeric sentence_wc column.
posts_sentences <- data.frame("element_id" = c(1, 1, 2, 2, 2), "sentence_id" = c(1, 2, 1, 2, 3),
"sentence" = c("You know, when I grew up, I grew up in a very religious family, I had the same sought of troubles people have, I was excelling in alot of ways, but because there was alot of trouble at home, we were always moving around", "Im at breaking point.I have no one to talk to about this and if I’m honest I think I’m too scared to tell anyone because if I do then it becomes real.I dont know what to do.", "I feel like I’m going to explode.", "I have so many thoughts and feelings inside and I don't know who to tell and I was going to tell my friend about it but I'm not sure.", "I keep saying omg!it's too much"),
"sentence_wc" = c(60, 30, 7, 20, 7), stringsAsFactors=FALSE)
# Empty data frame that accumulates the result; the per-sentence data frames
# built in the loop below are appended to it with rbind().
new_posts_sentences <- data.frame(element_id = as.numeric(),
sentence_id =as.numeric(),
sentence = character(),
sentence_wc = as.numeric(), stringsAsFactors=FALSE)
# Fragments whose approximate word count exceeds this limit are split further.
limit_words <- 15 # 15 for this data set
# Zero-based index of the input row being processed; used to look up its
# element_id.
countSentences <- 0
# Process every original sentence (third column of the input) on its own.
for (sentence in posts_sentences[,3]) {
# `vector` collects the fragments produced for the current original sentence.
vector <- character()
Velement_id <- posts_sentences$element_id[countSentences + 1]
vector <- c(vector, sentence) # start with the unsplit sentence as the only fragment
vector <- vector[!vector %in% ''] # remove empty elements from vector
## Level 1: if the sentence is too long, split it where ":" or "," is followed
## by whitespace and a capital letter.
## The length test counts matches of a character in the range A-z followed by
## one or more non-word characters and uses that as an approximate word count.
## NOTE(review): in ASCII the range A-z also spans the punctuation characters
## between "Z" and "a", and a final word without trailing punctuation is not
## counted -- confirm this approximation is acceptable.
if(lengths(gregexpr("[A-z]\\W+", sentence)) > limit_words ){
# Replace the long sentence by its pieces: drop it from `vector` first.
vector <- vector[!vector %in% sentence]
split_points <- unlist(gregexpr("[:,:]\\s[A-Z]", sentence)) # To get the character position
## Each piece ends one character before a match and the next piece starts two
## characters after it, which skips the punctuation mark and the whitespace.
sentences_1 <- substring(sentence, c(1, split_points + 2), c(split_points -1, nchar(sentence)))
# NOTE(review): the inner loops reuse the name `sentence`, overwriting the
# outer loop variable's value for the rest of this iteration.
for(sentence in sentences_1){
vector <- c(vector, sentence)
vector <- vector[!vector %in% '']
## Level 2: a piece that is still too long is split at every ":", "," or ".".
if(lengths(gregexpr("[A-z]\\W+", sentence)) > limit_words){
vector <- vector[!vector %in% sentence]
split_points <- unlist(gregexpr("[:,:]|[:.:]", sentence))
sentences_2 <- substring(sentence, c(1, split_points + 1), c(split_points -1, nchar(sentence)))
## Level 3: a piece that is still over the limit is split in front of every
## capital letter.
for(sentence in sentences_2){
vector <- c(vector, sentence)
vector <- vector[!vector %in% '']
if(lengths(gregexpr("[A-z]\\W+", sentence)) > limit_words){
vector <- vector[!vector %in% sentence]
split_points <- unlist(gregexpr("[A-Z]", sentence))
sentences_3 <- substring(sentence,c(1, split_points), c(split_points -1, nchar(sentence)))
vector <- c(vector, sentences_3)
vector <- vector[!vector %in% '']
}
}
}
}
}
## Build a data frame for the fragments of this original sentence:
## the same element_id on every row and sentence_id restarting at 1.
element_id <- rep(Velement_id, length(vector))
sentence_id <- 1:length(vector)
# NOTE(review): the counts are accumulated in a character vector, so the
# sentence_wc column holds strings rather than numbers.
sentence_wc <- character()
for (element in vector){sentence_wc <- c(sentence_wc, (lengths(gregexpr("[A-z]\\W+", element)))) }
# The fragment column takes the name `vector` here, not `sentence`.
sentenceDataFrame <- data.frame(element_id, sentence_id, vector, sentence_wc)
## Append this sentence's rows to the final data frame.
## NOTE(review): rbind() inside a loop copies the accumulated data frame on
## every iteration; consider collecting the pieces in a list and binding once.
new_posts_sentences <- rbind(new_posts_sentences, sentenceDataFrame)
countSentences <- countSentences + 1
}
print(new_posts_sentences)
element_id sentence_id vector sentence_wc
1 1 1 You know, when I grew up 5
2 1 2 I grew up in a very religious family 7
3 1 3 I had the same sought of troubles people have 8
4 1 4 I was excelling in alot of ways 6
5 1 5 but because there was alot of trouble at home 8
6 1 6 we were always moving around 4
7 1 1 Im at breaking point 3
8 1 2 I have no one to talk to about this and if 11
9 1 3 I’m honest 3
10 1 4 I think 2
11 1 5 I’m too scared to tell anyone because if 9
12 1 6 I do then it becomes real 5
13 1 7 I dont know what to do 5
14 2 1 I feel like I’m going to explode. 8
15 2 1 I have so many thoughts and feelings inside and 9
16 2 2 I don't know who to tell and 8
17 2 3 I was going to tell my friend about it but 10
18 2 4 I'm not sure 3
19 2 1 I keep saying omg!it's too much 7
# Split every sentence at periods, then split the fragments that still have
# more than 15 words at a comma followed by whitespace and "I".
sentences_split <- posts_sentences %>%
  mutate(text_split = str_split(sentence, pattern = "\\.")) %>%
  unnest(text_split) %>%
  # Count number of words in text_split
  mutate(wc_split = str_count(text_split, "\\w+")) %>%
  filter(wc_split != 0) %>%
  # Split again if text_split has > 15 words. The lookahead (?=I) keeps the
  # "I" at the start of the next fragment; the previous pattern ",\\sI"
  # consumed it. as.list() makes both branches lists, so the result is a list
  # column even when no fragment exceeds the limit.
  mutate(text_split_again = ifelse(
    wc_split > 15,
    str_split(text_split, pattern = ",\\s(?=I)"),
    as.list(text_split)
  )) %>%
  unnest(text_split_again)