R 将字符串拆分为多列(按特定顺序)
我收到的数据格式不是很好,我无法在上游更改它们。有一列需要重新排序,并根据某些关键字拆分为10+个其他列 下面是我收到的一个数据示例——对于每个人,他们选择了3种不同的食物。他们对每种食物类别food1、food2、food3的选择都紧跟在正文之后:R 将字符串拆分为多列(按特定顺序),r,R,我收到的数据格式不是很好,我无法在上游更改它们。有一列需要重新排序,并根据某些关键字拆分为10+个其他列 下面是我收到的一个数据示例——对于每个人,他们选择了3种不同的食物。他们对每种食物类别food1、food2、food3的选择都紧跟在正文之后: list1 <- c(' food1 pasta food2 apple food3 carrot ') list2 <- c(' food2 banana food3 cucumber food1 brown rice ') list
list1 <- c(' food1 pasta food2 apple food3 carrot ')
list2 <- c(' food2 banana food3 cucumber food1 brown rice ')
list3 <- c(' food3 bell pepper food2 plum food1 bread ')
foodListDF <- as.data.frame(matrix(c(1,2,3, list1, list2, list3), nrow = 3), stringsAsFactors = FALSE)
colnames(foodListDF) <- c('Person', 'Choices')
foodListDF
Person Choices
1 1 food1 pasta food2 apple food3 carrot
2 2 food2 banana food3 cucumber food1 brown rice
3 3 food3 bell pepper food2 plum food1 bread
我知道我可以通过这样的方式进行选择:
library(stringr)
as.data.frame(str_split_fixed(foodListDF$Choices, c(' food1 | food2 | food3 '), 4))[,2:4]
V2 V3 V4
1 pasta apple carrot
2 banana cucumber brown rice
3 bell pepper plum bread
但这显然并没有将他们分成适当的组/顺序,这是非常必要的
我真的在努力思考如何从合适的人群中为每个人提取合适的食物。有什么想法吗?您可以分别提取食物编号和食物项目t1和t2,将它们连接在一起,取消对数据的测试,并将其转换为广泛的格式
library(dplyr)
library(tidyr)
foodListDF %>%
mutate(food = stringr::str_extract_all(Choices, 'food\\d+')) %>%
select(-Choices) -> t1
foodListDF %>%
separate_rows(Choices, sep = 'food\\d+') %>%
filter(Choices != ' ') %>%
mutate(Choices = trimws(Choices)) %>%
group_by(Person) %>%
summarise(col = list(Choices)) -> t2
inner_join(t1, t2, by = 'Person') %>%
unnest(c(food, col)) %>%
pivot_wider(names_from = food, values_from = col)
# Person food1 food2 food3
# <chr> <chr> <chr> <chr>
#1 1 pasta apple carrot
#2 2 brown rice banana cucumber
#3 3 bread plum bell pepper
基尔
这里有两种基本的R方法,都涉及regmatches和gregexpr
第一种方法使用unstack。它会产生一个data.frame
splitstackshape+data.table
另一个选择是使用splitstackshape包中的cSplit以及一些非常简单的gsub工作,然后使用dcast来实现更广泛的形式
library(splitstackshape)
# library(data.table) # if required
# Basic helper function
fun <- function(string) {
list(gsub("(food\\d+) (.*)", "\\1", string),
gsub("(food\\d+) (.*)", "\\2", string))
}
cSplit(as.data.table(foodListDF)[, Choices := gsub(" food", ",food", trimws(Choices))],
"Choices", ",", "long")[, fun(Choices), Person][, dcast(.SD, Person ~ V1, value.var = "V2")]
# Person food1 food2 food3
# 1: 1 pasta apple carrot
# 2: 2 brown rice banana cucumber
# 3: 3 bread plum bell pepper
dplyr+tidyr
将上述内容改编为dplyr+tidyr,您可以尝试:
library(dplyr)
library(tidyr)
foodListDF %>%
mutate(Choices = gsub(" food", ",food", trimws(Choices))) %>%
separate_rows(Choices, sep = ",") %>%
separate(Choices, c("var", "val"), extra = "merge") %>%
pivot_wider(names_from = var, values_from = val)
# # A tibble: 3 x 4
# Person food1 food2 food3
# <chr> <chr> <chr> <chr>
# 1 1 pasta apple carrot
# 2 2 brown rice banana cucumber
# 3 3 bread plum bell pepper
在一个卷积基R表达式中:
data.frame(cbind(Person = foodListDF$Person,
do.call("rbind", Map(function(x){y <- setNames(x[[2]], x[[1]]); y[order(x[[1]])]},
lapply(strsplit(foodListDF$Choices, "\\s+"), function(x) {
res <- data.frame(t(grep("food\\d+", x, value = TRUE)), stringsAsFactors = FALSE)
res2 <- unlist(strsplit(gsub("^&&\\s*", "",
paste0(Filter(function(y){y != ""}, Vectorize(gsub)("food\\d+", "&&", x)),
collapse = " ")), "\\s*&&\\s*"))
list(res, res2)
}
)
)
)
), stringsAsFactors = FALSE)
您可以在strsplit中使用food作为分隔符,对结果进行排序,删除带有子字符串的第一个字符,并将结果返回到数据集
foodListDF[paste0("food",1:3)] <- t(sapply(strsplit(foodListDF$Choices, "food"),
function (x) trimws(substring(sort(x[-1]), 2))))
foodListDF[-2]
# Person food1 food2 food3
#1 1 pasta apple carrot
#2 2 brown rice banana cucumber
#3 3 bread plum bell pepper
或者,如果并非所有时间都存在所有级别:
j <- sort(unique(unlist(regmatches(foodListDF$Choices, gregexpr("food\\d+",
foodListDF$Choices)))))
k <- sub("food", "", j)
foodListDF[j] <- t(sapply(strsplit(foodListDF$Choices, "food"), function(x)
trimws(sub("^\\d+", "", x[charmatch(k, x)]))))
foodListDF[-2]
# Person food1 food2 food3
#1 1 pasta apple carrot
#2 2 brown rice banana cucumber
#3 3 bread plum bell pepper
非常感谢。我想我遵循了一般原则,但我在最终的内部联接中遇到了以下错误:mutate\u impl.data中的错误,dots:Column cfood,col`必须是3行数或1行数,而不是6`您是在共享的相同数据或不同的数据集上使用它吗?刚刚打开了一个新的RStudio会话,从这篇文章中复制了我制作foodListDF的数据,然后按照你的代码进行操作,我得到了同样的错误。我只是做了同样的操作,它对我有效。也许我们有软件包版本问题。packageVersion'dplyr'[1]“1.0.2”和packageVersion'tidyr'[1]“1.1.2”
library(dplyr)
library(tidyr)
foodListDF %>%
mutate(Choices = gsub(" food", ",food", trimws(Choices))) %>%
separate_rows(Choices, sep = ",") %>%
separate(Choices, c("var", "val"), extra = "merge") %>%
pivot_wider(names_from = var, values_from = val)
# # A tibble: 3 x 4
# Person food1 food2 food3
# <chr> <chr> <chr> <chr>
# 1 1 pasta apple carrot
# 2 2 brown rice banana cucumber
# 3 3 bread plum bell pepper
data.frame(cbind(Person = foodListDF$Person,
do.call("rbind", Map(function(x){y <- setNames(x[[2]], x[[1]]); y[order(x[[1]])]},
lapply(strsplit(foodListDF$Choices, "\\s+"), function(x) {
res <- data.frame(t(grep("food\\d+", x, value = TRUE)), stringsAsFactors = FALSE)
res2 <- unlist(strsplit(gsub("^&&\\s*", "",
paste0(Filter(function(y){y != ""}, Vectorize(gsub)("food\\d+", "&&", x)),
collapse = " ")), "\\s*&&\\s*"))
list(res, res2)
}
)
)
)
), stringsAsFactors = FALSE)
foodListDF[paste0("food",1:3)] <- t(sapply(strsplit(foodListDF$Choices, "food"),
function (x) trimws(substring(sort(x[-1]), 2))))
foodListDF[-2]
# Person food1 food2 food3
#1 1 pasta apple carrot
#2 2 brown rice banana cucumber
#3 3 bread plum bell pepper
j <- sort(unique(unlist(regmatches(foodListDF$Choices, gregexpr("food\\d+",
foodListDF$Choices)))))
k <- sub("food", "", j)
foodListDF[j] <- t(sapply(strsplit(foodListDF$Choices, "food"), function(x)
trimws(sub("^\\d+", "", x[charmatch(k, x)]))))
foodListDF[-2]
# Person food1 food2 food3
#1 1 pasta apple carrot
#2 2 brown rice banana cucumber
#3 3 bread plum bell pepper