R 将字符串拆分为多列(按特定顺序)

R 将字符串拆分为多列(按特定顺序),r,R,我收到的数据格式不是很好,我无法在上游更改它们。有一列需要重新排序,并根据某些关键字拆分为10+个其他列 下面是我收到的一个数据示例——对于每个人,他们选择了3种不同的食物。他们对每种食物类别food1、food2、food3的选择都紧跟在正文之后: list1 <- c(' food1 pasta food2 apple food3 carrot ') list2 <- c(' food2 banana food3 cucumber food1 brown rice ') list

我收到的数据格式不是很好,我无法在上游更改它们。有一列需要重新排序,并根据某些关键字拆分为10+个其他列

下面是我收到的一个数据示例——对于每个人,他们选择了3种不同的食物。他们对每种食物类别(food1、food2、food3)的选择都紧跟在文本中对应的类别关键字之后:

# One raw string per person: every "foodN" keyword is followed by the item
# chosen for that category, and the keywords may appear in any order.
list1 <- " food1 pasta food2 apple food3 carrot "
list2 <- " food2 banana food3 cucumber food1 brown rice "
list3 <- " food3 bell pepper food2 plum food1 bread "

# Person is stored as character ("1", "2", "3"), Choices holds the raw strings.
foodListDF <- data.frame(
  Person = as.character(c(1, 2, 3)),
  Choices = c(list1, list2, list3),
  stringsAsFactors = FALSE
)
foodListDF


  Person                                        Choices
1      1          food1 pasta food2 apple food3 carrot 
2      2  food2 banana food3 cucumber food1 brown rice 
3      3      food3 bell pepper food2 plum food1 bread 
我知道我可以通过这样的方式进行选择:

library(stringr)
# Split on any of the three category keywords. Piece 1 is the text before the
# first keyword, so only pieces 2-4 are kept; they stay in order of appearance
# and are therefore not aligned with food1/food2/food3.
as.data.frame(
  str_split_fixed(
    string = foodListDF$Choices,
    pattern = " food1 | food2 | food3 ",
    n = 4
  )
)[, 2:4]
           V2       V3          V4
1       pasta    apple     carrot 
2      banana cucumber brown rice 
3 bell pepper     plum      bread 
但这显然并没有将他们分成适当的组/顺序,这是非常必要的


我实在想不出如何为每个人从对应的类别中提取出对应的食物。有什么想法吗?

您可以分别提取食物编号(t1)和食物项目(t2),将它们连接在一起,对列表列执行unnest(展开),并将其转换为宽格式

library(dplyr)
library(tidyr)

# Step 1: per person, collect the category keywords ("food1", "food2", ...) in
# the order they appear in the raw string (stored as a list column).
t1 <- foodListDF %>%
  mutate(food = stringr::str_extract_all(Choices, "food\\d+")) %>%
  select(-Choices)

# Step 2: per person, collect the chosen items in the same order by splitting
# on the keywords. Pieces are trimmed *before* the empty ones are dropped, so
# the text in front of the first keyword is discarded whether it is "", " " or
# several blanks (comparing against " " only worked for exactly one blank).
t2 <- foodListDF %>%
  separate_rows(Choices, sep = "food\\d+") %>%
  mutate(Choices = trimws(Choices)) %>%
  filter(Choices != "") %>%
  group_by(Person) %>%
  summarise(col = list(Choices))

# Step 3: line the keywords up with their items, expand both list columns in
# parallel, and spread to one column per keyword.
inner_join(t1, t2, by = "Person") %>%
  unnest(c(food, col)) %>%
  pivot_wider(names_from = food, values_from = col)

#  Person food1      food2  food3      
#  <chr>  <chr>      <chr>  <chr>      
#1 1      pasta      apple  carrot     
#2 2      brown rice banana cucumber   
#3 3      bread      plum   bell pepper
基尔 这里有两种基本的R方法,都涉及regmatches和gregexpr

第一种方法使用unstack。它会产生一个data.frame

splitstackshape+data.table 另一个选择是使用splitstackshape包中的cSplit,配合一些非常简单的gsub处理,然后使用dcast转换为宽格式

library(splitstackshape)
# library(data.table) # if required

# Helper: split "foodN item" strings into their two parts.
# Returns an unnamed list of two character vectors: the keywords ("foodN") and
# the items that follow them. Strings that do not match are returned unchanged
# in both elements.
fun <- function(string) {
  pair_pattern <- "(food\\d+) (.*)"
  lapply(
    c("\\1", "\\2"),
    function(group) gsub(pair_pattern, group, string)
  )
}

# Put a comma in front of every keyword except the first, so each
# "foodN item" pair can be split onto its own row.
choices_dt <- as.data.table(foodListDF)
choices_dt[, Choices := gsub(" food", ",food", trimws(Choices))]
pairs_dt <- cSplit(choices_dt, "Choices", sep = ",", direction = "long")
# fun() returns two unnamed vectors, which data.table names V1 (keyword) and
# V2 (item); dcast then spreads the keywords into columns.
key_item_dt <- pairs_dt[, fun(Choices), Person]
dcast(key_item_dt, Person ~ V1, value.var = "V2")
#    Person      food1  food2       food3
# 1:      1      pasta  apple      carrot
# 2:      2 brown rice banana    cucumber
# 3:      3      bread   plum bell pepper
dplyr+tidyr 将上述内容改编为dplyr+tidyr,您可以尝试:

library(dplyr)
library(tidyr)

# Insert a comma ahead of every keyword after the first, then reshape:
# one row per "foodN item" pair -> keyword/item columns -> one column per keyword.
delimited <- mutate(
  foodListDF,
  Choices = gsub(" food", ",food", trimws(Choices))
)
long_pairs <- separate_rows(delimited, Choices, sep = ",")
# extra = "merge" keeps multi-word items such as "brown rice" in one piece.
key_value <- separate(
  long_pairs,
  Choices,
  into = c("var", "val"),
  extra = "merge"
)
pivot_wider(key_value, names_from = var, values_from = val)
# # A tibble: 3 x 4
#   Person food1      food2  food3      
#   <chr>  <chr>      <chr>  <chr>      
# 1 1      pasta      apple  carrot     
# 2 2      brown rice banana cucumber   
# 3 3      bread      plum   bell pepper

用一个较为繁琐(convoluted)的base R表达式实现:

# Single nested base-R expression. Reading inside-out: each Choices string is
# tokenised on whitespace, the "foodN" keywords are paired with the text found
# between them, each row is reordered by keyword, and the rows are bound next
# to Person.
data.frame(cbind(Person = foodListDF$Person, 
    # Map step: name the items (x[[2]]) after their keywords (x[[1]]), then
    # reorder them using order() on the keywords.
    do.call("rbind", Map(function(x){y <- setNames(x[[2]], x[[1]]); y[order(x[[1]])]},
      lapply(strsplit(foodListDF$Choices, "\\s+"), function(x) {
      # res: the tokens matching food\d+, held as a one-row data.frame.
      res <- data.frame(t(grep("food\\d+", x, value = TRUE)), stringsAsFactors = FALSE)
      # res2: swap every keyword token for "&&", drop empty tokens, glue the
      # rest with single spaces, strip the leading "&&", then split on "&&" so
      # that multi-word items (e.g. "brown rice") stay in one piece.
      res2 <- unlist(strsplit(gsub("^&&\\s*", "", 
                paste0(Filter(function(y){y != ""}, Vectorize(gsub)("food\\d+", "&&", x)), 
                       collapse = " ")), "\\s*&&\\s*"))
      list(res, res2)
        }
      )
    )
  )
), stringsAsFactors = FALSE)
您可以在strsplit中使用food作为分隔符,对结果进行排序,去掉每段开头的食物编号,并将结果写回数据集

# Assumes every row contains each of food1..food3 exactly once.
# Splitting on "food" leaves pieces such as "1 pasta "; the first piece is the
# text before the first keyword and is dropped.
foodListDF[paste0("food", 1:3)] <- do.call(rbind, lapply(
  strsplit(foodListDF$Choices, "food"),
  function(x) {
    pieces <- x[-1]
    # Order by the numeric category id rather than lexically, so multi-digit
    # ids (food10, food11, ...) would still sort after food2.
    ids <- as.integer(sub("^(\\d+).*$", "\\1", pieces))
    # Strip the whole leading id (not just one character) and the padding.
    trimws(sub("^\\d+", "", pieces[order(ids)]))
  }
))
foodListDF[-2]
#  Person      food1  food2       food3
#1      1      pasta  apple      carrot
#2      2 brown rice banana    cucumber
#3      3      bread   plum bell pepper
或者,如果并非每一行都包含所有类别:

# Variant for data where some rows lack some categories: those cells become NA.
# j: every "foodN" keyword seen in any row, ordered by its numeric suffix so
# that food10 comes after food2; k: the matching suffixes as strings.
j <- unique(unlist(regmatches(foodListDF$Choices,
                              gregexpr("food\\d+", foodListDF$Choices))))
j <- j[order(as.integer(sub("food", "", j)))]
k <- sub("food", "", j)
foodListDF[j] <- do.call(rbind, lapply(
  strsplit(foodListDF$Choices, "food"),
  function(x) {
    # Leading digits of each piece identify its category ("" if there are none).
    ids <- sub("^(\\d*).*$", "\\1", x)
    # Exact match() instead of charmatch(): a partial match of "1" against both
    # "1 ..." and "10 ..." is ambiguous and would yield no value at all.
    trimws(sub("^\\d+", "", x[match(k, ids)]))
  }
))
foodListDF[-2]
#  Person      food1  food2       food3
#1      1      pasta  apple      carrot
#2      2 brown rice banana    cucumber
#3      3      bread   plum bell pepper

非常感谢。我想我理解了大致思路,但在最后的inner_join步骤中遇到了以下错误:mutate_impl(.data, dots) 中的错误:列 c(food, col) 的长度必须为3(行数)或1,而不是6。——您是在帖子中共享的相同数据上使用它,还是在不同的数据集上?——我刚刚打开了一个新的RStudio会话,从这篇文章中复制了创建foodListDF的代码,然后按照您的代码操作,得到了同样的错误。——我刚做了同样的操作,它对我有效。也许是软件包版本的问题:我这里 packageVersion('dplyr') 为 '1.0.2',packageVersion('tidyr') 为 '1.1.2'。
library(dplyr)
library(tidyr)

# Insert a comma ahead of every keyword after the first, then reshape:
# one row per "foodN item" pair -> keyword/item columns -> one column per keyword.
delimited <- mutate(
  foodListDF,
  Choices = gsub(" food", ",food", trimws(Choices))
)
long_pairs <- separate_rows(delimited, Choices, sep = ",")
# extra = "merge" keeps multi-word items such as "brown rice" in one piece.
key_value <- separate(
  long_pairs,
  Choices,
  into = c("var", "val"),
  extra = "merge"
)
pivot_wider(key_value, names_from = var, values_from = val)
# # A tibble: 3 x 4
#   Person food1      food2  food3      
#   <chr>  <chr>      <chr>  <chr>      
# 1 1      pasta      apple  carrot     
# 2 2      brown rice banana cucumber   
# 3 3      bread      plum   bell pepper
# Single nested base-R expression. Reading inside-out: each Choices string is
# tokenised on whitespace, the "foodN" keywords are paired with the text found
# between them, each row is reordered by keyword, and the rows are bound next
# to Person.
data.frame(cbind(Person = foodListDF$Person, 
    # Map step: name the items (x[[2]]) after their keywords (x[[1]]), then
    # reorder them using order() on the keywords.
    do.call("rbind", Map(function(x){y <- setNames(x[[2]], x[[1]]); y[order(x[[1]])]},
      lapply(strsplit(foodListDF$Choices, "\\s+"), function(x) {
      # res: the tokens matching food\d+, held as a one-row data.frame.
      res <- data.frame(t(grep("food\\d+", x, value = TRUE)), stringsAsFactors = FALSE)
      # res2: swap every keyword token for "&&", drop empty tokens, glue the
      # rest with single spaces, strip the leading "&&", then split on "&&" so
      # that multi-word items (e.g. "brown rice") stay in one piece.
      res2 <- unlist(strsplit(gsub("^&&\\s*", "", 
                paste0(Filter(function(y){y != ""}, Vectorize(gsub)("food\\d+", "&&", x)), 
                       collapse = " ")), "\\s*&&\\s*"))
      list(res, res2)
        }
      )
    )
  )
), stringsAsFactors = FALSE)
# Assumes every row contains each of food1..food3 exactly once.
# Splitting on "food" leaves pieces such as "1 pasta "; the first piece is the
# text before the first keyword and is dropped.
foodListDF[paste0("food", 1:3)] <- do.call(rbind, lapply(
  strsplit(foodListDF$Choices, "food"),
  function(x) {
    pieces <- x[-1]
    # Order by the numeric category id rather than lexically, so multi-digit
    # ids (food10, food11, ...) would still sort after food2.
    ids <- as.integer(sub("^(\\d+).*$", "\\1", pieces))
    # Strip the whole leading id (not just one character) and the padding.
    trimws(sub("^\\d+", "", pieces[order(ids)]))
  }
))
foodListDF[-2]
#  Person      food1  food2       food3
#1      1      pasta  apple      carrot
#2      2 brown rice banana    cucumber
#3      3      bread   plum bell pepper
# Variant for data where some rows lack some categories: those cells become NA.
# j: every "foodN" keyword seen in any row, ordered by its numeric suffix so
# that food10 comes after food2; k: the matching suffixes as strings.
j <- unique(unlist(regmatches(foodListDF$Choices,
                              gregexpr("food\\d+", foodListDF$Choices))))
j <- j[order(as.integer(sub("food", "", j)))]
k <- sub("food", "", j)
foodListDF[j] <- do.call(rbind, lapply(
  strsplit(foodListDF$Choices, "food"),
  function(x) {
    # Leading digits of each piece identify its category ("" if there are none).
    ids <- sub("^(\\d*).*$", "\\1", x)
    # Exact match() instead of charmatch(): a partial match of "1" against both
    # "1 ..." and "10 ..." is ambiguous and would yield no value at all.
    trimws(sub("^\\d+", "", x[match(k, ids)]))
  }
))
foodListDF[-2]
#  Person      food1  food2       food3
#1      1      pasta  apple      carrot
#2      2 brown rice banana    cucumber
#3      3      bread   plum bell pepper