R 仅当子字符串是另一个向量的元素时,才使用逗号拆分字符串

R 仅当子字符串是另一个向量的元素时,才使用逗号拆分字符串,r,string,strsplit,R,String,Strsplit,我有一套调查回答,受访者可以选择零个或多个选项来回答“你喜欢什么类型的水果?”。还有一个地方可以写下答案。在结果电子表格中,每个人的回答在一个单元格中,不同类型的水果用逗号隔开,如下所示: (df <- data.frame(id = c("A", "B", "C", "D", "E"), data = c("oranges, apples, peaches, cherries, pineapples, strawberries",

我有一套调查回答,受访者可以选择零个或多个选项来回答“你喜欢什么类型的水果?”。还有一个地方可以写下答案。在结果电子表格中,每个人的回答在一个单元格中,不同类型的水果用逗号隔开,如下所示:

# Example survey responses: one row per respondent, with all of that
# respondent's answers stored in a single comma-separated string.
# The outer parentheses make the assignment print its value.
(df <- data.frame(
  id = c("A", "B", "C", "D", "E"),
  data = c(
    "oranges, apples, peaches, cherries, pineapples, strawberries",
    "oranges, peaches, pears",
    "pears, nectarines, cherries (bing, rainier)",
    "apples, peaches, nectarines",
    ""
  ),
  stringsAsFactors = FALSE
))

#   id                                                         data
# 1  A oranges, apples, peaches, cherries, pineapples, strawberries
# 2  B                                      oranges, peaches, pears
# 3  C                  pears, nectarines, cherries (bing, rainier)
# 4  D                                  apples, peaches, nectarines
# 5  E  
多项选择选项包括:

# Multiple-choice options offered by the survey question.
mc_answers <- c(
  "oranges", "plums", "apples",
  "peaches", "pears", "nectarines"
)

像下面这样处理怎么样:

# Build one long data frame from `df`: one row per multiple-choice answer
# and, per id, at most one extra row holding every remaining piece glued
# back together with ", ". Ids with no answers contribute no rows.
do.call(rbind, lapply(split(df, df$id), function(x) {
  # Split on commas and trim each piece, so irregular spacing after a comma
  # cannot prevent a piece from matching an entry of `mc_answers`.
  tokens <- trimws(unlist(strsplit(x$data, ",", fixed = TRUE)))
  tokens <- tokens[nzchar(tokens)]

  is_choice <- tokens %in% mc_answers
  answers <- tokens[is_choice]

  # Pieces that are not multiple-choice options are rejoined into a single
  # string; this also restores commas that were inside one answer, e.g.
  # "cherries (bing, rainier)".
  if (any(!is_choice)) {
    answers <- c(answers, paste(tokens[!is_choice], collapse = ", "))
  }

  if (length(answers) == 0) {
    return(NULL)
  }

  # stringsAsFactors = FALSE keeps `data` a character column on R < 4.0,
  # matching how `df` itself was built.
  data.frame(id = x$id[1], data = answers, stringsAsFactors = FALSE)
}))
在这一行之后:

# Keep only the elements of `datalist` whose length is non-zero.
datalist <- Filter(length, datalist)
您也可以尝试:

 library(data.table)
 library(devtools)
 source_gist(11380733) ## 

 # cSplit() is provided by the sourced gist; with "long" it presumably
 # returns one row per ", "-separated piece -- confirm against the gist.
 df1 <- cSplit(df, "data", sep = ", ", "long")

 # TRUE for rows whose piece is one of the multiple-choice options.
 indx <- df1[["data"]] %in% mc_answers

 # Keep the multiple-choice rows as they are, collapse each id's remaining
 # pieces back into one ", "-joined string, stack both parts, sort by id.
 res <- rbindlist(
   list(
     df1[indx, ],
     df1[!indx, ][, list(data = paste(data, collapse = ", ")), by = id]
   )
 )[order(id)]

  res
  #   id                               data
  #1:  A                            oranges
  #2:  A                             apples
  #3:  A                            peaches
  #4:  A cherries, pineapples, strawberries
  #5:  B                            oranges
  #6:  B                            peaches
  #7:  B                              pears
  #8:  C                              pears
  #9:  C                         nectarines
 #10:  C           cherries (bing, rainier)
 #11:  D                             apples
 #12:  D                            peaches
 #13:  D                         nectarines
library(data.table)
library(devtools)
source_gist(11380733)
df1
    id                               data
A.1  A                            oranges
A.2  A                             apples
A.3  A                            peaches
A.4  A cherries, pineapples, strawberries
B.1  B                            oranges
B.2  B                            peaches
B.3  B                              pears
C.1  C                              pears
C.2  C                         nectarines
C.3  C           cherries (bing, rainier)
D.1  D                             apples
D.2  D                            peaches
D.3  D                         nectarines
# Drop zero-length elements, then regroup each remaining element: entries
# found in `mc_answers` stay separate, and all other entries are merged into
# one ", "-joined string appended at the end.
datalist <- Filter(length, datalist)
datalist <- lapply(datalist, function(answers) {
  is_choice <- answers %in% mc_answers
  if (all(is_choice)) {
    # Nothing outside the multiple-choice options: no extra string to add.
    return(answers[is_choice])
  }
  c(answers[is_choice], paste(answers[!is_choice], collapse = ", "))
})
> (data_per_person <- do.call('rbind', data_long))
                                  data id
A.1                            oranges  A
A.2                             apples  A
A.3                            peaches  A
A.4 cherries, pineapples, strawberries  A
B.1                            oranges  B
B.2                            peaches  B
B.3                              pears  B
C.1                              pears  C
C.2                         nectarines  C
C.3           cherries (bing, rainier)  C
D.1                             apples  D
D.2                            peaches  D
D.3                         nectarines  D
 library(data.table)
 library(devtools)
 source_gist(11380733) ## 

 # cSplit() is provided by the sourced gist; with "long" it presumably
 # returns one row per ", "-separated piece -- confirm against the gist.
 df1 <- cSplit(df, "data", sep = ", ", "long")

 # TRUE for rows whose piece is one of the multiple-choice options.
 indx <- df1[["data"]] %in% mc_answers

 # Keep the multiple-choice rows as they are, collapse each id's remaining
 # pieces back into one ", "-joined string, stack both parts, sort by id.
 res <- rbindlist(
   list(
     df1[indx, ],
     df1[!indx, ][, list(data = paste(data, collapse = ", ")), by = id]
   )
 )[order(id)]

  res
  #   id                               data
  #1:  A                            oranges
  #2:  A                             apples
  #3:  A                            peaches
  #4:  A cherries, pineapples, strawberries
  #5:  B                            oranges
  #6:  B                            peaches
  #7:  B                              pears
  #8:  C                              pears
  #9:  C                         nectarines
 #10:  C           cherries (bing, rainier)
 #11:  D                             apples
 #12:  D                            peaches
 #13:  D                         nectarines