将数据帧中的不均匀列拆分为R中的多个列_R_Split_Dplyr_Tidyr

将数据帧中的不均匀列拆分为R中的多个列

将数据帧中的不均匀列拆分为R中的多个列,r,split,dplyr,tidyr,R,Split,Dplyr,Tidyr,我有一个数据框a如下所示，其中列Info有一些缺少的信息，如Sample2没有任何您为其他人看到的白色或黑色：可复制示例： A <- structure(list(Sample = structure(1:7, .Label = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6", "Sample7" ), class = "factor"), Description = structure(c(7L,

我有一个数据框

如下所示，其中列

Info

有一些缺少的信息，如

Sample2

没有任何您为其他人看到的白色或黑色：

可复制示例：

A <- structure(list(Sample = structure(1:7, .Label = c("Sample1", 
"Sample2", "Sample3", "Sample4", "Sample5", "Sample6", "Sample7"
), class = "factor"), Description = structure(c(7L, 3L, 4L, 2L, 
6L, 1L, 5L), .Label = c("37 years, female, white, alive, 257 days", 
"43 years, male, white, stage:iiic, alive, 598 days", "53 years, male, stage:iiib, alive, 792 days", 
"68 years, female, white, stage:iiic, dead, 740 days", "69 years, female, black or african american, stage:iia, alive, 627 days", 
"74 years, white, stage:i, alive, 1001 days", "82 years, female, white, stage:iiib, alive, 1419 days"
), class = "factor")), class = "data.frame", row.names = c(NA, 
-7L))

要将

Info

列分隔为多个列，我使用了

separate

函数，如下所示

library(dplyr)
library(tidyr)
A2 <- separate(A, 'Info', paste("Info", 1:6, sep="_"), sep=",", extra="drop")

我希望

输出

如下所示，其中缺少的信息需要为空格或NA，最后一列只显示数字，其中没有任何单词

天

：

Sample  Info_1     Info_2   Info_3    Info_4    Info_5   Info_6
Sample1 82 years   female    white   stage:iiib  alive   1419
Sample2 53 years    male             stage:iiib  alive   792    
Sample3 68 years   female    white   stage:iiic  dead    740
Sample4 43 years    male     white   stage:iiic  alive   598
Sample5 74 years             white    stage:i    alive   1001
Sample6 37 years   female    white               alive   257
Sample7 69 years   female    black   stage:iia   alive   627

感谢您的帮助。这里有一个想法。我相信有更好的方法，但这是一个开始

我们可以使用

extract

from

tidyr

来拆分从完整记录开始的列。关键是设置可识别且有意义的列名。在此之后，我们删除带有

NA

的行，并将它们从原始数据帧中删除。然后我们可以再次执行

提取

，假设缺少一些列。最终，我们可以满足所有缺失的条件，并将它们适当地分离。最后一步是合并所有子集数据帧

如果您有许多不同的缺少列组件，则此方法可能过于冗长。但是，如果您确定可能缺少什么。我们可以设计一个函数并将所有这些步骤包装到函数中

library(tidyverse)

# Complete rows
dat2 <- dat %>%
  extract(Info, into = c("Year", "Sex", "Race", "Stage", "Status", "Days"),
          regex = "([0-9]* years), (male|female), (black|white), (stage\\:i[A-Za-z]*), (dead|alive), ([0-9]*)") %>%
  drop_na(Year)

dat <- dat %>% anti_join(dat2, by = "Sample")

# Record with no race
dat3 <- dat %>%
  extract(Info, into = c("Year", "Sex", "Stage", "Status", "Days"),
          regex = "([0-9]* years), (male|female), (stage\\:i[A-Za-z]*), (dead|alive), ([0-9]*)") %>%
  drop_na(Year)

dat <- dat %>% anti_join(dat3, by = "Sample")

# Record with no sex
dat4 <- dat %>%
  extract(Info, into = c("Year", "Race", "Stage", "Status", "Days"),
          regex = "([0-9]* years), (black|white), (stage\\:i[A-Za-z]*), (dead|alive), ([0-9]*)") %>%
  drop_na(Year)

dat <- dat %>% anti_join(dat4, by = "Sample")

# Record with no stage
dat5 <- dat %>%
  extract(Info, into = c("Year", "Sex", "Race", "Status", "Days"),
          regex = "([0-9]* years), (male|female), (black|white), (dead|alive), ([0-9]*)") %>%
  drop_na(Year)

dat <- dat %>% anti_join(dat5, by = "Sample")

# Combine all subset data frame
dat_new <- bind_rows(dat2, dat3, dat4, dat5) %>%
  arrange(Sample)
dat_new
#    Sample     Year    Sex  Race      Stage Status Days
# 1 Sample1 82 years female white stage:iiib  alive 1419
# 2 Sample2 53 years   male  <NA> stage:iiib  alive  792
# 3 Sample3 68 years female white stage:iiic   dead  740
# 4 Sample4 43 years   male white stage:iiic  alive  598
# 5 Sample5 74 years   <NA> white    stage:i  alive 1001
# 6 Sample6 37 years female white       <NA>  alive  257
# 7 Sample7 69 years female black  stage:iia  alive  627

库（tidyverse）
#整行
dat2%
提取（信息，输入=c（“年”、“性别”、“种族”、“阶段”、“状态”、“天数”），
regex=“（[0-9]*年），（男|女），（黑|白），\\\\:i[A-Za-z]*），（死|生），（[0-9]*）”%>%
下降（年）
dat%反连接（dat2，by=“样本”）
#无种族记录
dat3%
提取（信息，输入=c（“年”、“性别”、“阶段”、“状态”、“天数”），
regex=“（[0-9]*年），（男|女），\\\\:i[A-Za-z]*），（死|生），（[0-9]*）”%>%
下降（年）
dat%反连接（dat3，by=“样本”）
#无性别记录
dat4%
提取（信息，输入=c（“年”、“比赛”、“阶段”、“状态”、“天数”），
regex=“（[0-9]*年），（黑|白），（阶段\\:i[A-Za-z]*），（死|活），（[0-9]*）”%>%
下降（年）
dat%反连接（dat4，by=“样本”）
#无舞台记录
dat5%
提取（信息，输入=c（“年”、“性别”、“种族”、“状态”、“天数”），
regex=“（[0-9]*年），（男|女），（黑|白），（死|活），（[0-9]*）”%>%
下降（年）
dat%反连接（dat5，by=“样本”）
#组合所有子集数据帧
dat_新%
安排（样本）
达图新酒店
#样本年份性别比赛阶段状态天数
#1个样本1 82岁女性白人阶段：iiib活着1419
#样本2 53岁男性阶段：iiib活着792
#3个样本3 68岁女性白人阶段：iiic死亡740
#4样本4 43岁男性白人阶段：iiic活着598
#5样本5 74岁白色阶段：我活着1001
#6样本6 37岁女性白人活着257
#7样本7 69岁女性黑人阶段：iia活着627

数据

dat <- read.table(text = "Sample  Info
Sample1 '82 years, female, white, stage:iiib, alive, 1419 days'
                  Sample2 '53 years, male, stage:iiib, alive, 792 days'
                  Sample3 '68 years, female, white, stage:iiic, dead, 740 days'
                  Sample4 '43 years, male, white, stage:iiic, alive, 598 days'
                  Sample5 '74 years, white, stage:i, alive, 1001 days'
                  Sample6 '37 years, female, white, alive, 257 days'
                  Sample7 '69 years, female, black, stage:iia, alive, 627 days'",
                  header = TRUE, stringsAsFactors = FALSE)

dat使用末尾注释中重复显示的数据，我们可以使用read.pattern
和指示的模式pat
，然后删除垃圾列（每隔一列）。如果不要求列名与问题中的列名完全相同，则可以省略标记为##的行
library(gsubfn)

pat <- 
"((\\d+ years), )?((female|male), )?((white|black), )?((stage:\\S+), )?((alive|dead), )?((\\d+) days)?"
r <- read.pattern(text = as.character(DF$Info), pattern = pat, as.is = TRUE)
DF2 <- cbind(Sample = DF$Sample, r[c(FALSE, TRUE)], stringsAsFactors = FALSE)

nc <- ncol(DF2) ## 
names(DF2)[-1] <- paste0("Info_", 1:(nc-1)) ##

DF2

注
可复制形式的输入DF
如下所示
Lines <- "
Sample;Info
Sample1;82 years, female, white, stage:iiib, alive, 1419 days
Sample2;53 years, male, stage:iiib, alive, 792 days
Sample3;68 years, female, white, stage:iiic, dead, 740 days
Sample4;43 years, male, white, stage:iiic, alive, 598 days
Sample5;74 years, white, stage:i, alive, 1001 days
Sample6;37 years, female, white, alive, 257 days
Sample7;69 years, female, black, stage:iia, alive, 627 days"

DF <- read.table(text = Lines, header = TRUE, sep = ";", as.is = TRUE, strip.white = TRUE)

行请共享一个。似乎需要一些逻辑来将值与您未编程的正确列匹配，例如，Info_3包含信息“白色”或“黑色”，Info_4包含字符串“stage:”，后跟几个字母，等等。单独的无法为您理解，它只按一些分隔符分隔字符串。最简单的解决方案是转到源代码，以不同的方式导出数据，以便包含空单元格，而不包含任何信息或“NA”。如果你有A和C，但没有B，你会看到“（…），A，C，（…）”@Sotos，我已经给出了一个例子。我还展示了我所做的尝试，以及我需要的输出应该是什么样子。@user3351523看看我给你的链接。您需要发布可复制的示例（我可以很容易地复制/粘贴到会话中）非常好。谢谢。如果您不介意的话，正如我在必选输出中提到的，我不希望最后一列中出现天数。@user3351523更新帖子以满足您的要求。谢谢。比我的方法更好，因为它是一种更灵活的方法。谢谢分享。
library(gsubfn)

pat <- 
"((\\d+ years), )?((female|male), )?((white|black), )?((stage:\\S+), )?((alive|dead), )?((\\d+) days)?"
r <- read.pattern(text = as.character(DF$Info), pattern = pat, as.is = TRUE)
DF2 <- cbind(Sample = DF$Sample, r[c(FALSE, TRUE)], stringsAsFactors = FALSE)

nc <- ncol(DF2) ## 
names(DF2)[-1] <- paste0("Info_", 1:(nc-1)) ##

DF2

   Sample   Info_1 Info_2 Info_3     Info_4 Info_5 Info_6
1 Sample1 82 years female  white stage:iiib  alive   1419
2 Sample2 53 years   male        stage:iiib  alive    792
3 Sample3 68 years female  white stage:iiic   dead    740
4 Sample4 43 years   male  white stage:iiic  alive    598
5 Sample5 74 years         white    stage:i  alive   1001
6 Sample6 37 years female  white             alive    257
7 Sample7 69 years female  black  stage:iia  alive    627

Lines <- "
Sample;Info
Sample1;82 years, female, white, stage:iiib, alive, 1419 days
Sample2;53 years, male, stage:iiib, alive, 792 days
Sample3;68 years, female, white, stage:iiic, dead, 740 days
Sample4;43 years, male, white, stage:iiic, alive, 598 days
Sample5;74 years, white, stage:i, alive, 1001 days
Sample6;37 years, female, white, alive, 257 days
Sample7;69 years, female, black, stage:iia, alive, 627 days"

DF <- read.table(text = Lines, header = TRUE, sep = ";", as.is = TRUE, strip.white = TRUE)