r将数据从多列折叠为一列
我知道在这个话题上有很多问题,所以如果这是一个重复的问题,我很抱歉。我正在尝试将数据集中的多列折叠为一列: 假设这是我正在处理的数据集的结构r将数据从多列折叠为一列,r,dplyr,aggregate,collapse,R,Dplyr,Aggregate,Collapse,我知道在这个话题上有很多问题,所以如果这是一个重复的问题,我很抱歉。我正在尝试将数据集中的多列折叠为一列: 假设这是我正在处理的数据集的结构 df <- data.frame( cbind( variable_1 = c('Var1', NA, NA,'Var1'), variable_2 = c('Var2', 'No', NA, NA), variable_3 = c(NA, NA, 'Var3', NA), variable
df <- data.frame(
cbind(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA)
))
variable_1 variable_2 variable_3 variable_4 variable_5 variable_6
Var1 Var2 NA NA NA NA
NA No NA Var4 No NA
NA NA Var3 NA Var5 Var6
Var1 NA NA NA NA NA
非常感谢您提供的任何帮助。我认为,如果有n行,那么目标是创建一个由逗号分隔的字符串组成的n向量,这些字符串在每行中包含字符
Var
。(如果您打算使用其他标准来区分所需值和不需要的值,则相应地更改grep
。)
df$variable_7 <- apply(df, 1, function(x) paste(x[!is.na(x) & x != "No"], collapse = ", "));
df;
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6
#1 Var1 Var2 <NA> <NA> <NA> <NA>
#2 <NA> No <NA> Var4 No <NA>
#3 <NA> <NA> Var3 <NA> Var5 Var6
#4 Var1 <NA> <NA> <NA> <NA> <NA>
# variable_7
#1 Var1, Var2
#2 Var4
#3 Var3, Var5, Var6
#4 Var1
使用
data.table
“重塑”方法而不是循环/应用
library(data.table)
setDT(df)
df[, id := .I][
melt(df, id.vars = "id")[grepl("Var", value), .(variable_7 = paste0(value, collapse = ",")), by = .(id)]
, on = "id"
, nomatch = 0
][order(id)]
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 id variable_7
# 1: Var1 Var2 NA NA NA NA 1 Var1,Var2
# 2: NA No NA Var4 No NA 2 Var4
# 3: NA NA Var3 NA Var5 Var6 3 Var3,Var5,Var6
# 4: Var1 NA NA NA NA NA 4 Var1
使用
dplyr
的解决方案df4
是最终输出。请参见我如何创建数据帧df
。不需要cbind
,最好添加stringsAsFactors=FALSE
,以防止创建因子列
library(dplyr)
library(tidyr)
df2 <- df %>% mutate(ID = 1:n())
df3 <- df2 %>%
gather(Variable, Value, -ID, na.rm = TRUE) %>%
filter(!Value %in% "No") %>%
group_by(ID) %>%
summarise(variable_7 = toString(Value))
df4 <- df2 %>%
left_join(df3, by = "ID") %>%
select(-ID)
df4
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 variable_7
# 1 Var1 Var2 <NA> <NA> <NA> <NA> Var1, Var2
# 2 <NA> No <NA> Var4 No <NA> Var4
# 3 <NA> <NA> Var3 <NA> Var5 Var6 Var3, Var5, Var6
# 4 Var1 <NA> <NA> <NA> <NA> <NA> Var1
库(dplyr)
图书馆(tidyr)
df2%变异(ID=1:n())
df3%
聚集(变量,值,-ID,na.rm=TRUE)%>%
筛选器(!值%in%“No”)%%>%
分组依据(ID)%>%
总结(变量_7=toString(值))
df4%
左联合(df3,by=“ID”)%%>%
选择(-ID)
df4
#变量1变量2变量3变量4变量5变量6变量7
#1 Var1 Var2 Var1,Var2
#2无变量4无变量4
#3 Var3 Var5 Var6 Var3 Var5 Var6 Var6
#4 Var1 Var1
数据
df <- data.frame(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA),
stringsAsFactors = FALSE
)
df这正是我所期望的,Thx一吨。很好,很高兴能帮助@Science11Nice解决方案,但在这个过程中,No
似乎被删除了。@www-I(可能不正确)认为这是requirementOP的预期输出的一部分,但仍然有“No”,但它是次要的,不会影响解决方案的有效性。@www-invariable_7
列?我在variable_5
和variable_2
@www中维护了它们-我认为有些混乱,但我在链中添加了order(id)
,以保持顺序:)
library(data.table)
setDT(df)
df[, id := .I][
melt(df, id.vars = "id")[grepl("Var", value), .(variable_7 = paste0(value, collapse = ",")), by = .(id)]
, on = "id"
, nomatch = 0
][order(id)]
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 id variable_7
# 1: Var1 Var2 NA NA NA NA 1 Var1,Var2
# 2: NA No NA Var4 No NA 2 Var4
# 3: NA NA Var3 NA Var5 Var6 3 Var3,Var5,Var6
# 4: Var1 NA NA NA NA NA 4 Var1
library(dplyr)
library(tidyr)
df2 <- df %>% mutate(ID = 1:n())
df3 <- df2 %>%
gather(Variable, Value, -ID, na.rm = TRUE) %>%
filter(!Value %in% "No") %>%
group_by(ID) %>%
summarise(variable_7 = toString(Value))
df4 <- df2 %>%
left_join(df3, by = "ID") %>%
select(-ID)
df4
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 variable_7
# 1 Var1 Var2 <NA> <NA> <NA> <NA> Var1, Var2
# 2 <NA> No <NA> Var4 No <NA> Var4
# 3 <NA> <NA> Var3 <NA> Var5 Var6 Var3, Var5, Var6
# 4 Var1 <NA> <NA> <NA> <NA> <NA> Var1
df <- data.frame(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA),
stringsAsFactors = FALSE
)