R:“行的标识符重复”(Duplicate identifiers for rows)错误——在完整数据框及其某些子集中出现,但在其他子集中不出现
我遇到了一个相当令人费解的 tidyr::spread() 错误。当我在完整的数据框上运行代码(见下面的示例)时,得到了“行的标识符重复”(Duplicate identifiers for rows)错误。为了排查问题,我从这个(非常大的)数据框中取了一个子集并重新运行代码,这一次代码可以正常运行(见下文 subset1df 的 dput 输出)。随后我换了另一个子集(subset2df)再试,又收到了同样的错误消息。老实说,我不知道该如何理解这一现象,非常感谢任何帮助。(标签:r, tidyr)可复现的代码如下。subset1df:
# dput() output of the first subset: a 16-row data.frame with factor columns
# v1..v16 plus the factor respondentID.
# The expression is not assigned to a name here; presumably it is the object
# the pipeline code refers to as `subset1df` -- confirm before running.
# Note that the respondentID levels contain both "Eo23" and "EO23", which
# differ only in letter case.
structure(list(v1 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, NA, NA), .Label = "2", class = "factor"),
v2 = structure(c(1L, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA), .Label = "2", class = "factor"),
v3 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v4 = structure(c(NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v5 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v6 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L), .Label = "2", class = "factor"),
v7 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v8 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v9 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v10 = structure(c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v12 = structure(c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v13 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v14 = structure(c(NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA,
NA, NA, 1L, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v15 = structure(c(NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v16 = structure(c(NA, NA, 2L, NA, NA, 2L, NA, NA, NA, NA,
NA, NA, NA, NA, 1L, NA), .Label = c("1", "2"), class = "factor"),
respondentID = structure(c(7L, 7L, 7L, 5L, 6L, 6L, 4L, 4L,
4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("EO15", "EO17",
"EO19", "EO21", "Eo23", "EO23", "EO24"), class = "factor")), .Names = c("v1",
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15", "v16", "respondentID"), row.names = c(NA,
-16L), class = "data.frame")
subset2df:
# dput() output of the second subset: a 39-row data.frame with factor columns
# v2, v4, v5, v6, v9, v11..v16 plus the factor respondentID.
# The expression is not assigned to a name here; presumably it is the object
# the pipeline code refers to as `subset2df` -- confirm before running.
# Note that the respondentID levels contain near-duplicates: trailing-space
# variants ("EO6 ", "EO7 ") and letter-case variants ("Eo14", "Eo28", "Eo30").
structure(list(v2 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, 1L, NA, 1L, NA, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v4 = structure(c(NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v5 = structure(c(NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA), .Label = c("1",
"2"), class = "factor"), v6 = structure(c(NA, 1L, NA, NA,
2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v9 = structure(c(NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v12 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA), .Label = c("1",
"2"), class = "factor"), v13 = structure(c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 2L, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v14 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v15 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v16 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 1L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
respondentID = structure(c(21L, 20L, 20L, 19L, 18L, 18L,
1L, 1L, 16L, 16L, 16L, 10L, 10L, 17L, 15L, 15L, 15L, 14L,
14L, 14L, 13L, 12L, 12L, 11L, 11L, 11L, 8L, 9L, 9L, 6L, 7L,
7L, 3L, 2L, 3L, 4L, 4L, 4L, 5L), .Label = c("EO11", "Eo14",
"EO14", "EO16", "EO18", "EO26", "EO27", "Eo28", "EO28", "EO3",
"Eo30", "EO32", "EO331", "EO35", "EO37", "EO4", "EO41", "EO6",
"EO6 ", "EO7", "EO7 "), class = "factor")), .Names = c("v2",
"v4", "v5", "v6", "v9", "v11", "v12", "v13", "v14", "v15", "v16",
"respondentID"), row.names = c(NA, -39L), class = "data.frame")
combodf_id(执行代码所需):
代码:
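result_df <- subset1df %>%
gather(key = combo, value = val, -respondentID) %>%
filter(!is.na(val)) %>%
left_join(combodf_id, by = "combo") %>%
arrange(respondentID) %>%
rename(rose_color1 = color1, rose_color2 = color2,
tulip_color1 = color3, tulip_color2 = color4) %>%
gather(color, value, rose_color1:tulip_color2) %>%
separate(color, into = c('flower', 'color')) %>%
spread(color, value) %>%
mutate(val = if_else(val == 1, 'rose', 'tulip')) %>%
mutate(val = if_else(val == flower, 1, 0)) %>%
select(respondentID, flower, color1, color2, choice = val)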
下面 @Tung 的解决方案与(原文所链接的)另一个解答非常相似,但两者都不能完全解决问题。在您的 subset2df 中存在以下问题:
(1)respondentID 列的某些因子水平(level)带有多余的空格,这可能会在后续步骤中把事情搞砸(例如 "EO7" 和 "EO7 ");
(2)数据中存在重复的行,例如 subset2df[c(15, 17), ];
(3)tidyr 的函数 gather 和 spread 会考虑因子的水平;当您取的是因子变量的一个子集、且并非所有水平都出现在数据中时,结果看起来会特别奇怪;
(4)紧接在 spread(color, value) 之前的数据框含有重复的行。您可以在它前面添加一行 distinct() 来强行通过,但请注意,之所以必须这样做,唯一的原因是前面存在的其他问题:
# Work-around for subset2df: distinct() is inserted right before spread() so
# that duplicated rows no longer raise "Duplicate identifiers for rows".
subset2df %>%
  as_tibble() %>%
  # Convert the factor responses through their labels. A bare as.integer() on
  # a factor returns the internal level codes, so a column whose only level is
  # "2" would be decoded as 1 and end up labelled 'rose' instead of 'tulip'.
  mutate_at(vars(v2:v16), function(x) as.integer(as.character(x))) %>%
  # na.rm = TRUE drops the NA responses here, which makes a separate
  # filter(!is.na(val)) step unnecessary. TRUE is spelled out because T is an
  # ordinary variable that can be reassigned.
  gather(key = combo, value = val, -respondentID, na.rm = TRUE) %>%
  left_join(combodf_id, by = "combo") %>%
  arrange(respondentID) %>%
  rename(
    rose_color1 = color1, rose_color2 = color2,
    tulip_color1 = color3, tulip_color2 = color4
  ) %>%
  gather(color, value, rose_color1:tulip_color2) %>%
  # Splits e.g. "rose_color1" into flower = "rose" and color = "color1".
  separate(color, into = c("flower", "color")) %>%
  # Forces unique key combinations for spread(); duplicates at this point stem
  # from duplicated rows in the input data.
  distinct() %>%
  spread(color, value) %>%
  # Decode the response (1 -> rose, otherwise tulip), then turn it into a 1/0
  # indicator of whether it matches the row's flower.
  mutate(val = if_else(val == 1, "rose", "tulip")) %>%
  mutate(val = if_else(val == flower, 1, 0)) %>%
  select(respondentID, flower, color1, color2, choice = val)
不过,我强烈建议您首先解决上述所有问题,就像下面这样(请注意,您不再需要在管道链的后面使用 distinct() 命令,因为您已经对原始数据应用了该命令)。
评论:这个问题可能与另外两个问题重复(解决办法是在管道中插入 group_by_at(vars(-val)) %>% mutate(row_id = 1:n()) %>%)。
# Pipeline from the question, applied to subset1df:
#   1. stack every response column into combo/val pairs and drop NA responses;
#   2. join combodf_id by combo and rename its color1..color4 columns;
#   3. stack those four columns, split their names into flower + color, and
#      widen again so each row carries color1 and color2;
#   4. recode val and expose it as `choice`.
result_df <- subset1df %>%
  gather(key = combo, value = val, -respondentID) %>%
  filter(!is.na(val)) %>%
  left_join(y = combodf_id, by = "combo") %>%
  arrange(respondentID) %>%
  rename(
    rose_color1 = color1,
    rose_color2 = color2,
    tulip_color1 = color3,
    tulip_color2 = color4
  ) %>%
  gather(key = color, value = value, rose_color1:tulip_color2) %>%
  separate(col = color, into = c("flower", "color")) %>%
  spread(key = color, value = value) %>%
  # mutate() evaluates its arguments in order, so the second expression sees
  # the already recoded val.
  mutate(
    val = if_else(val == 1, "rose", "tulip"),
    val = if_else(val == flower, 1, 0)
  ) %>%
  select(respondentID, flower, color1, color2, choice = val)
# Work-around for subset2df: distinct() is inserted right before spread() so
# that duplicated rows no longer raise "Duplicate identifiers for rows".
subset2df %>%
  as_tibble() %>%
  # Go through as.character() so the factor labels are converted; as.integer()
  # applied directly to a factor yields the level codes (a lone level "2"
  # would become 1).
  mutate_at(vars(v2:v16), function(x) as.integer(as.character(x))) %>%
  # NA responses are removed by na.rm = TRUE, so no filter(!is.na(val)) is
  # needed afterwards. TRUE is used instead of the reassignable shorthand T.
  gather(key = combo, value = val, -respondentID, na.rm = TRUE) %>%
  left_join(combodf_id, by = "combo") %>%
  arrange(respondentID) %>%
  rename(
    rose_color1 = color1, rose_color2 = color2,
    tulip_color1 = color3, tulip_color2 = color4
  ) %>%
  gather(color, value, rose_color1:tulip_color2) %>%
  # "rose_color1" -> flower = "rose", color = "color1".
  separate(color, into = c("flower", "color")) %>%
  # Removes exact duplicate rows so spread() sees unique identifiers.
  distinct() %>%
  spread(color, value) %>%
  # 1 -> rose, otherwise tulip; then 1/0 for whether that equals `flower`.
  mutate(val = if_else(val == 1, "rose", "tulip")) %>%
  mutate(val = if_else(val == flower, 1, 0)) %>%
  select(respondentID, flower, color1, color2, choice = val)
# Recommended pipeline for subset2df: clean the input first (numeric responses,
# trimmed IDs, no duplicate rows) so that spread() works without a later
# distinct() call.
subset2df %>%
  as_tibble() %>% # tibble has better printing methods
  # Convert responses to integer via their labels; as.integer() applied
  # directly to a factor returns level codes, so a lone level "2" would
  # silently become 1.
  mutate_at(vars(-respondentID), function(x) as.integer(as.character(x))) %>%
  # Convert the ID to character and strip surrounding whitespace, so that
  # e.g. "EO7 " and "EO7" become the same respondent.
  # NOTE(review): IDs differing only in letter case ("Eo14" vs "EO14") remain
  # distinct -- confirm whether they should be merged as well.
  mutate(respondentID = trimws(as.character(respondentID))) %>%
  distinct() %>% # remove duplicate rows
  # TRUE is spelled out because T is an ordinary, reassignable variable.
  gather(key = combo, value = val, -respondentID, na.rm = TRUE) %>%
  left_join(combodf_id, by = "combo") %>%
  mutate_at(vars(color1:color4), as.character) %>% # convert colors to char
  rename(
    rose_color1 = color1, rose_color2 = color2,
    tulip_color1 = color3, tulip_color2 = color4
  ) %>%
  gather(color, value, rose_color1:tulip_color2) %>%
  # "rose_color1" -> flower = "rose", color = "color1".
  separate(color, into = c("flower", "color")) %>%
  spread(color, value) %>%
  # 1 -> rose, otherwise tulip; then an integer 1/0 flag for whether that
  # equals the row's flower.
  mutate(val = if_else(val == 1, "rose", "tulip")) %>%
  mutate(val = if_else(val == flower, 1L, 0L)) %>%
  select(respondentID, flower, color1, color2, choice = val)