R "错误:行的重复标识符":出现在数据帧及其某些子集中,但不出现在其他子集中

R "错误:行的重复标识符":出现在数据帧及其某些子集中,但不出现在其他子集中,r,tidyr,R,Tidyr,我遇到了一个相当令人费解的tidyr::spread()错误。当我试图在完整的数据帧中运行代码(下面的示例)时,我得到了“行的重复标识符”错误 我对(非常大的)数据帧取子集,以便排查并重新运行代码。这一次它成功了(见下文 subset1df 的 dput 输出)。然后我用另一个子集(subset2df)再次尝试,再次收到错误消息。老实说,我不知道如何理解这一点,并将非常感谢任何帮助 可复制代码如下: subset1df: structure(list(v1 = structure(c(NA, NA, NA, NA,

我遇到了一个相当令人费解的tidyr::spread()错误。当我试图在完整的数据帧中运行代码(下面的示例)时,我得到了“行的重复标识符”错误

我对(非常大的)数据帧取子集,以便排查并重新运行代码。这一次它成功了(见下文 subset1df 的 dput 输出)。然后我用另一个子集(subset2df)再次尝试,再次收到错误消息。老实说,我不知道如何理解这一点,并将非常感谢任何帮助

可复制代码如下:

subset1df:

# dput() output of a 16-row data.frame (row.names = c(NA, -16L)); the
# surrounding text refers to it as subset1df.
# Columns: v1..v16 plus respondentID, every one stored as a factor.
# v3, v7, v8 and v10 have no levels at all and are entirely NA.
# Several response columns have a single level (e.g. "2"), so their integer
# code is 1 while the label is "2".
# respondentID levels include "Eo23" and "EO23", which differ only in case.
structure(list(v1 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 1L, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v2 = structure(c(1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, 1L, NA, NA), .Label = "2", class = "factor"), 
    v3 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v4 = structure(c(NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v5 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v6 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, 1L), .Label = "2", class = "factor"), 
    v7 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v8 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v9 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    1L, NA, NA, NA, NA, NA), .Label = "1", class = "factor"), 
    v10 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v12 = structure(c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v13 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 
    NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"), 
    v14 = structure(c(NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, 
    NA, NA, 1L, NA, NA, NA), .Label = c("1", "2"), class = "factor"), 
    v15 = structure(c(NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v16 = structure(c(NA, NA, 2L, NA, NA, 2L, NA, NA, NA, NA, 
    NA, NA, NA, NA, 1L, NA), .Label = c("1", "2"), class = "factor"), 
    respondentID = structure(c(7L, 7L, 7L, 5L, 6L, 6L, 4L, 4L, 
    4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("EO15", "EO17", 
    "EO19", "EO21", "Eo23", "EO23", "EO24"), class = "factor")), .Names = c("v1", 
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", 
"v12", "v13", "v14", "v15", "v16", "respondentID"), row.names = c(NA, 
-16L), class = "data.frame")
subset2df:

# dput() output of a 39-row data.frame (row.names = c(NA, -39L)); the
# surrounding text refers to it as subset2df.
# Columns: v2, v4, v5, v6, v9, v11..v16 plus respondentID, all factors.
# Several response columns have the single level "2" (integer code 1, label
# "2"), so as.integer() on them yields 1 rather than 2.
# respondentID has levels that differ only by a trailing space
# ("EO6" / "EO6 ", "EO7" / "EO7 ") or only by case ("Eo14" / "EO14",
# "Eo28" / "EO28").
structure(list(v2 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 1L, NA, NA, 1L, NA, 1L, NA, 1L, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v4 = structure(c(NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v5 = structure(c(NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA), .Label = c("1", 
    "2"), class = "factor"), v6 = structure(c(NA, 1L, NA, NA, 
    2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), .Label = c("1", "2"), class = "factor"), 
    v9 = structure(c(NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 
    NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"), 
    v12 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA), .Label = c("1", 
    "2"), class = "factor"), v13 = structure(c(NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 
    NA, NA, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, 2L, NA, NA, NA), .Label = c("1", "2"), class = "factor"), 
    v14 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v15 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v16 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 1L, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    respondentID = structure(c(21L, 20L, 20L, 19L, 18L, 18L, 
    1L, 1L, 16L, 16L, 16L, 10L, 10L, 17L, 15L, 15L, 15L, 14L, 
    14L, 14L, 13L, 12L, 12L, 11L, 11L, 11L, 8L, 9L, 9L, 6L, 7L, 
    7L, 3L, 2L, 3L, 4L, 4L, 4L, 5L), .Label = c("EO11", "Eo14", 
    "EO14", "EO16", "EO18", "EO26", "EO27", "Eo28", "EO28", "EO3", 
    "Eo30", "EO32", "EO331", "EO35", "EO37", "EO4", "EO41", "EO6", 
    "EO6 ", "EO7", "EO7 "), class = "factor")), .Names = c("v2", 
"v4", "v5", "v6", "v9", "v11", "v12", "v13", "v14", "v15", "v16", 
"respondentID"), row.names = c(NA, -39L), class = "data.frame")
combodf_id(执行代码所需):

代码:

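result_df <- subset1df %>%
  gather(key = combo, value = val, -respondentID) %>%
  filter(!is.na(val)) %>%
  left_join(combodf_id, by = "combo") %>%
  arrange(respondentID) %>%
  rename(rose_color1 = color1, rose_color2 = color2,
         tulip_color1 = color3, tulip_color2 = color4) %>%
  gather(color, value, rose_color1:tulip_color2) %>%
  separate(color, into = c('flower', 'color')) %>%
  spread(color, value) %>%
  mutate(val = if_else(val == 1, 'rose', 'tulip')) %>%
  mutate(val = if_else(val == flower, 1, 0)) %>%
  select(respondentID, flower, color1, color2, choice = val)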

下面 @Tung 的解决方案与另一个相关问题中的方案非常相似(原文此处的链接已丢失),但两者都不能完全解决问题

在您的
subset2df 中

  • 您已经删除了一些响应列(例如v1、v3、v7等)。。。这可能就是为什么有一堆行/响应者没有任何响应(全部NA)

  • 您的
    respondentID
    列的某些因子水平(level)带有多余的空格,这可能会在以后把事情搞砸(例如 "EO7" 和 "EO7 ")

  • 有重复的行,例如
    subset2df[c(15,17),]

  • 您所有的列都是因子(factor)。。。特别是对于带有整数值的响应列,我发现这很奇怪
    tidyr
    函数
    gather
    和
    spread
    会考虑因子的水平(levels),当您有因子变量的子集且并非所有水平都在数据中出现时,这看起来特别奇怪

  • 您可能应该首先解决这些问题,因为它们可能会导致您以后的问题,但是。。。出现错误“行的重复标识符”的原因是,要传递到
    spread(color,value)
    的数据帧具有重复的行。您可以通过在其前面添加一行 distinct() 来强制使其运行,但请注意,之所以必须这样做,只是因为之前的其他问题

    # Workaround pipeline: reshape subset2df into one row per respondent and
    # flower, with color1/color2 columns and a 0/1 `choice` flag.
    # Requires `subset2df` and a lookup table `combodf_id` (joined on "combo",
    # supplying color1..color4) to exist in the calling environment.
    subset2df %>%
      as_tibble() %>%
      # as.integer() on a factor returns the internal level codes, not the
      # labels (a factor whose only level is "2" would become 1), so go
      # through character to recover the recorded response values.
      mutate_at(vars(v2:v16), function(x) as.integer(as.character(x))) %>%
      # na.rm = TRUE already drops the NA responses; no extra filter() needed.
      gather(key = combo, value = val, -respondentID, na.rm = TRUE) %>%
      left_join(combodf_id, by = "combo") %>%
      arrange(respondentID) %>%
      rename(rose_color1 = color1, rose_color2 = color2,
             tulip_color1 = color3, tulip_color2 = color4) %>%
      gather(color, value, rose_color1:tulip_color2) %>%
      # "rose_color1" -> flower = "rose", color = "color1"
      separate(color, into = c("flower", "color")) %>%
      # Drop exact duplicate rows; without this spread() stops with
      # "Duplicate identifiers for rows".
      distinct() %>%
      spread(color, value) %>%
      # A response of 1 means rose, anything else means tulip.
      mutate(val = if_else(val == 1, "rose", "tulip")) %>%
      # choice is 1 when the chosen flower matches this row's flower.
      mutate(val = if_else(val == flower, 1, 0)) %>%
      select(respondentID, flower, color1, color2, choice = val)
    
    不过,我强烈建议您首先解决上述所有问题,就像这样(请注意,您不需要在链的更深处使用
    distinct
    命令,因为您已经将该命令应用于原始数据)

    这两个选项都可能重复(插入
    group_by_at(vars(-val)) %>% mutate(row_id = 1:n()) %>%。
    
    # Build result_df from subset1df: one row per respondent and flower, with
    # color1/color2 columns and a 0/1 `choice` flag. Intermediates live inside
    # local() so only result_df is created in the calling environment.
    result_df <- local({
      # Long format: one row per non-missing (respondent, combo) response.
      responses <- gather(subset1df, key = combo, value = val, -respondentID)
      responses <- filter(responses, !is.na(val))

      # Attach the colour lookup and tag each colour column with its flower.
      coloured <- left_join(responses, combodf_id, by = "combo")
      coloured <- arrange(coloured, respondentID)
      coloured <- rename(
        coloured,
        rose_color1 = color1, rose_color2 = color2,
        tulip_color1 = color3, tulip_color2 = color4
      )

      # Split "<flower>_<colorN>" into flower / color, then widen again so
      # color1 and color2 become columns.
      per_flower <- gather(coloured, color, value, rose_color1:tulip_color2)
      per_flower <- separate(per_flower, color, into = c("flower", "color"))
      per_flower <- spread(per_flower, color, value)

      # A response of 1 means rose, anything else tulip; the flag is 1 when
      # that pick matches the row's flower.
      per_flower <- mutate(per_flower, val = if_else(val == 1, "rose", "tulip"))
      per_flower <- mutate(per_flower, val = if_else(val == flower, 1, 0))

      select(per_flower, respondentID, flower, color1, color2, choice = val)
    })
    
    # Workaround pipeline: reshape subset2df into one row per respondent and
    # flower, with color1/color2 columns and a 0/1 `choice` flag.
    # Requires `subset2df` and a lookup table `combodf_id` (joined on "combo",
    # supplying color1..color4) to exist in the calling environment.
    subset2df %>%
      as_tibble() %>%
      # as.integer() on a factor returns the internal level codes, not the
      # labels (a factor whose only level is "2" would become 1), so go
      # through character to recover the recorded response values.
      mutate_at(vars(v2:v16), function(x) as.integer(as.character(x))) %>%
      # na.rm = TRUE already drops the NA responses; no extra filter() needed.
      gather(key = combo, value = val, -respondentID, na.rm = TRUE) %>%
      left_join(combodf_id, by = "combo") %>%
      arrange(respondentID) %>%
      rename(rose_color1 = color1, rose_color2 = color2,
             tulip_color1 = color3, tulip_color2 = color4) %>%
      gather(color, value, rose_color1:tulip_color2) %>%
      # "rose_color1" -> flower = "rose", color = "color1"
      separate(color, into = c("flower", "color")) %>%
      # Drop exact duplicate rows; without this spread() stops with
      # "Duplicate identifiers for rows".
      distinct() %>%
      spread(color, value) %>%
      # A response of 1 means rose, anything else means tulip.
      mutate(val = if_else(val == 1, "rose", "tulip")) %>%
      # choice is 1 when the chosen flower matches this row's flower.
      mutate(val = if_else(val == flower, 1, 0)) %>%
      select(respondentID, flower, color1, color2, choice = val)
    
    # Cleaned-up pipeline: fix the input data first (types, whitespace,
    # duplicate rows), then reshape to one row per respondent and flower with
    # color1/color2 columns and an integer 0/1 `choice` flag.
    # Requires `subset2df` and a lookup table `combodf_id` (joined on "combo",
    # supplying color1..color4) to exist in the calling environment.
    subset2df %>%
      as_tibble() %>%  # tibble has better printing methods
      # Convert responses to integer via character: as.integer() on a factor
      # returns the level codes, not the labels (a lone level "2" would be 1).
      mutate_at(
        vars(-respondentID),
        function(x) as.integer(as.character(x))
      ) %>%
      mutate(respondentID = as.character(respondentID)) %>%  # convert to char
      mutate(respondentID = trimws(respondentID)) %>%  # remove whitespace
      # NOTE(review): IDs differing only in case (e.g. "Eo14" vs "EO14") are
      # left as separate respondents here -- confirm whether they should merge.
      distinct() %>%  # remove duplicate rows
      gather(key = combo, value = val, -respondentID, na.rm = TRUE) %>%
      left_join(combodf_id, by = "combo") %>%
      mutate_at(vars(color1:color4), as.character) %>%  # convert colors to char
      rename(rose_color1 = color1, rose_color2 = color2,
             tulip_color1 = color3, tulip_color2 = color4) %>%
      gather(color, value, rose_color1:tulip_color2) %>%
      # "rose_color1" -> flower = "rose", color = "color1"
      separate(color, into = c("flower", "color")) %>%
      spread(color, value) %>%
      # A response of 1 means rose, anything else means tulip.
      mutate(val = if_else(val == 1, "rose", "tulip")) %>%
      # choice is 1L when the chosen flower matches this row's flower.
      mutate(val = if_else(val == flower, 1L, 0L)) %>%
      select(respondentID, flower, color1, color2, choice = val)