按“rleid”组折叠行,存在重复值时除外
按“rleid”组折叠行,存在重复值时除外(标签:r、dplyr)。我在 utterance 列中有语音数据,在 A_aoi、B_aoi 和 C_aoi 列中有凝视数据。一些 utterance 行是重复的: df <- data.frame( line = c(1,2,3,4,4,4,5,6,6,7,8), speaker = c("b", "a", NA, "c", "c", "c", NA, "c", "c", "a", …
我在 `utterance` 列中有语音数据,在 `A_aoi`、`B_aoi` 和 `C_aoi` 列中有凝视数据。一些 `utterance` 行是重复的:
# Example data. Per the accompanying text, `utterance` holds the speech and the
# `*_aoi` columns hold gaze data; rows that share a `line` value repeat the
# same utterance.
df <- data.frame(
  line = c(1, 2, 3, 4, 4, 4, 5, 6, 6, 7, 8),
  speaker = c("b", "a", NA, "c", "c", "c", NA, "c", "c", "a", "a"),
  utterance = c(
    "Hey sweetheart!", "Louise!", "(0.234)", "What?", "What?", "What?",
    "(0.778)", "um::", "um::", "Wake up,", "breakfast's ready"
  ),
  A_aoi = c("B", "B", "C", "B", NA, "C", "C", NA, "C", "C", "C"),
  B_aoi = c("C", "C", "C", "C", "A", "C", NA, NA, "C", "C", NA),
  C_aoi = c("A", NA, NA, "B", NA, "C", "C", "A", "A", "A", "A")
)
我想按 `rleid(speaker)` 分组,把同一说话人的连续行折叠成一行。但是,直接这样折叠也会把重复的 `utterance` 值拼接在一起。预期结果是:
# A tibble: 7 x 6
line speaker utterance A_aoi B_aoi C_aoi
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 b Hey sweetheart! B C A
2 2 a Louise! B C *
3 3 NA (0.234) C C *
4 4 c What? B*C CAC B*C
5 5 NA (0.778) C * C
6 6 c um:: *C *C AA
7 7 a Wake up, breakfast's ready CC C* AA
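# A tibble: 7 x 6
line speaker utterance A_aoi B_aoi C_aoi
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 b Hey sweetheart! B C A
2 2 a Louise! B C *
3 3 NA (0.234) C C *
4 4 c What? B*C CAC B*C
5 5 NA (0.778) C * C
6 6 c um:: *C *C AA
7 7 a Wake up, breakfast's ready CC C* AA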
感谢您的帮助
编辑:
我有一个逐步解决方案,但如果有人有一个更好、更简单的解决方案,我将非常感激:
# Collapse each run of consecutive rows with the same speaker into one row.
# The gaze columns keep every sample of the run, whereas rows that repeat a
# `line` value contribute their utterance only once.

# step 1 -- collapse only the `aoi` columns; a missing sample becomes "*":
df_a <- df %>%
  group_by(grp = rleid(speaker)) %>%
  summarise(
    across(
      c(A_aoi, B_aoi, C_aoi),
      ~ str_c(if_else(!is.na(.x), .x, "*"), collapse = "")
    ),
    .groups = "drop"
  ) %>%
  select(-grp)

# step 2 -- keep only the first row of every `line`. A logical index is used
# because `df[-which(cond), ]` returns zero rows when nothing matches `cond`:
df_b <- df[!duplicated(df$line), ]

# step 3 -- collapse `utterance` on the de-duplicated rows:
df_c <- df_b %>%
  group_by(grp = rleid(speaker)) %>%
  summarise(
    across(c(line, speaker), first),
    utterance = str_c(utterance, collapse = " "),
    .groups = "drop"
  ) %>%
  select(-grp)

# step 4 -- bind side by side. This pairs rows by position, so it assumes that
# dropping the duplicated lines in step 2 did not change the speaker runs.
bind_cols(df_c, df_a)
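# step 1 -- collapse only `aoi` columns:
df_a <- df %>%
group_by(grp = rleid(speaker)) %>%
summarise(across(c(line, speaker), first),
A_aoi = str_c(if_else(!is.na(A_aoi), A_aoi, "*" ), collapse = ""),
B_aoi = str_c(if_else(!is.na(B_aoi), B_aoi, "*" ), collapse = ""),
C_aoi = str_c(if_else(!is.na(C_aoi), C_aoi, "*" ), collapse = ""), .groups = 'drop') %>%
select(- c(grp, line, speaker))
# step 2 -- remove duplicates:
df_b <- df[-which(duplicated(df$line)),]
# step 3 -- collapse `utterance`:
df_c <- df_b %>%
group_by(grp = rleid(speaker)) %>%
summarise(across(c(line, speaker), first),
utterance = str_c(utterance, collapse = ' '), .groups = 'drop') %>%
select(- grp)
# step 4 -- bind:
bind_cols(df_c, df_a)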
试试使用 `unique(utterance)` 如何?这能帮你得到想要的结果吗?
# Single-pass variant: collapse each run of consecutive rows with the same
# speaker into one row. `unique()` keeps each distinct utterance of a run once
# (note: it would also merge identical utterances that come from different
# lines of the same run); missing gaze samples are written as "*".
df %>%
  group_by(grp = rleid(speaker)) %>%
  summarise(
    across(c(line, speaker), first),
    utterance = str_c(unique(utterance), collapse = " "),
    across(
      c(A_aoi, B_aoi, C_aoi),
      function(aoi) str_c(if_else(is.na(aoi), "*", aoi), collapse = "")
    ),
    .groups = "drop"
  ) %>%
  select(-grp)
输出
# A tibble: 7 x 6
line speaker utterance A_aoi B_aoi C_aoi
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 b Hey sweetheart! B C A
2 2 a Louise! B C *
3 3 NA (0.234) C C *
4 4 c What? B*C CAC B*C
5 5 NA (0.778) C * C
6 6 c um:: *C *C AA
7 7 a Wake up, breakfast's ready CC C* AA
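# A tibble: 7 x 6
line speaker utterance A_aoi B_aoi C_aoi
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 b Hey sweetheart! B C A
2 2 a Louise! B C *
3 3 NA (0.234) C C *
4 4 c What? B*C CAC B*C
5 5 NA (0.778) C * C
6 6 c um:: *C *C AA
7 7 a Wake up, breakfast's ready CC C* AA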
太棒了,真的!这么简单,却完全切中要害,而且同样适用于更大、更复杂的数据集!非常感谢!
# A tibble: 7 x 6
line speaker utterance A_aoi B_aoi C_aoi
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 b Hey sweetheart! B C A
2 2 a Louise! B C *
3 3 NA (0.234) C C *
4 4 c What? B*C CAC B*C
5 5 NA (0.778) C * C
6 6 c um:: *C *C AA
7 7 a Wake up, breakfast's ready CC C* AA