在dplyr中从以空格分隔的字符串中提取第n个位置的元素
我有一个看起来像这样的数据框:从dplyr中的空格分隔字符串中提取第n个位置,r,string,dplyr,R,String,Dplyr,我有一个看起来像这样的数据框: data <- data.frame(label = c('S', 'SH', 'S', 'S', 'SH'), word = c('sip', 'shoe', 'plaster', 'reception', 'reception'), word.segs = c('S IH1 P', 'SH UW1', 'P L AE1 S T AH0', 'R AH0 S EH1 P SH AH0 N', 'R
# Example data: one row per target sound. word.segs holds the space-separated
# transcription of the word and seg.index the position of the target segment.
data <- data.frame(
  label     = c("S", "SH", "S", "S", "SH"),
  word      = c("sip", "shoe", "plaster", "reception", "reception"),
  word.segs = c(
    "S IH1 P",
    "SH UW1",
    "P L AE1 S T AH0",
    "R AH0 S EH1 P SH AH0 N",
    "R AH0 S EH1 P SH AH0 N"
  ),
  seg.index = c(1, 1, 4, 3, 6)
)
(注意,我也尝试过只调用一次unlist(),把结果保存到一个单独的对象中,再从中提取两个感兴趣的值,但这似乎并没有明显变快)
我还在dplyr中尝试了一种替代方案,希望它可能更有效:
# Attempt to add fol.seg, the segment that follows position seg.index.
# NOTE: this is the version that fails with the error reported below.
# The expression is evaluated on the whole columns, not row by row: strsplit()
# returns one vector per row, unlist() then merges them into a single vector,
# and nth() is handed the entire seg.index column although it expects a
# single position (hence the length(n) == 1 complaint).
data <- data %>%
mutate(fol.seg = word.segs %>%
strsplit(split = " ") %>%
unlist() %>%
nth(seg.index+1))
data <- data %>%
mutate(fol.seg = word.segs %>%
strsplit(split = " ") %>%
unlist() %>%
nth(seg.index+1))
但我收到以下错误消息,我不知道它为什么不工作:
Error in mutate_impl(.data, dots) :
Evaluation error: length(n) == 1 is not TRUE.
任何帮助都将不胜感激。 下面的方法可行,而且只需使用base R。你也可以用
purrr
把它写得更花哨一些
library(dplyr)
#' Safely extract the i-th element of a vector or list
#'
#' @param x A vector or list.
#' @param i A single position (1-based).
#' @return `x[[i]]` when `i` is a valid position in `x`; otherwise `NA`
#'   (missing, non-positive or past-the-end index) instead of an error.
try_pull <- function(x, i) {
  # `||` short-circuits, so the comparisons never see a missing index.
  if (is.na(i) || i < 1 || i > length(x)) {
    return(NA)
  }
  x[[i]]
}
# Split every transcription into its segments once (seg_list), then look up
# the segments one and two positions after seg.index. try_pull() returns NA
# when the requested position lies past the last segment. Map() returns a
# list, so seg1 and seg2 are list columns.
res <- data %>%
  mutate(
    # as.character() guards against word.segs being a factor, which
    # strsplit() rejects as a non-character argument.
    seg_list = strsplit(as.character(word.segs), split = " "),
    seg1 = Map(f = try_pull, seg_list, seg.index + 1),
    seg2 = Map(f = try_pull, seg_list, seg.index + 2)
  )
res
# label word word.segs seg.index seg_list seg1 seg2
# 1 S sip S IH1 P 1 S, IH1, P IH1 P
# 2 SH shoe SH UW1 1 SH, UW1 UW1 NA
# 3 S plaster P L AE1 S T AH0 4 P, L, AE1, S, T, AH0 T AH0
# 4 S reception R AH0 S EH1 P SH AH0 N 3 R, AH0, S, EH1, P, SH, AH0, N EH1 P
# 5 SH reception R AH0 S EH1 P SH AH0 N 6 R, AH0, S, EH1, P, SH, AH0, N AH0 N
以下data.table方法应快速灵活地选择seg.index之后的所有seg或前两个seg
library(data.table)
# Example data; stringsAsFactors = FALSE keeps word.segs as character so it
# can be passed to strsplit() below.
data <- data.frame(
  label = c("S", "SH", "S", "S", "SH"),
  word = c("sip", "shoe", "plaster", "reception", "reception"),
  word.segs = c(
    "S IH1 P",
    "SH UW1",
    "P L AE1 S T AH0",
    "R AH0 S EH1 P SH AH0 N",
    "R AH0 S EH1 P SH AH0 N"
  ),
  seg.index = c(1, 1, 4, 3, 6),
  stringsAsFactors = FALSE
)
# Row id, used both as the grouping variable and as the join key.
data$id <- seq_len(nrow(data))
dt <- as.data.table(data, stringsAsFactors = FALSE)
setkeyv(dt, "id")

# Long format: one row per (id, segment); n is the segment's position
# within its word.
segdt <- dt[, list(seg.index = seg.index,
                   seg = unlist(strsplit(word.segs, "\\s+"))),
            by = "id"]
segdt[, n := seq_len(.N), by = "id"]

# Keep only the segments after seg.index and number them per word:
# seg.col is the future column name (seg1, seg2, ...), seg.num its rank.
segdt <- segdt[n > seg.index]
segdt[, `:=`(seg.col = paste0("seg", seq_len(.N)), seg.num = seq_len(.N)),
      by = "id"]
#dt[segdt[,list(index.word.segs=paste(seg,collapse=",")),by="id"]] #rejoin original table and all segs after seg.index

# Wide format: only the first two segments after seg.index (NA when absent).
widesegs <- dcast(
  segdt[seg.num <= 2, .(id, seg, seg.col)],
  id ~ seg.col,
  value.var = "seg"
)
# Join the segment columns back onto the original rows via id. Words with no
# segment after seg.index have no row in widesegs and are therefore absent
# from this result.
dt[widesegs]
备选结果:
要使所有分段都大于分段索引,请执行以下操作:
# Wide format with every segment after seg.index: one column per following
# position, NA where a word has fewer following segments.
widesegs <- dcast(
  segdt[, .(id, seg, seg.col)],
  id ~ seg.col,
  value.var = "seg"
)
dt[widesegs]
> dt[widesegs]
label word word.segs seg.index id seg1 seg2 seg3 seg4 seg5
1: S sip S IH1 P 1 1 IH1 P NA NA NA
2: SH shoe SH UW1 1 2 UW1 NA NA NA NA
3: S plaster P L AE1 S T AH0 4 3 T AH0 NA NA NA
4: S reception R AH0 S EH1 P SH AH0 N 3 4 EH1 P SH AH0 N
5: SH reception R AH0 S EH1 P SH AH0 N 6 5 AH0 N NA NA NA
> dt[widesegs]
label word word.segs seg.index id seg1 seg2 seg3 seg4 seg5
1: S sip S IH1 P 1 1 IH1 P NA NA NA
2: SH shoe SH UW1 1 2 UW1 NA NA NA NA
3: S plaster P L AE1 S T AH0 4 3 T AH0 NA NA NA
4: S reception R AH0 S EH1 P SH AH0 N 3 4 EH1 P SH AH0 N
5: SH reception R AH0 S EH1 P SH AH0 N 6 5 AH0 N NA NA NA
另一种可能性是:
# Row by row, split the transcription and index the resulting segments
# directly. Under rowwise() each strsplit() call sees a single string, so
# [[1]] is that row's segment vector; a position past its end gives NA.
data %>%
  rowwise() %>%
  mutate(
    seg1 = strsplit(as.character(word.segs), " ")[[1]][seg.index + 1],
    seg2 = strsplit(as.character(word.segs), " ")[[1]][seg.index + 2]
  ) %>%
  # Drop the row-wise grouping so later verbs operate on whole columns again.
  ungroup()
label word word.segs seg.index seg1 seg2
<fct> <fct> <fct> <dbl> <chr> <chr>
1 S sip S IH1 P 1 IH1 P
2 SH shoe SH UW1 1 UW1 <NA>
3 S plaster P L AE1 S T AH0 4 T AH0
4 S reception R AH0 S EH1 P SH AH0 N 3 EH1 P
5 SH reception R AH0 S EH1 P SH AH0 N 6 AH0 N
工作完美-谢谢!(虽然“pattern”参数抛出了一个错误,我想它应该是“split”?)啊,是的,我最初使用的是带有pattern参数的
stringr::str_split
,后来换成strsplit
时,忘了把参数名改成split
。啊,我很快就找到了第二个解决方案——谢谢你纠正它!如果有人想知道的话:对于10万行的完整数据集,sapply()解决方案需要11.93秒,unlist()解决方案需要39.13秒。啊,能得到所有后续段的解决方案真是太好了——我没有想到使用data.table。谢谢
# Wide format with every segment after seg.index: one column per following
# position, NA where a word has fewer following segments.
widesegs <- dcast(
  segdt[, .(id, seg, seg.col)],
  id ~ seg.col,
  value.var = "seg"
)
dt[widesegs]
> dt[widesegs]
label word word.segs seg.index id seg1 seg2 seg3 seg4 seg5
1: S sip S IH1 P 1 1 IH1 P NA NA NA
2: SH shoe SH UW1 1 2 UW1 NA NA NA NA
3: S plaster P L AE1 S T AH0 4 3 T AH0 NA NA NA
4: S reception R AH0 S EH1 P SH AH0 N 3 4 EH1 P SH AH0 N
5: SH reception R AH0 S EH1 P SH AH0 N 6 5 AH0 N NA NA NA
# Row by row, split the transcription and index the resulting segments
# directly. Under rowwise() each strsplit() call sees a single string, so
# [[1]] is that row's segment vector; a position past its end gives NA.
data %>%
  rowwise() %>%
  mutate(
    seg1 = strsplit(as.character(word.segs), " ")[[1]][seg.index + 1],
    seg2 = strsplit(as.character(word.segs), " ")[[1]][seg.index + 2]
  ) %>%
  # Drop the row-wise grouping so later verbs operate on whole columns again.
  ungroup()
label word word.segs seg.index seg1 seg2
<fct> <fct> <fct> <dbl> <chr> <chr>
1 S sip S IH1 P 1 IH1 P
2 SH shoe SH UW1 1 UW1 <NA>
3 S plaster P L AE1 S T AH0 4 T AH0
4 S reception R AH0 S EH1 P SH AH0 N 3 EH1 P
5 SH reception R AH0 S EH1 P SH AH0 N 6 AH0 N
# Row-wise variant of the nth() approach: with rowwise() each call sees one
# transcription and one seg.index, so nth() receives a single position.
data %>%
  rowwise() %>%
  mutate(
    seg1 = nth(unlist(strsplit(as.character(word.segs), " ")), seg.index + 1),
    seg2 = nth(unlist(strsplit(as.character(word.segs), " ")), seg.index + 2)
  )