R 如何返回向量中满足特定条件的最后一个值
我有一个向量(在一个数据帧中)充满了递增的数字。我想找到所有连续的数字,并用序列中的第一个数字替换它们。这是否可以不使用循环 我的输入数据是:R 如何返回向量中满足特定条件的最后一个值,r,R,我有一个向量(在一个数据帧中)充满了递增的数字。我想找到所有连续的数字,并用序列中的第一个数字替换它们。这是否可以不使用循环 我的输入数据是: V1 1 4 5 7 10 15 16 17 20 我想要的输出是: V1 Out 1 1 4 4 5 4 7 7 10 10 15 15 16 15 17 15 20 20 到目前为止,我使用diff()计算了两行之间的差异,并通过向量循环来替换正确的值 V1 <- c(1,
V1
1
4
5
7
10
15
16
17
20
我想要的输出是:
V1 Out
1 1
4 4
5 4
7 7
10 10
15 15
16 15
17 15
20 20
到目前为止,我使用diff()计算了两行之间的差异,并通过向量循环来替换正确的值
V1 <- c(1, 4, 5, 7, 10, 15, 16, 17, 20)
df <- data.frame(V1)
df$diff <- c(0, diff(df$V1) == 1)
df$Out <- NA
for (j in 1:(nrow(df))){
if (df$diff[j] == 0){
df$Out[j] <- df$V1[j]
} else {
df$Out[j] <- df$V1[max(which(df$diff[1:j] == 0))]
}
}
V1使用基本R可以
with(d1, ave(V1, cumsum(c(1, diff(V1) != 1)), FUN = function(i) i[1]))
#[1] 1 4 4 7 10 15 15 15 20
dplyr
library(dplyr)
d1 %>%
group_by(grp = cumsum(c(1, diff(V1) != 1))) %>%
mutate(out = first(V1))
数据表
library(data.table)
setDT(d1)[, out := first(V1), by = cumsum(c(1, diff(V1) != 1))]
使用dplyr
和tidyr
:
library(tidyr)
library(dplyr)
> df %>% mutate(
+ diff=c(0,diff(V1))==1,
+ V2=ifelse(diff,NA,V1)
+ ) %>%
+ fill(V2) %>%
+ select(-diff)
V1 V2
1 1 1
2 4 4
3 5 4
4 7 7
5 10 10
6 15 15
7 16 15
8 17 15
9 20 20
另一个选项,分3步使用zoo
package:
将V2
定义为V1
:
df$V2 <- df$V1
最后,使用zoo::na.locf
将na
s替换为最后一个值:
library(zoo)
df$V2 <- na.locf(df$V2)
使用magrittr
:
库(magrittr)
df$V2%替换(c(假,差异(df$V1)=1),NA)%>%NA.locf
使用shift()
或lag()
而不是diff()
目前提出的所有解决方案都使用diff(V1)
来确定连续数字。另一方面,数据。表
和dplyr
包括shift()
和lag()
,分别是可以使用的函数(也由@Frank建议)
因此,与其
我们可以写作
setDT(d1)[, out := V1[1], by = cumsum(V1 - shift(V1, fill = V1[1]) != 1)]
dplyr
解决方案变为
library(dplyr)
d1 %>%
group_by(grp = cumsum(V1 - lag(V1, default = V1[1]) != 1)) %>%
mutate(out = first(V1))
library(data.table)
with(d1, ave(V1, cumsum(V1 - shift(V1, fill = V1[1]) != 1), FUN = function(i) i[1]))
同样,基本R解决方案变为
library(dplyr)
d1 %>%
group_by(grp = cumsum(V1 - lag(V1, default = V1[1]) != 1)) %>%
mutate(out = first(V1))
library(data.table)
with(d1, ave(V1, cumsum(V1 - shift(V1, fill = V1[1]) != 1), FUN = function(i) i[1]))
及
基准代码
目前正在测试12种不同的方法,每种方法都有双常数和整数常数,总共有24种变体
library(magrittr)
library(microbenchmark)
bm <- microbenchmark(
ave_diff = DF$Out <- with(DF, ave(V1, cumsum(c(1, diff(V1) != 1)), FUN = function(i) i[1])),
ave_shift = DF$Out <- with(DF, ave(V1, cumsum(V1 - shift(V1, fill = V1[1]) != 1), FUN = function(i) i[1])),
zoo_diff = {DF$Out <- DF$V1; DF$Out[c(FALSE, diff(DF$V1) == 1)] <- NA; DF$Out <- zoo::na.locf(DF$Out)},
zoo_pipe = DF$Out <- DF$V1 %>% replace(c(FALSE, diff(DF$V1) == 1), NA) %>% zoo::na.locf(),
zoo_shift = DF$Out <- DF$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1]) + 1, NA) %>% zoo::na.locf(),
dp_diff = r2 <- DF %>%
dplyr::group_by(grp = cumsum(c(1, diff(V1) != 1))) %>%
dplyr::mutate(Out = first(V1)),
dp_lag = r3 <- DF %>%
dplyr::group_by(grp = cumsum(V1 - dplyr::lag(V1, default = V1[1]) != 1)) %>%
dplyr::mutate(Out = first(V1)),
dt_diff = DT[, Out := V1[1], by = cumsum(c(1, diff(V1) != 1))],
dt_shift1 = DT[, Out := V1[1], by = cumsum(V1 - shift(V1, fill = V1[1]) != 1)],
dt_shift2 = DT[, Out := V1[1], by = cumsum(V1 != shift(V1, fill = V1[1]) + 1)],
dt_zoo_diff = DT[, Out := V1][c(FALSE, diff(DF$V1) == 1), Out := NA][, Out := zoo::na.locf(Out)],
dt_zoo_shift = DT[, Out := V1][V1 == shift(V1, fill = V1[1]) + 1, Out := NA][, Out := zoo::na.locf(Out)],
ave_diff_L = DF$Out <- with(DF, ave(V1, cumsum(c(1L, diff(V1) != 1L)), FUN = function(i) i[1L])),
ave_shift_L = DF$Out <- with(DF, ave(V1, cumsum(V1 - shift(V1, fill = V1[1L]) != 1L), FUN = function(i) i[1L])),
zoo_diff_L = {DF$Out <- DF$V1; DF$Out[c(FALSE, diff(DF$V1) == 1L)] <- NA_integer_; DF$Out <- zoo::na.locf(DF$Out)},
zoo_pipe_L = DF$Out <- DF$V1 %>% replace(c(FALSE, diff(DF$V1) == 1L), NA_integer_) %>% zoo::na.locf(),
zoo_shift_L = DF$Out <- DF$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1L]) + 1L, NA_integer_) %>% zoo::na.locf(),
dp_diff_L = r2 <- DF %>%
dplyr::group_by(grp = cumsum(c(1L, diff(V1) != 1L))) %>%
dplyr::mutate(Out = first(V1)),
dp_lag_L = r3 <- DF %>%
dplyr::group_by(grp = cumsum(V1 - dplyr::lag(V1, default = V1[1L]) != 1L)) %>%
dplyr::mutate(Out = first(V1)),
dt_diff_L = DT[, Out := V1[1L], by = cumsum(c(1L, diff(V1) != 1L))],
dt_shift1_L = DT[, Out := V1[1L], by = cumsum(V1 - shift(V1, fill = V1[1L]) != 1L)],
dt_shift2_L = DT[, Out := V1[1L], by = cumsum(V1 != shift(V1, fill = V1[1L]) + 1L)],
dt_zoo_diff_L = DT[, Out := V1][c(FALSE, diff(DF$V1) == 1L), Out := NA_integer_][, Out := zoo::na.locf(Out)],
dt_zoo_shift_L = DT[, Out := V1][V1 == shift(V1, fill = V1[1L]) + 1L, Out := NA_integer_][, Out := zoo::na.locf(Out)],
times = 20L
)
注意时间轴的对数刻度
观察
对于给定的问题大小和结构:
zoo::na.locf()
方法比使用分组的各种实现要快,而且na.locf()
与shift()
的组合有一点优势
- 第二个但接近的是
数据。带有分组的表
- 第三个但慢三倍的是
dplyr
- 最后一个是
ave()
,它比最快的要慢20多倍,每次运行最多需要3秒
shift()
/lag()
版本总是比diff()快
- 类型转换很重要。使用
diff()
的版本尤其受到影响,例如,带有整数常量的ave_diff比双contants版本快约2.5倍李>
ave
是一个循环函数。在引擎盖下,ave使用分体式和折叠式。。非常感谢。我想他需要一种矢量化的方式。。我很抱歉,动物园岩石-D
library(data.table)
with(d1, ave(V1, cumsum(V1 - shift(V1, fill = V1[1]) != 1), FUN = function(i) i[1]))
library(zoo)
library(magrittr)
library(data.table)
df$V2 <- df$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1]) + 1, NA) %>% na.locf()
library(data.table)
n <- 1e6L
f <- 2L
set.seed(1234L)
DF <- data.frame(V1 = sort(sample.int(f*n, n)),
Out = 1:n)
DT <- data.table(DF)
DT
library(magrittr)
library(microbenchmark)
bm <- microbenchmark(
ave_diff = DF$Out <- with(DF, ave(V1, cumsum(c(1, diff(V1) != 1)), FUN = function(i) i[1])),
ave_shift = DF$Out <- with(DF, ave(V1, cumsum(V1 - shift(V1, fill = V1[1]) != 1), FUN = function(i) i[1])),
zoo_diff = {DF$Out <- DF$V1; DF$Out[c(FALSE, diff(DF$V1) == 1)] <- NA; DF$Out <- zoo::na.locf(DF$Out)},
zoo_pipe = DF$Out <- DF$V1 %>% replace(c(FALSE, diff(DF$V1) == 1), NA) %>% zoo::na.locf(),
zoo_shift = DF$Out <- DF$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1]) + 1, NA) %>% zoo::na.locf(),
dp_diff = r2 <- DF %>%
dplyr::group_by(grp = cumsum(c(1, diff(V1) != 1))) %>%
dplyr::mutate(Out = first(V1)),
dp_lag = r3 <- DF %>%
dplyr::group_by(grp = cumsum(V1 - dplyr::lag(V1, default = V1[1]) != 1)) %>%
dplyr::mutate(Out = first(V1)),
dt_diff = DT[, Out := V1[1], by = cumsum(c(1, diff(V1) != 1))],
dt_shift1 = DT[, Out := V1[1], by = cumsum(V1 - shift(V1, fill = V1[1]) != 1)],
dt_shift2 = DT[, Out := V1[1], by = cumsum(V1 != shift(V1, fill = V1[1]) + 1)],
dt_zoo_diff = DT[, Out := V1][c(FALSE, diff(DF$V1) == 1), Out := NA][, Out := zoo::na.locf(Out)],
dt_zoo_shift = DT[, Out := V1][V1 == shift(V1, fill = V1[1]) + 1, Out := NA][, Out := zoo::na.locf(Out)],
ave_diff_L = DF$Out <- with(DF, ave(V1, cumsum(c(1L, diff(V1) != 1L)), FUN = function(i) i[1L])),
ave_shift_L = DF$Out <- with(DF, ave(V1, cumsum(V1 - shift(V1, fill = V1[1L]) != 1L), FUN = function(i) i[1L])),
zoo_diff_L = {DF$Out <- DF$V1; DF$Out[c(FALSE, diff(DF$V1) == 1L)] <- NA_integer_; DF$Out <- zoo::na.locf(DF$Out)},
zoo_pipe_L = DF$Out <- DF$V1 %>% replace(c(FALSE, diff(DF$V1) == 1L), NA_integer_) %>% zoo::na.locf(),
zoo_shift_L = DF$Out <- DF$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1L]) + 1L, NA_integer_) %>% zoo::na.locf(),
dp_diff_L = r2 <- DF %>%
dplyr::group_by(grp = cumsum(c(1L, diff(V1) != 1L))) %>%
dplyr::mutate(Out = first(V1)),
dp_lag_L = r3 <- DF %>%
dplyr::group_by(grp = cumsum(V1 - dplyr::lag(V1, default = V1[1L]) != 1L)) %>%
dplyr::mutate(Out = first(V1)),
dt_diff_L = DT[, Out := V1[1L], by = cumsum(c(1L, diff(V1) != 1L))],
dt_shift1_L = DT[, Out := V1[1L], by = cumsum(V1 - shift(V1, fill = V1[1L]) != 1L)],
dt_shift2_L = DT[, Out := V1[1L], by = cumsum(V1 != shift(V1, fill = V1[1L]) + 1L)],
dt_zoo_diff_L = DT[, Out := V1][c(FALSE, diff(DF$V1) == 1L), Out := NA_integer_][, Out := zoo::na.locf(Out)],
dt_zoo_shift_L = DT[, Out := V1][V1 == shift(V1, fill = V1[1L]) + 1L, Out := NA_integer_][, Out := zoo::na.locf(Out)],
times = 20L
)
library(ggplot2)
autoplot(bm)
Unit: milliseconds
expr min lq mean median uq max neval cld
ave_diff 2594.89941 2643.32224 2752.9753 2723.7035 2868.6586 3006.0420 20 e
ave_shift 947.13267 1001.70742 1107.7351 1047.6835 1218.5809 1395.5059 20 c
zoo_diff 100.13967 130.23284 197.7273 142.8525 262.1980 428.2976 20 a
zoo_pipe 104.98025 112.04101 181.3073 119.5275 185.3215 434.2936 20 a
zoo_shift 88.86549 98.49058 177.2143 110.5392 260.1160 416.9985 20 a
dp_diff 1148.18227 1219.68396 1303.6350 1290.5575 1344.1400 1628.1786 20 d
dp_lag 712.58827 746.77952 804.8908 776.3303 809.8323 1157.2102 20 b
dt_diff 226.67524 233.81038 292.0675 241.9369 275.8491 517.1760 20 a
dt_shift1 199.64651 207.39276 255.1607 215.7960 223.7947 882.9923 20 a
dt_shift2 203.87617 210.06736 260.8550 218.9917 244.7247 499.8797 20 a
dt_zoo_diff 109.45194 121.41501 216.3579 159.0960 278.5257 483.1110 20 a
dt_zoo_shift 94.59905 109.32432 204.0329 127.0619 373.8622 430.0885 20 a
ave_diff_L 992.12820 1041.12873 1127.8128 1071.8525 1217.1493 1457.3166 20 c
ave_shift_L 905.41152 973.81932 1063.2237 1015.6805 1170.2522 1323.9317 20 c
zoo_diff_L 103.30228 114.63442 227.4359 140.5280 300.3003 822.3366 20 a
zoo_pipe_L 103.89433 112.16467 231.3165 133.3362 398.7240 545.7856 20 a
zoo_shift_L 91.88764 104.21339 157.6434 138.7488 165.0197 401.3890 20 a
dp_diff_L 749.65952 766.00479 851.0737 806.1116 886.6429 1155.3144 20 b
dp_lag_L 731.08180 757.95232 823.0169 794.4421 827.7100 1079.2576 20 b
dt_diff_L 214.97477 226.80928 241.3575 232.7037 244.8673 323.6259 20 a
dt_shift1_L 199.80509 211.20539 277.5616 218.3371 259.9801 513.2925 20 a
dt_shift2_L 200.37902 204.23732 224.7275 210.7217 216.6133 470.6335 20 a
dt_zoo_diff_L 111.64757 122.62327 162.4947 140.4175 174.0932 409.0788 20 a
dt_zoo_shift_L 95.91114 109.24219 164.7059 126.5924 170.2320 388.6558 20 a