R:根据某一列生成自助(bootstrap)样本
标签:r, dplyr, sampling, resampling
我有一个这样的数据集:
# Reproducible toy data: four IDs with three observations each.
# The seed is fixed before rnorm() so V1 is identical on every run.
set.seed(1)
df <- data.frame(
  ID = rep(seq_len(4), each = 3),
  # Time points stored as doubles; each ID covers its own window of x.
  x = as.numeric(c(1:3, 2:4, 1:3, 3:5)),
  V1 = rnorm(n = 12)
)
> df
ID x V1
1 1 1 -0.6264538
2 1 2 0.1836433
3 1 3 -0.8356286
4 2 2 1.5952808
5 2 3 0.3295078
6 2 4 -0.8204684
7 3 1 0.4874291
8 3 2 0.7383247
9 3 3 0.5757814
10 4 3 -0.3053884
11 4 4 1.5117812
12 4 5 0.3898432
目标是让数据集在每个时间点(即每个 `x` 值)都有 4 个观察值。我们可以首先找到各个 `x` 值出现的最大次数,然后按 `x` 分组,用 `sample_n` 并设置 `replace = TRUE` 对每个 `x` 进行有放回抽样,从而使每个 `x` 得到相同的行数:
library(dplyr)

# Target size per group: the count of the most frequent x value.
# table() tallies by value, so the actual range of x does not matter.
max_sample <- max(table(df$x))

# Resample every x group to exactly max_sample rows. replace = TRUE is
# required because groups with fewer than max_sample rows can only reach
# that size by drawing rows more than once.
# slice_sample() is the current replacement for the superseded sample_n()
# (dplyr >= 1.0.0). The rows drawn depend on the RNG state.
df %>%
  group_by(x) %>%
  slice_sample(n = max_sample, replace = TRUE) %>%
  ungroup() %>% # drop the grouping so later verbs are not applied per x
  arrange(x)
# ID x V1
# <int> <dbl> <dbl>
# 1 3 1 0.487
# 2 1 1 -0.626
# 3 1 1 -0.626
# 4 1 1 -0.626
# 5 3 2 0.738
# 6 2 2 1.60
# 7 2 2 1.60
# 8 3 2 0.738
# 9 4 3 -0.305
#10 2 3 0.330
#11 2 3 0.330
#12 4 3 -0.305
#13 4 4 1.51
#14 4 4 1.51
#15 4 4 1.51
#16 4 4 1.51
#17 4 5 0.390
#18 4 5 0.390
#19 4 5 0.390
#20 4 5 0.390
max_sample <- max(table(df$x))
library(dplyr)
df %>%
  group_by(x) %>%
  sample_n(max_sample, replace = TRUE) %>%
  arrange(x)
#ID x V1
#
# 1 3 1 0.487
# 2 1 1 -0.626
# 3 1 1 -0.626
# 4 1 1 -0.626
# 5 3 2 0.738
# 6 2 2 1.60
# 7 2 2 1.60
# 8 3 2 0.738
# 9 4 3 -0.305
#10 2 3 0.330
#11 2 3 0.330
#12 4 3 -0.305
#13 4 4 1.51
#14 4 4 1.51
#15 4 4 1.51
#16 4 4 1.51
#17 4 5 0.390
#18 4 5 0.390
#19 4 5 0.390
#20 4 5 0.390
评论:谢谢,我应该补充一点:`x` 并不总是从 1 开始,它的取值范围是 -20 到 +20。
回复:@spore234 嗯……我想这不重要,因为我们是用 `table()` 统计每个 `x` 值出现的频率,与 `x` 的具体取值无关。
library(dplyr)

# Largest number of rows observed for any single x value. table() counts
# occurrences per distinct value, so x may span any range (e.g. -20 to 20).
max_sample <- max(table(df$x))

# Draw max_sample rows with replacement inside each x group so that every
# x ends up with the same number of rows.
# slice_sample() is the current replacement for the superseded sample_n()
# (dplyr >= 1.0.0). The rows drawn depend on the RNG state.
df %>%
  group_by(x) %>%
  slice_sample(n = max_sample, replace = TRUE) %>%
  ungroup() %>% # return an ungrouped tibble to avoid per-group surprises
  arrange(x)
# ID x V1
# <int> <dbl> <dbl>
# 1 3 1 0.487
# 2 1 1 -0.626
# 3 1 1 -0.626
# 4 1 1 -0.626
# 5 3 2 0.738
# 6 2 2 1.60
# 7 2 2 1.60
# 8 3 2 0.738
# 9 4 3 -0.305
#10 2 3 0.330
#11 2 3 0.330
#12 4 3 -0.305
#13 4 4 1.51
#14 4 4 1.51
#15 4 4 1.51
#16 4 4 1.51
#17 4 5 0.390
#18 4 5 0.390
#19 4 5 0.390
#20 4 5 0.390