dplyr分组依据和样本,同时忽略NA
我想通过对同一组中的非NA值进行抽样,为每组的NA值进行gapfill 这是最接近我希望使用dplyr分组依据和样本,同时忽略NA,r,dplyr,na,sample,R,Dplyr,Na,Sample,我想通过对同一组中的非NA值进行抽样,为每组的NA值进行gapfill 这是最接近我希望使用实现的目标!is.na() >dput(数据) 结构(列表)(len=c(NA,45447.4157838775161037.71538108, 78147.8550470324, 7193.48815617057, 1571.95459212405, 18191.381972185, 20366.2132412031, 10014.987524596, 1403.72511829297, 5651.
实现的目标!is.na()
>dput(数据)
结构(列表)(len=c(NA,45447.4157838775161037.71538108,
78147.8550470324, 7193.48815617057, 1571.95459212405, 18191.381972185,
20366.2132412031, 10014.987524596, 1403.72511829297, 5651.17842991513,
6848.03271105711, 8043.32937011393, 8926.65133418451, 5808.44456603825,
2208.14264175252, 1797.4936747033, 5325.76651327694, 2660.66730207955,
5844.07912541444, 3956.40473896271, 959.873314407621, 3294.01472360025,
5221.94864001864, 3781.51913857335, 7811.83819953768, 3387.20323328623,
5514.92099458441, 5792.54371531706, 5643.98385143961, 15478.916809379,
8401.665332052177046.250748192472734.73639821402,北美62332.3343404513,
NA,46563.121471811325590.402010523813015.36822758624984.80432801441,
点=c(NA,0,8,5,2,0,9,0,0,0,0,3,1,0,6,1,
1,0,0,1,0,0,0,1,2,0,0,0,0,0,0,0,0,0,1,0,NA,
10,NA,19,6,5,0,NA),国家=结构(c(1L,2L,2L,2L,
2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,
2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,3L,2L,
3L、2L、2L、2L、2L、1L),.Label=c(“WCY_____;ES”、“WCY_____;FR”,
“WCY_________;IT”),class=“factor”),组=c(1L,2L,2L,2L,2L,3L,
3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,
3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,3L,
4L,4L,4L,4L,4L),row.names=c(NA,-42L),class=“data.frame”)
图书馆(dplyr)
数据1%
分组依据(分组)%>%
变异(nulen=if_else(country='WCY __; FR',len,sample(len[!is.na(len)],1,TRUE)),
nupoint=if_else(国家='WCY _____; FR',点,样本(点[!is.na(点)],1,TRUE)))
但是我在sample.int中得到了错误(长度(x)、大小、替换、prob):
第一个参数无效
已知分布和间隙填充分布之间不应有显著差异。如果同一组中没有要采样的值(其他值是NA
,或者“组”中只有一行),则应从整个数据集中采样。任何包装都可以 这里有一个想法
dd %>%
mutate(len1 = replace(len, is.na(len), sample(len[!is.na(len)], 1, TRUE)),
point1 = replace(point, is.na(point), sample(point[!is.na(point)], 1, TRUE))) %>%
group_by(group) %>%
mutate(nulen = ifelse(all(is.na(len)) & country == 'WCY_____FR', len1,
ifelse(is.na(len) & country == 'WCY_____FR', sample(len[!is.na(len)], 1, TRUE), len)))
这就给了,
len point国家/地区组len1 point1 nulen
1 NA NA WCY_u_________;ES 1 1572。0 NA
2 45447. 0 WCY________;FR 2 45447。0 45447.
3 161038. 8 WCY_u_______;FR 2 161038。8 161038.
4 78148. 5 WCY_u_______;FR 2 78148。5 78148.
5 7193. 2 WCY_u________;FR 3 7193。2 7193.
6 1572. 0 WCY________;FR 3 1572。0 1572.
7 18191. 9 WCY第3 18191号联邦公报。9 18191.
8 20366. 0 WCY_u_____;FR 3 20366。0 20366.
9 10015. 0 WCY\uuuuuuuuu\FR 3 10015。0 10015.
10 1404. 0 WCY_u_____;FR 3 1404。0 1404.
# ... 还有32行
对于
点
也可以这样做。请使用dput()
添加数据样本。谢谢,但我确实问过“如果没有来自同一组的值进行采样,那么应该从整个数据集中采集样本。”我已经对问题进行了编辑,使其更加明确。@user3725599请看一看,并让我知道不太清楚。仍然存在NA值。我添加了nulen2=ifelse(is.na(nulen),len1,nulen),但想不出一个更简单的修复方法。并且采样值是相同的。
dd %>%
mutate(len1 = replace(len, is.na(len), sample(len[!is.na(len)], 1, TRUE)),
point1 = replace(point, is.na(point), sample(point[!is.na(point)], 1, TRUE))) %>%
group_by(group) %>%
mutate(nulen = ifelse(all(is.na(len)) & country == 'WCY_____FR', len1,
ifelse(is.na(len) & country == 'WCY_____FR', sample(len[!is.na(len)], 1, TRUE), len)))
len point country group len1 point1 nulen
<dbl> <dbl> <fct> <int> <dbl> <dbl> <dbl>
1 NA NA WCY_____ES 1 1572. 0 NA
2 45447. 0 WCY_____FR 2 45447. 0 45447.
3 161038. 8 WCY_____FR 2 161038. 8 161038.
4 78148. 5 WCY_____FR 2 78148. 5 78148.
5 7193. 2 WCY_____FR 3 7193. 2 7193.
6 1572. 0 WCY_____FR 3 1572. 0 1572.
7 18191. 9 WCY_____FR 3 18191. 9 18191.
8 20366. 0 WCY_____FR 3 20366. 0 20366.
9 10015. 0 WCY_____FR 3 10015. 0 10015.
10 1404. 0 WCY_____FR 3 1404. 0 1404.
# ... with 32 more rows