R 如何使用“sapply()”从类别中的组中插补值?
我想用特定的R 如何使用“sapply()”从类别中的组中插补值?,r,sapply,imputation,R,Sapply,Imputation,我想用特定的ctry方法对cat1中所有ctry的val中的缺失进行估算 数据示例 set.seed(654) df1 <- data.frame( year=rep(2000:2005, each=5), ctry=rep(LETTERS[1:5], 6), val=rnorm(30) ) df1$cat <- ifelse(df1$ctry %in% c("A", "B"), 1, 0) df1[sample(nrow(df1), 12), "val"] <-
ctry
方法对cat1
中所有ctry
的val
中的缺失进行估算
数据示例
set.seed(654)
df1 <- data.frame(
year=rep(2000:2005, each=5),
ctry=rep(LETTERS[1:5], 6),
val=rnorm(30)
)
df1$cat <- ifelse(df1$ctry %in% c("A", "B"), 1, 0)
df1[sample(nrow(df1), 12), "val"] <- NA
> head(df1)
year ctry val cat
1 2000 A -0.76031762 1
2 2000 B -0.38970450 1
3 2000 C 1.68962523 0
4 2000 D NA 0
5 2000 E 0.09530146 0
6 2001 A NA 1
现在我成功地按国家手动插补:
df2 <- df1
df2$val[df2$ctry %in% names(cat1)[1] & is.na(df2$val)] <- cat1[1]
> head(df2)
year ctry val cat
1 2000 A -0.76031762 1
2 2000 B -0.38970450 1
3 2000 C 1.68962523 0
4 2000 D NA 0
5 2000 E 0.09530146 0
6 2001 A -0.49758245 1
预期输出将是一个完整的数据框架,其中包含类别
cat1
中国家的特定插补平均值。您可以尝试使用group\u by
的tidyverse
方法,以获得每ctry
的平均值。然后使用ifelse
更新NA
。添加了一个新列val2
,以说明正在发生的情况。您可以编写“val”
来覆盖该列
library(tidyverse)
df1 %>%
group_by(ctry) %>%
mutate(Mean=mean(val, na.rm = T)) %>%
mutate(val2=ifelse(is.na(val) & cat == 1, Mean, val)) %>%
ungroup()
# A tibble: 30 x 6
year ctry val cat Mean val2
<int> <fct> <dbl> <dbl> <dbl> <dbl>
1 2000 A -0.760 1 -0.498 -0.760
2 2000 B -0.390 1 -0.614 -0.390
3 2000 C 1.69 0 0.397 1.69
4 2000 D NA 0 -0.0321 NA
5 2000 E 0.0953 0 -0.513 0.0953
6 2001 A NA 1 -0.498 -0.498
7 2001 B NA 1 -0.614 -0.614
8 2001 C NA 0 0.397 NA
9 2001 D NA 0 -0.0321 NA
10 2001 E NA 0 -0.513 NA
# ... with 20 more rows
库(tidyverse)
df1%>%
组别(中心)%>%
突变(平均值=平均值(val,na.rm=T))%>%
突变(val2=ifelse(is.na(val)&cat==1,Mean,val))%>%
解组()
#一个tibble:30x6
年份ctry val cat平均值2
1200A-0.7601-0.498-0.760
2000B-0.3901-0.614-0.390
3 2000 C 1.69 0.397 1.69
4 2000 D钠0-0.0321钠
52000 E 0.0953 0-0.513 0.0953
6 2001 A NA 1-0.498-0.498
7 2001 B NA 1-0.614-0.614
8 2001 C NA 0.397 NA
9 2001 D NA 0-0.0321 NA
10 2001 eNa 0-0.513 NA
# ... 还有20行
您可以尝试使用groupby
的tidyverse
方法来获得每ctry
的平均值。然后使用ifelse
更新NA
。添加了一个新列val2
,以说明正在发生的情况。您可以编写“val”
来覆盖该列
library(tidyverse)
df1 %>%
group_by(ctry) %>%
mutate(Mean=mean(val, na.rm = T)) %>%
mutate(val2=ifelse(is.na(val) & cat == 1, Mean, val)) %>%
ungroup()
# A tibble: 30 x 6
year ctry val cat Mean val2
<int> <fct> <dbl> <dbl> <dbl> <dbl>
1 2000 A -0.760 1 -0.498 -0.760
2 2000 B -0.390 1 -0.614 -0.390
3 2000 C 1.69 0 0.397 1.69
4 2000 D NA 0 -0.0321 NA
5 2000 E 0.0953 0 -0.513 0.0953
6 2001 A NA 1 -0.498 -0.498
7 2001 B NA 1 -0.614 -0.614
8 2001 C NA 0 0.397 NA
9 2001 D NA 0 -0.0321 NA
10 2001 E NA 0 -0.513 NA
# ... with 20 more rows
库(tidyverse)
df1%>%
组别(中心)%>%
突变(平均值=平均值(val,na.rm=T))%>%
突变(val2=ifelse(is.na(val)&cat==1,Mean,val))%>%
解组()
#一个tibble:30x6
年份ctry val cat平均值2
1200A-0.7601-0.498-0.760
2000B-0.3901-0.614-0.390
3 2000 C 1.69 0.397 1.69
4 2000 D钠0-0.0321钠
52000 E 0.0953 0-0.513 0.0953
6 2001 A NA 1-0.498-0.498
7 2001 B NA 1-0.614-0.614
8 2001 C NA 0.397 NA
9 2001 D NA 0-0.0321 NA
10 2001 eNa 0-0.513 NA
# ... 还有20行
如果我的理解正确,您希望自动完成最后一个过程
sapply(seq_along(cat1),
function(x) df2$val[df2$ctry %in% names(cat1)[x] & is.na(df2$val)] <<- cat1[x])
> df2
year ctry val cat
1 2000 A -0.760317618 1
2 2000 B -0.389704501 1
3 2000 C 1.689625228 0
4 2000 D NA 0
5 2000 E 0.095301460 0
6 2001 A -0.497582454 1
7 2001 B -0.613936417 1
8 2001 C NA 0
9 2001 D NA 0
10 2001 E NA 0
11 2002 A -0.107260116 1
12 2002 B -0.838168333 1
13 2002 C -0.982605890 0
14 2002 D -0.820370986 0
15 2002 E -0.871432562 0
16 2003 A -0.497582454 1
17 2003 B -0.613936417 1
18 2003 C -0.006557849 0
19 2003 D 0.661696295 0
20 2003 E -0.762828067 0
21 2004 A -0.286692466 1
22 2004 B -0.613936417 1
23 2004 C 0.512579937 0
24 2004 D 0.722127317 0
25 2004 E NA 0
26 2005 A -0.836059616 1
27 2005 B -0.613936417 1
28 2005 C 0.774016151 0
29 2005 D -0.691866605 0
30 2005 E NA 0
sapply(沿第1类),
函数(x)df2$val[df2$ctry%在%names(cat1)[x]&is.na(df2$val)]如果我的理解正确,您希望自动完成最后一个过程
sapply(seq_along(cat1),
function(x) df2$val[df2$ctry %in% names(cat1)[x] & is.na(df2$val)] <<- cat1[x])
> df2
year ctry val cat
1 2000 A -0.760317618 1
2 2000 B -0.389704501 1
3 2000 C 1.689625228 0
4 2000 D NA 0
5 2000 E 0.095301460 0
6 2001 A -0.497582454 1
7 2001 B -0.613936417 1
8 2001 C NA 0
9 2001 D NA 0
10 2001 E NA 0
11 2002 A -0.107260116 1
12 2002 B -0.838168333 1
13 2002 C -0.982605890 0
14 2002 D -0.820370986 0
15 2002 E -0.871432562 0
16 2003 A -0.497582454 1
17 2003 B -0.613936417 1
18 2003 C -0.006557849 0
19 2003 D 0.661696295 0
20 2003 E -0.762828067 0
21 2004 A -0.286692466 1
22 2004 B -0.613936417 1
23 2004 C 0.512579937 0
24 2004 D 0.722127317 0
25 2004 E NA 0
26 2005 A -0.836059616 1
27 2005 B -0.613936417 1
28 2005 C 0.774016151 0
29 2005 D -0.691866605 0
30 2005 E NA 0
sapply(沿第1类),
基R中的函数(x)df2$val[df2$ctry%在%names(cat1)[x]和is.na(df2$val)]:
set.seed(654)
df1 <- data.frame(
year=rep(2000:2005, each=5),
ctry=rep(LETTERS[1:5], 6),
val=rnorm(30)
)
df1$cat <- ifelse(df1$ctry %in% c("A", "B"), 1, 0)
df1[sample(nrow(df1), 12), "val"] <- NA
# want:
my_means <- tapply(df1$val, df1$ctry, mean, na.rm = TRUE)
df1$val <- ifelse(is.na(df1$val), my_means[df1$ctry], df1$val)
set.seed(654)
基R中的df1
set.seed(654)
df1 <- data.frame(
year=rep(2000:2005, each=5),
ctry=rep(LETTERS[1:5], 6),
val=rnorm(30)
)
df1$cat <- ifelse(df1$ctry %in% c("A", "B"), 1, 0)
df1[sample(nrow(df1), 12), "val"] <- NA
# want:
my_means <- tapply(df1$val, df1$ctry, mean, na.rm = TRUE)
df1$val <- ifelse(is.na(df1$val), my_means[df1$ctry], df1$val)
set.seed(654)
df1谢谢,确实是不错的tidyverse
方法!只需在ifelse()
中添加&cat==1
,只有cat==1
的国家才会被插补。我依赖于sapply()
即基本R方法。@jaySf是的,这是正确的方法。刚刚编辑了我的答案。谢谢,确实是不错的tidyverse
方法!只需在ifelse()
中添加&cat==1
,只有cat==1
的国家才会被插补。我依赖于sapply()
即基本R方法。@jaySf是的,这是正确的方法。刚刚编辑了我的答案。