R 通过在mutate()中只改变一个自变量来拟合多个回归模型
我怀疑这个问题可能是重复的,但我没有发现任何令人满意的地方。设想一个结构如下的简单数据集:R 通过在mutate()中只改变一个自变量来拟合多个回归模型,r,dplyr,regression,tidyverse,R,Dplyr,Regression,Tidyverse,我怀疑这个问题可能是重复的,但我没有发现任何令人满意的地方。设想一个结构如下的简单数据集: set.seed(123) df <- data.frame(cov_a = rbinom(100, 1, prob = 0.5), cov_b = rbinom(100, 1, prob = 0.5), cont_a = runif(100), cont_b = runif(100),
set.seed(123)
df <- data.frame(cov_a = rbinom(100, 1, prob = 0.5),
cov_b = rbinom(100, 1, prob = 0.5),
cont_a = runif(100),
cont_b = runif(100),
dep = runif(100))
cov_a cov_b cont_a cont_b dep
1 0 1 0.238726027 0.784575267 0.9860542973
2 1 0 0.962358936 0.009429905 0.1370674714
3 0 0 0.601365726 0.779065883 0.9053095817
4 1 1 0.515029727 0.729390652 0.5763018376
5 1 0 0.402573342 0.630131853 0.3954488591
6 0 1 0.880246541 0.480910830 0.4498024841
7 1 1 0.364091865 0.156636851 0.7065019011
8 1 1 0.288239281 0.008215520 0.0825027458
9 1 0 0.170645235 0.452458394 0.3393125802
10 0 0 0.172171746 0.492293329 0.6807875512
但是,我希望避免使用double-
map()
,并使用更直接或更优雅的方法来解决它。我不确定这是否会被视为更直接/优雅,但我的解决方案不使用doublemap
:
library(tidyverse)
library(broom)
gen_model_expr <- function(var) {
form = paste("dep ~ cont_a + cont_b +", var)
tidy(lm(as.formula(form), data = df))
}
grep("cov_", names(df), value = TRUE) %>%
map(gen_model_expr)
我提出两种方法: 第一个很无聊。我们可以使用{dplyr}的
rowwise
符号代替purrr::map
。这种方法有两种风格。在rowwise
之后,我们可以使用(A)mutate%>%unest
或者(B),我们可以使用group\u map
。在这两种方法中,我都避免了重复数据,但如果需要,我们可以轻松地将数据包含在每一行中(在设置tibble
时,我们可以执行tibble(myvar=,data=list(df))
)。虽然(A)给出了一个包含所有数据的TIBLE,但(B)中的组映射
返回了一个类似于原始示例中的“双映射”方法的列表
第二种方法,我认为它相当“新鲜”(虽然不太直接),因为它既不使用<代码>行> < /代码>也不>代码> map < /代码>。在这里,我们使用{dplyr}的
cross
函数和cur\u column()
,为每个输出创建一个新的列,然后pivot\u long
和unnest
将所有结果放在一个tible
中
最后一个基准测试显示:“doulbe_map”速度最慢(因为数据列重复),其次是“cross”和“row_unnest”,而“row_group_map”速度相当快。然而,最快的方法是@latlio使用map
和一个自定义函数(下面称为“map\u custom\u fun”),但尽管它使用purr
,它可能不太“dplyr ish”
库(tidyverse)
图书馆(扫帚)
种子集(123)
df%变异%>%unnest
df%>%
选择(以(“cov”)开始)%>%
colnames%>%
TIBLE(myvar=)%%>%
行()
突变(res=list(tidy(lm(dep~cont_a+cont_b+eval(sym(.data$myvar)),data=df)))%>%
unnest(res)
#>#A tibble:8 x 6
#>myvar术语估计标准误差统计p值
#>
#>1 cov_a(截距)0.472 0.0812 5.81 0.0000000799
#>2 cov_a cont_a-0.103 0.0983-1.05 0.296
#>3 cov_a cont_b 0.172 0.0990 1.74 0.0848
#>4 cov_a eval(sym(.data$myvar))-0.0455 0.0581-0.783 0.436
#>5 cov_b(截距)0.415 0.0787 5.27 0.000000846
#>6 cov_b cont_a-0.0874 0.0984-0.888 0.377
#>7 cov_b cont_b 0.181 0.0980 1.84 0.0682
#>8 cov_b eval(sym(.data$myvar))0.0482 0.0576 0.837 0.405
#变更B:
#行%>%组映射
df%>%
选择(以(“cov”)开始)%>%
colnames%>%
TIBLE(myvar=)%%>%
行()
组映射(.keep=TRUE,
~tidy(lm(dep~cont_a+cont_b+eval(sym(myvar)),data=df)))
#> [[1]]
#>#A tibble:4 x 5
#>术语估计标准误差统计p值
#>
#>1(截距)0.472 0.0812 5.81 0.0000000799
#>2续-0.103 0.0983-1.05 0.296
#>3续0.172 0.0990 1.74 0.0848
#>4评估(sym(.x$myvar))-0.0455 0.0581-0.783 0.436
#>
#> [[2]]
#>#A tibble:4 x 5
#>术语估计标准误差统计p值
#>
#>1(截距)0.415 0.0787 5.27 0.000000846
#>2续-0.0874 0.0984-0.888 0.377
#>3续0.181 0.0980 1.84 0.0682
#>4评估(sym(.x$myvar))0.0482 0.0576 0.837 0.405
#第二种方法:使用摘要(跨)
#我们这里需要一个“tibble”,否则打印会搞砸
df_tbl%
总结(以“cov”开头),
~list(整洁的(lm)(
重新编制(c(“cont_a”、“cont_b”、“cur_column())、“dep”),
数据=df_tbl(()(()))%>%
pivot_更长(cols=everything(),
name_to=“var”,
值_to=“res”)%>%
unnest(res)
#>#A tibble:8 x 6
#>var术语估计标准误差统计p值
#>
#>1 cov_a(截距)0.472 0.0812 5.81 0.0000000799
#>2 cov_a cont_a-0.103 0.0983-1.05 0.296
#>3 cov_a cont_b 0.172 0.0990 1.74 0.0848
#>4 cov_a cov_a-0.0455 0.0581-0.783 0.436
#>5 cov_b(截距)0.415 0.0787 5.27 0.000000846
#>6 cov_b cont_a-0.0874 0.0984-0.888 0.377
#>7 cov_b cont_b 0.181 0.0980 1.84 0.0682
#>8 cov_b cov_b 0.0482 0.0576 0.837 0.405
由(v0.3.0)于2021-01-10创建
基准
#>单位:毫秒
#>expr最小lq平均uq最大neval
#>double_地图20.92847 22.238626 23.645280 22.726705 25.002493 34.29351 100
#>世界其他地区15.30179 15.835506 16.714873 16.358134 17.314802 20.60496 100
#>row_groupmap 10.1016810.49037411.23739810.70952411.45267720.40186100
#>穿过16.47369 17.117178 18.593908 17.945136 19.431190 29.29384 100
#>map_custom_fun 6.85758 7.311608 7.953066 7.611394 8.305757 19.57006 100
这里是另一个版本:
library(broom)
purrr::map_df(grep("cov_", names(df), value = TRUE),
~tidy(lm(reformulate(c('cont_a', 'cont_b', .x), 'dep'), data = df)))
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 0.472 0.0812 5.81 0.0000000799
#2 cont_a -0.103 0.0983 -1.05 0.296
#3 cont_b 0.172 0.0990 1.74 0.0848
#4 cov_a -0.0455 0.0581 -0.783 0.436
#5 (Intercept) 0.415 0.0787 5.27 0.000000846
#6 cont_a -0.0874 0.0984 -0.888 0.377
#7 cont_b 0.181 0.0980 1.84 0.0682
#8 cov_b 0.0482 0.0576 0.837 0.405
库(扫帚)
purrr::map_df(grep(“cov_”),name(df),value=TRUE),
~tidy(lm(重新格式化(c('cont_a','cont_b',.x),'dep'),data=df)))
#术语估计标准误差
[[1]]
# A tibble: 4 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.472 0.0812 5.81 0.0000000799
2 cont_a -0.103 0.0983 -1.05 0.296
3 cont_b 0.172 0.0990 1.74 0.0848
4 cov_a -0.0455 0.0581 -0.783 0.436
[[2]]
# A tibble: 4 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.415 0.0787 5.27 0.000000846
2 cont_a -0.0874 0.0984 -0.888 0.377
3 cont_b 0.181 0.0980 1.84 0.0682
4 cov_b 0.0482 0.0576 0.837 0.405
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc
<bch:expr> <bch:> <bch:> <dbl> <bch:byt>
1 names(df)[grepl("cov_", names(df))] 7.59µs 8.4µs 101975. 0B
2 grep("cov_", colnames(df), value = TRUE) 8.21µs 8.96µs 103142. 0B
3 grep("cov_", names(df), value = TRUE) 6.96µs 7.43µs 128694. 0B
4 df %>% select(starts_with("cov_")) %>% colnames 1.17ms 1.33ms 636. 5.39KB
library(broom)
purrr::map_df(grep("cov_", names(df), value = TRUE),
~tidy(lm(reformulate(c('cont_a', 'cont_b', .x), 'dep'), data = df)))
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 0.472 0.0812 5.81 0.0000000799
#2 cont_a -0.103 0.0983 -1.05 0.296
#3 cont_b 0.172 0.0990 1.74 0.0848
#4 cov_a -0.0455 0.0581 -0.783 0.436
#5 (Intercept) 0.415 0.0787 5.27 0.000000846
#6 cont_a -0.0874 0.0984 -0.888 0.377
#7 cont_b 0.181 0.0980 1.84 0.0682
#8 cov_b 0.0482 0.0576 0.837 0.405
purrr::map(grep("cov_", colnames(df), value = TRUE),
function(x){
df2 <- select(df, x, cont_a, cont_b, dep)
tidy(lm(dep ~ ., df2))
})
library(fixest)
set.seed(123)
n = 100
df = data.frame(cov_a = rbinom(n, 1, prob = 0.5),
cov_b = rbinom(n, 1, prob = 0.5),
cont_a = runif(n), cont_b = runif(n),
dep = runif(n))
# Estimation: sw means stepwise
res = feols(dep ~ sw(cov_a, cov_b) + cont_a + cont_b, df)
# Display the results
etable(res, order = "Int|cov")
#> model 1 model 2
#> Dependent Var.: dep dep
#>
#> (Intercept) 0.4722*** (0.0812) 0.4148*** (0.0788)
#> cov_a -0.0455 (0.0581)
#> cov_b 0.0482 (0.0576)
#> cont_a -0.1033 (0.0983) -0.0874 (0.0984)
#> cont_b 0.1723. (0.0990) 0.1808. (0.0980)
#> _______________ __________________ __________________
#> S.E. type Standard Standard
#> Observations 100 100
#> R2 0.04823 0.04910
#> Adj. R2 0.01849 0.01938
# Get the results into tidyverse
library(broom)
lapply(as.list(res), function(x) tidy(x))
#> [[1]]
#> # A tibble: 4 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.472 0.0812 5.81 0.0000000799
#> 2 cov_a -0.0455 0.0581 -0.783 0.436
#> 3 cont_a -0.103 0.0983 -1.05 0.296
#> 4 cont_b 0.172 0.0990 1.74 0.0848
#>
#> [[2]]
#> # A tibble: 4 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.415 0.0787 5.27 0.000000846
#> 2 cov_b 0.0482 0.0576 0.837 0.405
#> 3 cont_a -0.0874 0.0984 -0.888 0.377
#> 4 cont_b 0.181 0.0980 1.84 0.0682
library("data.table")
covariables <- c("cov_a", "cov_b")
# or, according to what you want to do :
covariables <- names(df)[grepl(names(df), pattern = "cov_")]
formulas <- paste0("dep ~ cont_a + cont_b + ",covariables )
res <- df[, lapply(formulas, function(x) list(lm(x, data=.SD)))]
summary.lm(res$V1[[1]])
summary.lm(res$V2[[1]])
library(tidyverse)
library(broom)
library("data.table")
library(microbenchmark)
test <- microbenchmark(dt = {
covariables <- c("cov_a", "cov_b")
covariables <- names(df)[grepl(names(df), pattern = "cov_")]
formulas <- paste0("dep ~ cont_a + cont_b + ",covariables )
res <- df[, lapply(formulas, function(x) list(lm(x, data=.SD)))]
summary.lm(res$V1[[1]])
summary.lm(res$V2[[1]])
},
broom = {
df_tbl <- as_tibble(df)
df_tbl %>%
summarise(across(starts_with("cov_"),
~ list(tidy(lm(
reformulate(c("cont_a","cont_b", cur_column()), "dep"),
data = df_tbl))))) %>%
pivot_longer(cols = everything(),
names_to = "var",
values_to = "res") %>%
unnest(res)
}, gep_cov = {
gen_model_expr <- function(var) {
form = paste("dep ~ cont_a + cont_b +", var)
tidy(lm(as.formula(form), data = df))
}
grep("cov_", names(df), value = TRUE) %>%
map(gen_model_expr)
}, times = 100)
> test
Unit: milliseconds
expr min lq mean median uq max neval cld
dt 2.3403 2.63330 3.116303 2.8403 3.21620 10.5219 100 a
broom 18.5069 20.40585 22.711605 21.3388 24.04675 46.8398 100 c
gep_cov 7.7221 8.39180 10.086074 9.4315 10.54345 21.0377 100 b