Warning: file_get_contents(/data/phpspider/zhask/data//catemap/0/jpa/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 通过在mutate()中只改变一个自变量来拟合多个回归模型_R_Dplyr_Regression_Tidyverse - Fatal编程技术网

R 通过在mutate()中只改变一个自变量来拟合多个回归模型

R 通过在mutate()中只改变一个自变量来拟合多个回归模型,r,dplyr,regression,tidyverse,R,Dplyr,Regression,Tidyverse,我怀疑这个问题可能是重复的,但我没有发现任何令人满意的地方。设想一个结构如下的简单数据集: set.seed(123) df <- data.frame(cov_a = rbinom(100, 1, prob = 0.5), cov_b = rbinom(100, 1, prob = 0.5), cont_a = runif(100), cont_b = runif(100),

我怀疑这个问题可能是重复的,但我没有发现任何令人满意的地方。设想一个结构如下的简单数据集:

set.seed(123)
df <- data.frame(cov_a = rbinom(100, 1, prob = 0.5),
                 cov_b = rbinom(100, 1, prob = 0.5),
                 cont_a  = runif(100),
                 cont_b = runif(100),
                 dep = runif(100))

    cov_a cov_b      cont_a      cont_b          dep
1       0     1 0.238726027 0.784575267 0.9860542973
2       1     0 0.962358936 0.009429905 0.1370674714
3       0     0 0.601365726 0.779065883 0.9053095817
4       1     1 0.515029727 0.729390652 0.5763018376
5       1     0 0.402573342 0.630131853 0.3954488591
6       0     1 0.880246541 0.480910830 0.4498024841
7       1     1 0.364091865 0.156636851 0.7065019011
8       1     1 0.288239281 0.008215520 0.0825027458
9       1     0 0.170645235 0.452458394 0.3393125802
10      0     0 0.172171746 0.492293329 0.6807875512

但是,我希望避免使用double-
map()
,并使用更直接或更优雅的方法来解决它。

我不确定这是否会被视为更直接/优雅,但我的解决方案不使用double
map

library(tidyverse)
library(broom)

gen_model_expr <- function(var) {
  form = paste("dep ~ cont_a + cont_b +", var)
  tidy(lm(as.formula(form), data = df))
}

grep("cov_", names(df), value = TRUE) %>%
  map(gen_model_expr)

我提出两种方法:

第一个很无聊。我们可以使用{dplyr}的
rowwise
符号代替
purrr::map
。这种方法有两种风格。在
rowwise
之后,我们可以使用(A)
mutate%>%unest
或者(B),我们可以使用
group\u map
。在这两种方法中,我都避免了重复数据,但如果需要,我们可以轻松地将数据包含在每一行中(在设置
tibble
时,我们可以执行
tibble(myvar=,data=list(df))
)。虽然(A)给出了一个包含所有数据的TIBLE,但(B)中的
组映射
返回了一个类似于原始示例中的“双映射”方法的列表

第二种方法,我认为它相当“新鲜”(虽然不太直接),因为它既不使用<代码>行> < /代码>也不>代码> map < /代码>。在这里,我们使用{dplyr}的

cross
函数和
cur\u column()
,为每个输出创建一个新的列,然后
pivot\u long
unnest
将所有结果放在一个
tible

最后一个基准测试显示:“doulbe_map”速度最慢(因为数据列重复),其次是“cross”和“row_unnest”,而“row_group_map”速度相当快。然而,最快的方法是@latlio使用
map
和一个自定义函数(下面称为“map\u custom\u fun”),但尽管它使用
purr
,它可能不太“dplyr ish”

库(tidyverse)
图书馆(扫帚)
种子集(123)
df%变异%>%unnest
df%>%
选择(以(“cov”)开始)%>%
colnames%>%
TIBLE(myvar=)%%>%
行()
突变(res=list(tidy(lm(dep~cont_a+cont_b+eval(sym(.data$myvar)),data=df)))%>%
unnest(res)
#>#A tibble:8 x 6
#>myvar术语估计标准误差统计p值
#>                                           
#>1 cov_a(截距)0.472 0.0812 5.81 0.0000000799
#>2 cov_a cont_a-0.103 0.0983-1.05 0.296
#>3 cov_a cont_b 0.172 0.0990 1.74 0.0848
#>4 cov_a eval(sym(.data$myvar))-0.0455 0.0581-0.783 0.436
#>5 cov_b(截距)0.415 0.0787 5.27 0.000000846
#>6 cov_b cont_a-0.0874 0.0984-0.888 0.377
#>7 cov_b cont_b 0.181 0.0980 1.84 0.0682
#>8 cov_b eval(sym(.data$myvar))0.0482 0.0576 0.837 0.405
#变更B:
#行%>%组映射
df%>%
选择(以(“cov”)开始)%>%
colnames%>%
TIBLE(myvar=)%%>%
行()
组映射(.keep=TRUE,
~tidy(lm(dep~cont_a+cont_b+eval(sym(myvar)),data=df)))
#> [[1]]
#>#A tibble:4 x 5
#>术语估计标准误差统计p值
#>                                       
#>1(截距)0.472 0.0812 5.81 0.0000000799
#>2续-0.103 0.0983-1.05 0.296
#>3续0.172 0.0990 1.74 0.0848
#>4评估(sym(.x$myvar))-0.0455 0.0581-0.783 0.436
#> 
#> [[2]]
#>#A tibble:4 x 5
#>术语估计标准误差统计p值
#>                                      
#>1(截距)0.415 0.0787 5.27 0.000000846
#>2续-0.0874 0.0984-0.888 0.377
#>3续0.181 0.0980 1.84 0.0682
#>4评估(sym(.x$myvar))0.0482 0.0576 0.837 0.405
#第二种方法:使用摘要(跨)
#我们这里需要一个“tibble”,否则打印会搞砸
df_tbl%
总结(以“cov”开头),
~list(整洁的(lm)(
重新编制(c(“cont_a”、“cont_b”、“cur_column())、“dep”),
数据=df_tbl(()(()))%>%
pivot_更长(cols=everything(),
name_to=“var”,
值_to=“res”)%>%
unnest(res)
#>#A tibble:8 x 6
#>var术语估计标准误差统计p值
#>                                
#>1 cov_a(截距)0.472 0.0812 5.81 0.0000000799
#>2 cov_a cont_a-0.103 0.0983-1.05 0.296
#>3 cov_a cont_b 0.172 0.0990 1.74 0.0848
#>4 cov_a cov_a-0.0455 0.0581-0.783 0.436
#>5 cov_b(截距)0.415 0.0787 5.27 0.000000846
#>6 cov_b cont_a-0.0874 0.0984-0.888 0.377
#>7 cov_b cont_b 0.181 0.0980 1.84 0.0682
#>8 cov_b cov_b 0.0482 0.0576 0.837 0.405
由(v0.3.0)于2021-01-10创建

基准

#>单位:毫秒
#>expr最小lq平均uq最大neval
#>double_地图20.92847 22.238626 23.645280 22.726705 25.002493 34.29351 100
#>世界其他地区15.30179 15.835506 16.714873 16.358134 17.314802 20.60496 100
#>row_groupmap 10.1016810.49037411.23739810.70952411.45267720.40186100
#>穿过16.47369 17.117178 18.593908 17.945136 19.431190 29.29384 100
#>map_custom_fun 6.85758 7.311608 7.953066 7.611394 8.305757 19.57006 100

这里是另一个版本:

library(broom)

purrr::map_df(grep("cov_", names(df), value = TRUE), 
   ~tidy(lm(reformulate(c('cont_a', 'cont_b', .x), 'dep'), data = df)))

#  term        estimate std.error statistic      p.value
#  <chr>          <dbl>     <dbl>     <dbl>        <dbl>
#1 (Intercept)   0.472     0.0812     5.81  0.0000000799
#2 cont_a       -0.103     0.0983    -1.05  0.296       
#3 cont_b        0.172     0.0990     1.74  0.0848      
#4 cov_a        -0.0455    0.0581    -0.783 0.436       
#5 (Intercept)   0.415     0.0787     5.27  0.000000846 
#6 cont_a       -0.0874    0.0984    -0.888 0.377       
#7 cont_b        0.181     0.0980     1.84  0.0682      
#8 cov_b         0.0482    0.0576     0.837 0.405       
库(扫帚)
purrr::map_df(grep(“cov_”),name(df),value=TRUE),
~tidy(lm(重新格式化(c('cont_a','cont_b',.x),'dep'),data=df)))
#术语估计标准误差
[[1]]
# A tibble: 4 x 5
  term        estimate std.error statistic      p.value
  <chr>          <dbl>     <dbl>     <dbl>        <dbl>
1 (Intercept)   0.472     0.0812     5.81  0.0000000799
2 cont_a       -0.103     0.0983    -1.05  0.296       
3 cont_b        0.172     0.0990     1.74  0.0848      
4 cov_a        -0.0455    0.0581    -0.783 0.436       

[[2]]
# A tibble: 4 x 5
  term        estimate std.error statistic     p.value
  <chr>          <dbl>     <dbl>     <dbl>       <dbl>
1 (Intercept)   0.415     0.0787     5.27  0.000000846
2 cont_a       -0.0874    0.0984    -0.888 0.377      
3 cont_b        0.181     0.0980     1.84  0.0682     
4 cov_b         0.0482    0.0576     0.837 0.405 
# A tibble: 4 x 13
  expression                                         min median `itr/sec` mem_alloc
  <bch:expr>                                      <bch:> <bch:>     <dbl> <bch:byt>
1 names(df)[grepl("cov_", names(df))]             7.59µs  8.4µs   101975.        0B
2 grep("cov_", colnames(df), value = TRUE)        8.21µs 8.96µs   103142.        0B
3 grep("cov_", names(df), value = TRUE)           6.96µs 7.43µs   128694.        0B
4 df %>% select(starts_with("cov_")) %>% colnames 1.17ms 1.33ms      636.    5.39KB
library(broom)

purrr::map_df(grep("cov_", names(df), value = TRUE), 
   ~tidy(lm(reformulate(c('cont_a', 'cont_b', .x), 'dep'), data = df)))

#  term        estimate std.error statistic      p.value
#  <chr>          <dbl>     <dbl>     <dbl>        <dbl>
#1 (Intercept)   0.472     0.0812     5.81  0.0000000799
#2 cont_a       -0.103     0.0983    -1.05  0.296       
#3 cont_b        0.172     0.0990     1.74  0.0848      
#4 cov_a        -0.0455    0.0581    -0.783 0.436       
#5 (Intercept)   0.415     0.0787     5.27  0.000000846 
#6 cont_a       -0.0874    0.0984    -0.888 0.377       
#7 cont_b        0.181     0.0980     1.84  0.0682      
#8 cov_b         0.0482    0.0576     0.837 0.405       
purrr::map(grep("cov_", colnames(df), value = TRUE), 
           function(x){
             df2 <- select(df, x, cont_a, cont_b, dep)
             tidy(lm(dep ~ ., df2))
           })
library(fixest)

set.seed(123)
n = 100
df = data.frame(cov_a = rbinom(n, 1, prob = 0.5),
                cov_b = rbinom(n, 1, prob = 0.5),
                cont_a  = runif(n), cont_b = runif(n),
                dep = runif(n))

# Estimation: sw means stepwise
res = feols(dep ~ sw(cov_a, cov_b) + cont_a + cont_b, df)
# Display the results
etable(res, order = "Int|cov")
#>                            model 1            model 2
#> Dependent Var.:                dep                dep
#>                                                      
#> (Intercept)     0.4722*** (0.0812) 0.4148*** (0.0788)
#> cov_a             -0.0455 (0.0581)                   
#> cov_b                                 0.0482 (0.0576)
#> cont_a            -0.1033 (0.0983)   -0.0874 (0.0984)
#> cont_b            0.1723. (0.0990)   0.1808. (0.0980)
#> _______________ __________________ __________________
#> S.E. type                 Standard           Standard
#> Observations                   100                100
#> R2                         0.04823            0.04910
#> Adj. R2                    0.01849            0.01938
# Get the results into tidyverse
library(broom)
lapply(as.list(res), function(x) tidy(x))
#> [[1]]
#> # A tibble: 4 x 5
#>   term        estimate std.error statistic      p.value
#>   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
#> 1 (Intercept)   0.472     0.0812     5.81  0.0000000799
#> 2 cov_a        -0.0455    0.0581    -0.783 0.436       
#> 3 cont_a       -0.103     0.0983    -1.05  0.296       
#> 4 cont_b        0.172     0.0990     1.74  0.0848      
#> 
#> [[2]]
#> # A tibble: 4 x 5
#>   term        estimate std.error statistic     p.value
#>   <chr>          <dbl>     <dbl>     <dbl>       <dbl>
#> 1 (Intercept)   0.415     0.0787     5.27  0.000000846
#> 2 cov_b         0.0482    0.0576     0.837 0.405      
#> 3 cont_a       -0.0874    0.0984    -0.888 0.377      
#> 4 cont_b        0.181     0.0980     1.84  0.0682
library("data.table")
covariables <- c("cov_a", "cov_b")
# or, according to what you want to do :  
covariables <- names(df)[grepl(names(df), pattern = "cov_")]
formulas <- paste0("dep ~ cont_a + cont_b + ",covariables )
res <- df[, lapply(formulas, function(x) list(lm(x, data=.SD)))]
summary.lm(res$V1[[1]])
summary.lm(res$V2[[1]])

library(tidyverse)
library(broom)
library("data.table")

library(microbenchmark)

test <- microbenchmark(dt = {
covariables <- c("cov_a", "cov_b") 
covariables <- names(df)[grepl(names(df), pattern = "cov_")]
formulas <- paste0("dep ~ cont_a + cont_b + ",covariables )
res <- df[, lapply(formulas, function(x) list(lm(x, data=.SD)))]
summary.lm(res$V1[[1]])
summary.lm(res$V2[[1]])
},
broom = {


df_tbl <- as_tibble(df)

df_tbl %>% 
  summarise(across(starts_with("cov_"), 
                   ~ list(tidy(lm(
                     reformulate(c("cont_a","cont_b", cur_column()), "dep"),
                     data = df_tbl))))) %>% 
  pivot_longer(cols = everything(),
               names_to = "var",
               values_to = "res") %>% 
  unnest(res)
}, gep_cov = {
  gen_model_expr <- function(var) {
    form = paste("dep ~ cont_a + cont_b +", var)
    tidy(lm(as.formula(form), data = df))
  }
  
  grep("cov_", names(df), value = TRUE) %>%
    map(gen_model_expr)
}, times = 100)

> test
Unit: milliseconds
    expr     min       lq      mean  median       uq     max neval cld
      dt  2.3403  2.63330  3.116303  2.8403  3.21620 10.5219   100 a  
   broom 18.5069 20.40585 22.711605 21.3388 24.04675 46.8398   100   c
 gep_cov  7.7221  8.39180 10.086074  9.4315 10.54345 21.0377   100  b