R 以逐步方式比较系数_R - Fatal编程技术网

R 以逐步方式比较系数

R 以逐步方式比较系数,r,R,我使用mtcars数据来解释我的问题。例如，我试图用mpg作为因变量来估计变量cyl的回归系数，并通过包括其他变量来评估系数的变化步骤1：lm（mpg~cyl，data=df）获取cyl的粗系数第2步：在第1步的模型中每次添加一个其他变量，选择cyl系数变化最大（%）的变量，并将该变量添加到上述模型中第3步：重复第2步，将剩余变量的每个变量添加到上述模型中，再次找到系数变化最大的一个“cyl” 步骤：重复，直到所有变量都包含在模型中 library(dplyr) df <- mtca

我使用 `mtcars` 数据来解释我的问题。例如，我想以 `mpg` 作为因变量来估计变量 `cyl` 的回归系数，并通过逐步加入其他变量来评估该系数的变化。

步骤1：用 `lm(mpg ~ cyl, data = df)` 得到 `cyl` 的粗（未调整）系数。
步骤2：在步骤1的模型中每次单独加入一个其他变量，选出使 `cyl` 系数变化（%）最大的变量，并将该变量加入上述模型。
步骤3：重复步骤2，把剩余变量逐个加入上一步得到的模型，再次找出使 `cyl` 系数变化最大的那个变量。
步骤4：重复上述过程，直到所有变量都已包含在模型中。
library(dplyr)
# Keep the outcome (mpg), the coefficient of interest (cyl) and the three
# candidate covariates, in that column order.
df <- mtcars %>% select(mpg, cyl, disp, hp, wt)

#' Stepwise change-in-estimate table for the `cyl` coefficient
#'
#' Starts from the crude model `mpg ~ cyl`. At each step every remaining
#' candidate column is added, one at a time, to the current model; the
#' candidate that moves the `cyl` coefficient by the largest absolute
#' percentage (relative to the current model) is kept. This repeats until all
#' candidates are in the model.
#'
#' @param df A data frame containing `mpg` and `cyl`. Columns 3 and onwards
#'   are the candidate covariates (columns 1 and 2 are never candidates).
#' @return A data frame with one row for the crude model plus one row per
#'   candidate, and columns `Models`, `Estimate` (the `cyl` coefficient after
#'   the step), `Diff` (signed change versus the previous step) and
#'   `Change (%)` (absolute percentage change versus the previous step).
my_fun1 <- function(df = data) {
  stopifnot(is.data.frame(df), length(df) >= 2L)

  # Second coefficient of `mpg ~ cyl + <extra>`, i.e. the one for `cyl`.
  # Formulas are built from column *names*: terms such as `df[[i]]` or
  # `eval(as.name(picked))` are re-evaluated lazily on every refit, so they
  # silently swap covariates once the loop variables change.
  cyl_estimate <- function(extra) {
    model <- lm(reformulate(c("cyl", extra), response = "mpg"), data = df)
    model$coefficients[[2L]]
  }

  candidates <- names(df)[-(1:2)]
  n_steps <- length(candidates)

  # Preallocate every column at full length up front.
  out_df <- data.frame(
    Models = c("Crude", rep(NA_character_, n_steps)),
    Estimate = NA_real_,
    Diff = NA_real_,
    `Change (%)` = NA_real_,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  current <- cyl_estimate(character(0))
  out_df$Estimate[1L] <- current
  chosen <- character(0)

  for (step in seq_len(n_steps)) {
    estimates <- vapply(
      candidates,
      function(v) cyl_estimate(c(chosen, v)),
      numeric(1)
    )
    change <- abs(100 * (estimates - current) / current)
    if (anyNA(change)) {
      stop(
        "cyl coefficient change is not estimable when adding: ",
        paste(candidates[is.na(change)], collapse = ", "),
        call. = FALSE
      )
    }
    # On an exact tie keep the last candidate, as the original `>=` scan did.
    best <- max(which(change == max(change)))

    row <- step + 1L
    out_df$Models[row] <- paste("+", candidates[best])
    out_df$Estimate[row] <- estimates[[best]]
    out_df$Diff[row] <- estimates[[best]] - current
    out_df[["Change (%)"]][row] <- change[[best]]

    chosen <- c(chosen, candidates[best])
    candidates <- candidates[-best]
    current <- estimates[[best]]
  }

  out_df
}

# Run the stepwise comparison on the reduced data frame; the result table
# is printed at top level.
my_fun1(df = df)

但是，步骤1和2给出了正确的结果，而步骤3和4的结果不正确。如有任何建议，将不胜感激。
利用 R 的向量化特性、避免繁琐的 `for` 循环，可以让这件事更容易一些：
#' Stepwise change in the `cyl` coefficient, measured against the crude model
#'
#' Fits `mpg ~ cyl`, then repeatedly adds whichever remaining candidate
#' column moves the `cyl` coefficient furthest (in absolute terms) from the
#' crude estimate, keeping every previously selected column in the model.
#'
#' @param dat A data frame containing `mpg`, `cyl` and the candidate columns.
#' @param i Integer positions (into `dat`) of the candidate columns.
#' @return A numeric matrix with rows `"crude"`, `"+ <name>"`, ... in order of
#'   selection and columns `Estimate`, `Diff` (`abs(crude - estimate)`) and
#'   `Change (%)` (`100 / crude * Diff`).
my_fun2 <- function(dat, i) {
  # Spell out `coefficients`; `$coef` / `$coe` only work via partial matching.
  crude <- lm(mpg ~ cyl, data = dat)$coefficients[[2L]]

  # cyl coefficient of `mpg ~ cyl + <columns at positions cols>`.
  cyl_with <- function(cols) {
    rhs <- c("cyl", names(dat)[cols])
    lm(reformulate(rhs, response = "mpg"), data = dat)$coefficients[[2L]]
  }

  # Selection state lives in plain locals instead of being mutated via `<<-`.
  remaining <- i
  bests <- i[0L]
  steps <- vector("list", length(i))

  for (s in seq_along(i)) {
    beta <- vapply(
      remaining,
      function(j) cyl_with(c(bests, j)),
      numeric(1)
    )
    beta_d <- abs(crude - beta)
    largest <- which.max(beta_d)
    if (length(largest) == 0L) {
      stop("no estimable cyl coefficient among the remaining candidates",
           call. = FALSE)
    }
    # NOTE(review): the percentage keeps the sign of `crude` (it is negative
    # when `crude` is negative even though Diff is absolute); preserved as-is
    # so existing output does not change.
    steps[[s]] <- c(beta[largest], beta_d[largest],
                    100 / crude * beta_d[largest])
    bests <- c(bests, remaining[largest])
    remaining <- remaining[-largest]
  }

  res <- rbind(c(crude, NA, NA), do.call(rbind, steps))
  # sprintf() returns character(0) for no selections, so an empty `i`
  # yields a valid one-row matrix.
  dimnames(res) <- list(
    c("crude", sprintf("+ %s", names(dat)[bests])),
    c("Estimate", "Diff", "Change (%)")
  )
  res
}

检查
检查 `Diff` 列的值：
# Crude model; its cyl coefficient is the reference for every Diff below.
fit <- lm(mpg ~ cyl, data = mtcars[c("mpg", "cyl", "disp", "hp", "wt")])
# Step 1: |crude - adjusted| when each candidate is added on its own.
vapply(c("disp", "hp", "wt"), function(x) {
  adj <- update(fit, reformulate(c("cyl", x), response = "mpg"))
  abs(coef(fit)[["cyl"]] - coef(adj)[["cyl"]])
}, numeric(1))
#      disp        hp        wt 
# 1.2885133 0.6110965 1.3679952 
# Step 2: wt is already in the model; try the two remaining candidates.
vapply(c("disp", "hp"), function(x) {
  adj <- update(fit, reformulate(c("cyl", "wt", x), response = "mpg"))
  abs(coef(fit)[["cyl"]] - coef(adj)[["cyl"]])
}, numeric(1))
#     disp       hp 
# 1.090847 1.934173 
# Step 3: wt and hp are in the model; only disp is left.
vapply(c("disp"), function(x) {
  adj <- update(fit, reformulate(c("cyl", "wt", "hp", x), response = "mpg"))
  abs(coef(fit)[["cyl"]] - coef(adj)[["cyl"]])
}, numeric(1))
#    disp 
# 1.58247 

您对 `for` 循环的看法完全正确。不过，我希望 `+ hp` 行中的值来自 lm(mpg ~ cyl + wt + hp)，而在您的代码中它们来自 lm(mpg ~ cyl + hp)。同样，`+ disp` 行中的值应该来自 lm(mpg ~ cyl + wt + hp + disp)，而不是 lm(mpg ~ cyl + disp)。我会进一步研究您的代码，因为其中有些函数我以前没有用过。非常感谢您的帮助。
@ZhiqiangWang 啊，我明白了，请看更新。Map(`:`, 3, 3:ncol(dat)) 会生成一个逐步增长的序列，供 sapply() 使用。我还补充了一些注释。我假设您希望输出的是相对于粗（crude）估计值的绝对差异/变化。
您的速度真惊人，我还在努力理解您之前的代码。在每一步中，我想从剩余变量中选出使 `cyl` 系数变化（%）最大的那个变量。
@ZhiqiangWang 啊，目前它会计算所有剩余变量的系数，等我回到电脑前会修正这一点。顺便问一下，您看过 step() 函数吗？它能满足您的需求吗？参见 ?step。
太棒了！我快速看了一下 step()，它并不是我想要的。谢谢！
# Columns 3:5 (disp, hp, wt) are the candidate covariates. In the output
# below, Change (%) is negative because it is computed as 100 / crude * Diff
# and the crude cyl estimate is negative.
my_fun2(mtcars[c("mpg", "cyl", "disp", "hp", "wt")], i=3:5)
#          Estimate     Diff Change (%)
# crude  -2.8757901       NA         NA
# + wt   -1.5077950 1.367995  -47.56937
# + hp   -0.9416168 1.934173  -67.25711
# + disp -1.2933197 1.582470  -55.02733

# Crude model; its cyl coefficient is the reference for every Diff below.
fit <- lm(mpg ~ cyl, data = mtcars[c("mpg", "cyl", "disp", "hp", "wt")])
# Step 1: |crude - adjusted| when each candidate is added on its own.
vapply(c("disp", "hp", "wt"), function(x) {
  adj <- update(fit, reformulate(c("cyl", x), response = "mpg"))
  abs(coef(fit)[["cyl"]] - coef(adj)[["cyl"]])
}, numeric(1))
#      disp        hp        wt 
# 1.2885133 0.6110965 1.3679952 
# Step 2: wt is already in the model; try the two remaining candidates.
vapply(c("disp", "hp"), function(x) {
  adj <- update(fit, reformulate(c("cyl", "wt", x), response = "mpg"))
  abs(coef(fit)[["cyl"]] - coef(adj)[["cyl"]])
}, numeric(1))
#     disp       hp 
# 1.090847 1.934173 
# Step 3: wt and hp are in the model; only disp is left.
vapply(c("disp"), function(x) {
  adj <- update(fit, reformulate(c("cyl", "wt", "hp", x), response = "mpg"))
  abs(coef(fit)[["cyl"]] - coef(adj)[["cyl"]])
}, numeric(1))
#    disp 
# 1.58247