如何使用两个数据帧运行带有for循环的线性回归_R_Loops_Dataframe_For Loop_Linear Regression

如何使用两个数据帧运行带有for循环的线性回归

r loops dataframe for-loop

如何使用两个数据帧运行带有for循环的线性回归,r,loops,dataframe,for-loop,linear-regression,R,Loops,Dataframe,For Loop,Linear Regression,我试图使用一个4列的数据框（所有这些都是因变量）和一个194行212列的数据框进行一个简单的线性回归分析。我有5个其他数据帧用作同一分析的因变量我已经达到了预期的结果，但是我需要扩展它，我尝试添加一个额外的for循环（对于因变量的列），但是我还需要同时创建更多的空列表我想知道我将如何实现这一点我的循环电流为： y <- data.frame(Green_Class_Commercial[,-1]) x <- data.frame(lagvar[1:175,c(-1,-2)])

我试图使用一个4列的数据框（所有这些都是因变量）和一个194行212列的数据框进行一个简单的线性回归分析。我有5个其他数据帧用作同一分析的因变量

我已经达到了预期的结果，但是我需要扩展它，我尝试添加一个额外的for循环（对于因变量的列），但是我还需要同时创建更多的空列表

我想知道我将如何实现这一点

我当前的循环如下：

# Responses: every column of Green_Class_Commercial except the first.
# Predictors: rows 1-175 of lagvar with its first two columns dropped.
y <- data.frame(Green_Class_Commercial[, -1])
x <- data.frame(lagvar[1:175, c(-1, -2)])

# Preallocate one result row per predictor instead of growing an empty
# data frame cell by cell inside the loop.
out <- data.frame(
  Variable = names(x),
  Intercept = rep(NA_real_, length(x)),
  Coefficient = rep(NA_real_, length(x)),
  `P-val` = rep(NA_real_, length(x)),
  `R-square` = rep(NA_real_, length(x)),
  check.names = FALSE,        # keep "P-val" / "R-square" exactly as written
  stringsAsFactors = FALSE
)

# seq_along() gives an empty sequence when x has no columns, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(x)) {
  m <- summary(lm(y[, 1] ~ x[, i]))            # first response ~ predictor i
  out[i, "Intercept"] <- m$coefficients[1, 1]
  out[i, "Coefficient"] <- m$coefficients[2, 1]
  out[i, "P-val"] <- m$coefficients[2, 4]      # p-value of the slope
  out[i, "R-square"] <- m$r.squared
}
head(out)

这是我想要运行回归的变量

# The x variable: dput() output for a 20-row sample of the predictor data.
# GDP.SC is the base series; in this sample each GDP.SCk column equals GDP.SC
# shifted down by k rows, with the first k entries set to NA.
# The expression is not assigned to a name; evaluating it only returns the
# data frame.
structure(list(GDP.SC = c(154698, 154698, 154698, 154698, 154698, 
154698, 154698, 154698, 154698, 154698, 160138.4, 160138.4, 160138.4, 
160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4
), GDP.SC1 = c(NA, 154698, 154698, 154698, 154698, 154698, 154698, 
154698, 154698, 154698, 154698, 160138.4, 160138.4, 160138.4, 
160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4), 
    GDP.SC2 = c(NA, NA, 154698, 154698, 154698, 154698, 154698, 
    154698, 154698, 154698, 154698, 154698, 160138.4, 160138.4, 
    160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4
    ), GDP.SC3 = c(NA, NA, NA, 154698, 154698, 154698, 154698, 
    154698, 154698, 154698, 154698, 154698, 154698, 160138.4, 
    160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4
    ), GDP.SC4 = c(NA, NA, NA, NA, 154698, 154698, 154698, 154698, 
    154698, 154698, 154698, 154698, 154698, 154698, 160138.4, 
    160138.4, 160138.4, 160138.4, 160138.4, 160138.4)), row.names = c(NA, 
20L), class = "data.frame")

# The y variable: dput() output for a 20-row sample of the response data.
# X is a row index (1:20); ComBus, ComCon, ComNoo and ComOO hold values
# between 0 and 1, and ComCon contains NA entries.
# NOTE(review): presumably these are the default probabilities mentioned in
# the discussion; confirm against the full data set.
structure(list(X = 1:20, ComBus = c(0.83, 0, 0.23, 0.09, 0.1, 
0.11, 0.15, 0.18, 0.37, 0.19, 0, 0.18, 0.09, 0.1, 0.03, 0.5, 
0.14, 0.17, 0.11, 0.06), ComCon = c(NA, 0, 0, 0, 0, 0.5, 0, 0, 
NA, 0.67, 0, 0, 0, 0, 0.5, 0, 0, NA, 1, 0), ComNoo = c(0.25, 
0.14, 0.38, 0.17, 0.14, 0.33, 0.44, 0.05, 0.04, 0.1, 0.18, 0.06, 
0.23, 0.14, 0.5, 0.14, 0.5, 0, 0.14, 0.23), ComOO = c(0, 0, 0, 
0, 0, 0.33, 0, 0, 0, 0.18, 0.22, 0.15, 0, 0, 0.17, 0, 0, 0, 0, 
0)), row.names = c(NA, 20L), class = "data.frame")

好的，这个方案对你有帮助吗？如果可以的话，我会用 apply 系列函数来代替循环。

### Some dummy dataframes
# Four independent 10-row frames with the same layout: v1 and v2 are
# standard-normal draws, v3 and v4 are uniform draws between 1 and 1000.
# rnorm(10) states the sample size directly; rnorm(1:10) only produced ten
# values because rnorm() falls back to length(n) when n is a vector.
x <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)
x2 <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)
y <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)
y2 <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)

###
# I tend to prefer the apply family of functions to replace loops where possible.
# This function takes two inputs, dataframes of dependent and independent variables.
# the apply function here takes the x_df and applies the following anonymous function to each column
# so for each column in x_df it performs a lm against the first column of y_df

# Fit one simple linear regression per predictor column.
#
# y_df: data frame whose first column is used as the response.
# x_df: data frame of predictors; each column is used as the single
#       regressor of one model. A matrix is converted to a data frame first.
# Returns a list of lm objects, one per column of x_df, named after those
# columns.
#
# lapply() walks the columns of the data frame directly. apply() would first
# coerce x_df to a matrix, which turns every column into character as soon
# as a single column is non-numeric.
lm_func <- function(y_df, x_df) {
  if (!is.data.frame(x_df)) {
    x_df <- as.data.frame(x_df)
  }
  lapply(x_df, function(x) {
    lm(y_df[, 1] ~ x)
  })
}

# Fit one model per column of x against the first column of y.
results_list <- lm_func(y, x)

# results_list holds one complete lm object per predictor, so the full fit
# stays available if it is needed later.

# rbindlist() (data.table) stacks the one-row summaries produced by glance()
# (broom) into a single table; idcol = "var_name" adds a column holding the
# name of each list element.

library(data.table)
library(broom)

results_glance <- rbindlist(lapply(results_list, glance), idcol = "var_name")

# or keep the per-model summaries as a list if you wish
results_list_glance <- lapply(results_list, glance)

# To reuse one x data frame with several y data frames, iterate over the y
# frames with mapply() and pass x through MoreArgs.
# SIMPLIFY is spelled out as FALSE because T and F are ordinary variables
# that can be reassigned; it keeps the result a list, one element per y frame.
results_list_m <- mapply(
  lm_func,
  y_df = list(y, y2),
  MoreArgs = list(x_df = x),  # arguments held fixed across all calls
  SIMPLIFY = FALSE
)

# the output is a little fiendish because it will be a list of lists
# we can include the rbindlist and glance into the function to make the output a little simpler:


# Fit one simple linear regression per predictor column and return the
# model-level summaries as a single table.
#
# y_df: data frame whose first column is used as the response.
# x_df: data frame of predictors; each column is the single regressor of
#       one model. A matrix is converted to a data frame first.
# Returns the result of data.table::rbindlist() applied to one
# broom::glance() summary per predictor, with the predictor name stored in
# the "var_name" column.
#
# lapply() is used instead of apply() because apply() coerces the data frame
# to a matrix, turning every column into character when any column is
# non-numeric. The package-qualified calls do not depend on library() having
# been called before this function runs.
lm_func_bind <- function(y_df, x_df) {
  if (!is.data.frame(x_df)) {
    x_df <- as.data.frame(x_df)
  }
  fits <- lapply(x_df, function(x) {
    lm(y_df[, 1] ~ x)
  })
  data.table::rbindlist(lapply(fits, broom::glance), idcol = "var_name")
}
# Summary table for the first column of y regressed on each column of x.
results_glance_df <- lm_func_bind(y, x)

# Same analysis for several y data frames against the one fixed x frame.
# SIMPLIFY is spelled out as FALSE because T and F are ordinary variables
# that can be reassigned; the result is a list, one table per y frame.
results_list_dfs <- mapply(
  lm_func_bind,
  y_df = list(y, y2),
  MoreArgs = list(x_df = x),  # arguments held fixed across all calls
  SIMPLIFY = FALSE
)

###一些虚拟数据帧
好的，这个方案对你有帮助吗？如果可以的话，我会用 apply 系列函数来代替循环。
### Some dummy dataframes
# Four independent 10-row frames with the same layout: v1 and v2 are
# standard-normal draws, v3 and v4 are uniform draws between 1 and 1000.
# rnorm(10) states the sample size directly; rnorm(1:10) only produced ten
# values because rnorm() falls back to length(n) when n is a vector.
x <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)
x2 <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)
y <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)
y2 <- data.frame(
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = runif(10, 1, 1000),
  v4 = runif(10, 1, 1000)
)

###
# I tend to prefer the apply family of functions to replace loops where possible.
# This function takes two inputs, dataframes of dependent and independent variables.
# the apply function here takes the x_df and applies the following anonymous function to each column
# so for each column in x_df it performs a lm against the first column of y_df

# Fit one simple linear regression per predictor column.
#
# y_df: data frame whose first column is used as the response.
# x_df: data frame of predictors; each column is used as the single
#       regressor of one model. A matrix is converted to a data frame first.
# Returns a list of lm objects, one per column of x_df, named after those
# columns.
#
# lapply() walks the columns of the data frame directly. apply() would first
# coerce x_df to a matrix, which turns every column into character as soon
# as a single column is non-numeric.
lm_func <- function(y_df, x_df) {
  if (!is.data.frame(x_df)) {
    x_df <- as.data.frame(x_df)
  }
  lapply(x_df, function(x) {
    lm(y_df[, 1] ~ x)
  })
}

# Fit one model per column of x against the first column of y.
results_list <- lm_func(y, x)

# results_list holds one complete lm object per predictor, so the full fit
# stays available if it is needed later.

# rbindlist() (data.table) stacks the one-row summaries produced by glance()
# (broom) into a single table; idcol = "var_name" adds a column holding the
# name of each list element.

library(data.table)
library(broom)

results_glance <- rbindlist(lapply(results_list, glance), idcol = "var_name")

# or keep the per-model summaries as a list if you wish
results_list_glance <- lapply(results_list, glance)

# To reuse one x data frame with several y data frames, iterate over the y
# frames with mapply() and pass x through MoreArgs.
# SIMPLIFY is spelled out as FALSE because T and F are ordinary variables
# that can be reassigned; it keeps the result a list, one element per y frame.
results_list_m <- mapply(
  lm_func,
  y_df = list(y, y2),
  MoreArgs = list(x_df = x),  # arguments held fixed across all calls
  SIMPLIFY = FALSE
)

# the output is a little fiendish because it will be a list of lists
# we can include the rbindlist and glance into the function to make the output a little simpler:


# Fit one simple linear regression per predictor column and return the
# model-level summaries as a single table.
#
# y_df: data frame whose first column is used as the response.
# x_df: data frame of predictors; each column is the single regressor of
#       one model. A matrix is converted to a data frame first.
# Returns the result of data.table::rbindlist() applied to one
# broom::glance() summary per predictor, with the predictor name stored in
# the "var_name" column.
#
# lapply() is used instead of apply() because apply() coerces the data frame
# to a matrix, turning every column into character when any column is
# non-numeric. The package-qualified calls do not depend on library() having
# been called before this function runs.
lm_func_bind <- function(y_df, x_df) {
  if (!is.data.frame(x_df)) {
    x_df <- as.data.frame(x_df)
  }
  fits <- lapply(x_df, function(x) {
    lm(y_df[, 1] ~ x)
  })
  data.table::rbindlist(lapply(fits, broom::glance), idcol = "var_name")
}
# Summary table for the first column of y regressed on each column of x.
results_glance_df <- lm_func_bind(y, x)

# Same analysis for several y data frames against the one fixed x frame.
# SIMPLIFY is spelled out as FALSE because T and F are ordinary variables
# that can be reassigned; the result is a list, one table per y frame.
results_list_dfs <- mapply(
  lm_func_bind,
  y_df = list(y, y2),
  MoreArgs = list(x_df = x),  # arguments held fixed across all calls
  SIMPLIFY = FALSE
)


###一些虚拟数据帧
考虑使用嵌套的 lapply：外层调用遍历因变量数据帧的每一列，内层调用则针对每个因变量遍历自变量数据帧的所有列：
# Regress one response vector on every column of a predictor data frame,
# one simple linear regression per column.
#
# yvar: response vector.
# xdf:  data frame of predictors.
# Returns a data frame with one row per column of xdf and the columns
# Variable (predictor name), Intercept, Coefficient (slope), P_val (p-value
# of the slope) and R_square.
#
# The body reads the predictors from xdf. The earlier version indexed a
# global `x`, so it ignored the xdf argument and only worked when a matching
# `x` happened to exist in the calling environment.
reg_data <- function(yvar, xdf) {
  rows <- lapply(seq_along(xdf), function(i) {
    m <- summary(lm(yvar ~ xdf[[i]]))      # run model on predictor i

    data.frame(
      Variable = names(xdf)[i],            # predictor name
      Intercept = m$coefficients[1, 1],    # intercept
      Coefficient = m$coefficients[2, 1],  # slope
      P_val = m$coefficients[2, 4],        # p-value of the slope
      R_square = m$r.squared               # R-squared
    )
  })

  do.call(rbind, rows)
}

# Call reg_data() once per response column: every column of y except the
# first is passed as yvar, with x supplied as the predictor data frame.
# The result is a list with one element per response column.
model_dfs <- lapply(y[-1], reg_data, xdf = x)

考虑使用嵌套的 lapply：外层调用遍历因变量数据帧的每一列，内层调用则针对每个因变量遍历自变量数据帧的所有列：
# Regress one response vector on every column of a predictor data frame,
# one simple linear regression per column.
#
# yvar: response vector.
# xdf:  data frame of predictors.
# Returns a data frame with one row per column of xdf and the columns
# Variable (predictor name), Intercept, Coefficient (slope), P_val (p-value
# of the slope) and R_square.
#
# The body reads the predictors from xdf. The earlier version indexed a
# global `x`, so it ignored the xdf argument and only worked when a matching
# `x` happened to exist in the calling environment.
reg_data <- function(yvar, xdf) {
  rows <- lapply(seq_along(xdf), function(i) {
    m <- summary(lm(yvar ~ xdf[[i]]))      # run model on predictor i

    data.frame(
      Variable = names(xdf)[i],            # predictor name
      Intercept = m$coefficients[1, 1],    # intercept
      Coefficient = m$coefficients[2, 1],  # slope
      P_val = m$coefficients[2, 4],        # p-value of the slope
      R_square = m$r.squared               # R-squared
    )
  })

  do.call(rbind, rows)
}

# Call reg_data() once per response column: every column of y except the
# first is passed as yvar, with x supplied as the predictor data frame.
# The result is a list with one element per response column.
model_dfs <- lapply(y[-1], reg_data, xdf = x)

将其应用于其他数据帧？这句话“我需要扩展它，我已经尝试添加一个额外的for循环”有点含糊不清，您的示例没有显示您还想做什么为什么不将所有回归数据保存在单个数据框中？嘿，有几种可能的解决方案。如果您需要多次执行相同的操作，我通常会将其作为一个函数编写，并向其提供数据帧（或将其映射到数据帧）。类似这样的东西，将每个lm
作为列表元素输出：lm_func@StupidWolf很抱歉，因此y=Green\u Class_Commercial
是一个包含175行和4列的数据帧，我还有5个类似的数据帧，我需要用in替换y（所有数据帧都有四列）我需要成为回归模型中的响应变量。X是一个数据帧，有194行212列（我需要用212个变量中的每一个一次运行一个简单的线性回归）。在我当前的代码中，我已经为Green_Class_Commercial dataframe的一列完成了这项工作，我希望在第2-4列中再次完成这项工作。如果有帮助，请告诉我thanks@Parfait感谢您的回复，这将使数据集的排序更加困难。Y变量表示在南卡罗来纳州格林维尔有人拖欠贷款的概率，这是我的教授给我的一个数据集，我目前的课程中，许多概率是根据状态（分类或通过）和位置（格林维尔或其他）以及其他6个不同的部门进行筛选的（消费者业务、商业业务等）将其应用于其他数据帧？这句话“我需要扩展它，我尝试添加一个额外的for循环”有点模糊，你的例子没有显示你还想做什么为什么不将所有回归数据保存在一个数据框中？嘿，有几种可能的解决方案。如果你需要做很多次同样的事情，我通常会将它作为一个函数编写，并将数据框提供给它（或者将它映射到数据框）。类似这样的东西，将每个lm
作为列表元素输出：lm_func@StupidWolf很抱歉，因此y=Green\u Class_Commercial
是一个包含175行和4列的数据帧，我还有5个类似的数据帧，需要用y替换（所有数据帧都有四列）我需要成为回归模型中的响应变量。X是一个包含194行212列的数据帧（我需要对212个变量中的每个变量一次运行一个简单的线性回归）.在我当前的代码中，我已经对Green_Class_Commercial dataframe的一列执行了此操作，我希望对第2-4列再次执行此操作。如果有帮助，请告诉我thanks@Parfait感谢您的回复，这将使对数据集进行排序变得更加困难。Y变量表示在格林维尔s有人拖欠贷款的概率南卡罗来纳州，这是我的教授给我的一个数据集，我目前的课程中，许多概率是根据状态（分类或通过）和位置（格林维尔或其他）以及其他6个不同的部门（消费商业、商业商业等）进行筛选的这是非常有用的信息，谢谢你花时间一步一步地告诉我！但是现在唯一的问题是我得到了不同的p值，你知道为什么会发生这种情况吗？我用了“头”（结果[1:5，c（1,6）]）```结果与上面的结果非常不同。var_name p.value 1:GDP.SC 0.9894016 2:GDP.SC1 0.9957419 3:GDP.SC2 0.9998764:GDP.SC3 0.9978171 5:GDP.SC4 0.9951122mmm我不确定。如果我在与我的示例相同的虚拟数据帧上运行你的循环，我会得到两者相同的结果。在没有可复制示例的情况下很难进行检查，您能提供一个吗？dput（您的数据）
，或head（dput（您的数据），n=20）
如果帧不是非常大，可能会有所帮助。您的环境中有任何遗留对象或其他东西吗？如果还没有，可能会在新的R会话上进行测试？或者您的真实数据帧比简单的虚拟数据帧稍微复杂一点？我可能已经发现了问题，在您上面的评论中，您提到将5个数据帧替换为y
。我给你的代码是y
数据框保持不变，x
数据框更改…如果这是我的错误，很容易修复，请告诉我，我会更新答案，是的，我现在看到了，y数据框是我正在更改的，谢谢这是非常有用的信息，谢谢你花时间陪我走过一步一步！但是现在唯一的问题是我得到了不同的p-val