R中的递归回归
假设我在R中有一个数据帧,如下所示:R中的递归回归,r,recursion,regression,lm,R,Recursion,Regression,Lm,假设我在R中有一个数据帧,如下所示: > set.seed(1) > X <- runif(50, 0, 1) > Y <- runif(50, 0, 1) > df <- data.frame(X,Y) > head(df) X Y 1 0.2655087 0.47761962 2 0.3721239 0.86120948 3 0.5728534 0.43809711 4 0.9082078 0.24479
> set.seed(1)
> X <- runif(50, 0, 1)
> Y <- runif(50, 0, 1)
> df <- data.frame(X,Y)
> head(df)
X Y
1 0.2655087 0.47761962
2 0.3721239 0.86120948
3 0.5728534 0.43809711
4 0.9082078 0.24479728
5 0.2016819 0.07067905
6 0.8983897 0.09946616
>设置种子(1)
>X Y测向头(测向)
xy
1 0.2655087 0.47761962
2 0.3721239 0.86120948
3 0.5728534 0.43809711
4 0.9082078 0.24479728
5 0.2016819 0.07067905
6 0.8983897 0.09946616
如何在X上执行Y的递归回归,从前20个观测值开始,每次增加一个观测值的回归窗口,直到它覆盖整个样本
关于如何执行固定窗口长度的滚动回归(例如,在zoo
包中使用rollapply
)有很多信息。然而,在寻找一个简单的递归选项时,我的搜索努力却白费了,因为它的起点是固定的,窗口大小也会增加。一个例外是来自quantreg
包()的lm.fit.recursive
函数。这个很好用。。。但事实上,它没有记录任何关于标准误差的信息,这是我构建递归置信区间所需要的
我当然可以使用循环来实现这一点。然而,我的实际数据帧非常大,并且也按id分组,这会导致复杂性。所以我希望找到一个更有效的选择。基本上,我正在寻找Stata中“滚动[…],递归”命令的R等价物。也许这会有所帮助:
set.seed(1)
X1 <- runif(50, 0, 1)
X2 <- runif(50, 0, 10) # I included another variable just for a better demonstration
Y <- runif(50, 0, 1)
df <- data.frame(X1,X2,Y)
rolling_lms <- lapply( seq(20,nrow(df) ), function(x) lm( Y ~ X1+X2, data = df[1:x , ]) )
并且有你需要的所有信息
> length(rolling_lms)
[1] 31
它从20次观测开始进行31次线性回归,直到达到50次。包含所有信息的每个回归都存储为滚动列表的一个元素
编辑
根据Carl下面的评论,为了得到每次回归的所有斜率向量,对于X1变量,这是一种非常好的技术(正如Carl所建议的):
您可能对biglm包中的
biglm
函数感兴趣。这允许您在数据子集上拟合回归,然后使用其他数据更新回归模型。最初的想法是将其用于大型数据集,以便您在任何给定时间只需要内存中的部分数据,但它完全符合您想要做的事情的描述(您可以将更新过程封装在一个循环中)。除了标准误差(当然还有系数)外,biglm
对象的摘要还提供了置信区间
库(biglm)
fit1Greg用biglm
提出的解决方案比LyzandeR用lm
提出的解决方案快,但仍然相当慢。我在下面展示的函数可以避免很多开销。我认为如果你用C++代码> Rcpp > <代码> < /P>,你可以使它大大加快。
# simulate data
set.seed(101)
n <- 1000
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(10 + X %*% runif(10)) + rnorm(n)
dat <- data.frame(y = y, X)
# assign wrapper for biglm
biglm_wrapper <- function(formula, data, min_window_size){
mf <- model.frame(formula, data)
X <- model.matrix(terms(mf), mf)
y - model.response(mf)
n <- nrow(X)
p <- ncol(X)
storage.mode(X) <- "double"
storage.mode(y) <- "double"
w <- 1
qr <- list(
d = numeric(p), rbar = numeric(choose(p, 2)),
thetab = numeric(p), sserr = 0, checked = FALSE, tol = numeric(p))
nrbar = length(qr$rbar)
beta. <- numeric(p)
out <- NULL
for(i in 1:n){
row <- X[i, ] # will be over written
qr[c("d", "rbar", "thetab", "sserr")] <- .Fortran(
"INCLUD", np = p, nrbar = nrbar, weight = w, xrow = row, yelem = y[i],
d = qr$d, rbar = qr$rbar, thetab = qr$thetab, sserr = qr$sserr, ier = i - 0L,
PACKAGE = "biglm")[
c("d", "rbar", "thetab", "sserr")]
if(i >= min_window_size){
tmp <- .Fortran(
"REGCF", np = p, nrbar = nrbar, d = qr$d, rbar = qr$rbar,
thetab = qr$thetab, tol = qr$tol, beta = beta., nreq = p, ier = i,
PACKAGE = "biglm")
Coef <- tmp$beta
# compute vcov. See biglm/R/vcov.biglm.R
R <- diag(p)
R[row(R) > col(R)] <- qr$rbar
R <- t(R)
R <- sqrt(qr$d) * R
ok <- qr$d != 0
R[ok, ok] <- chol2inv(R[ok, ok, drop = FALSE])
R[!ok, ] <- NA
R[ ,!ok] <- NA
out <- c(out, list(cbind(
coef = Coef,
SE = sqrt(diag(R * qr$sserr / (i - p + sum(!ok)))))))
}
}
out
}
# assign function to compare with
library(biglm)
f2 <- function(formula, data, min_window_size){
fit1 <- biglm(formula, data = data[1:min_window_size, ])
data.split <-
split(data, c(rep(NA,min_window_size),1:(nrow(data) - min_window_size)))
out4 <- Reduce(update, data.split, init=fit1, accumulate=TRUE)
lapply(out4, function(x) summary(x)$mat[, c("Coef", "SE")])
}
# show that the two gives the same
frm <- y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10
r1 <- biglm_wrapper(frm, dat, 25)
r2 <- f2(frm, dat, 25)
all.equal(r1, r2, check.attributes = FALSE)
#R> [1] TRUE
# show run time
microbenchmark::microbenchmark(
r1 = biglm_wrapper(frm, dat, 25),
r2 = f2(frm, dat, 25),
r3 = lapply(
25:nrow(dat), function(x) lm(frm, data = dat[1:x , ])),
times = 5)
#R> Unit: milliseconds
#R> expr min lq mean median uq max neval cld
#R> r1 43.72469 44.33467 44.79847 44.9964 45.33536 45.60124 5 a
#R> r2 1101.51558 1161.72464 1204.68884 1169.4580 1246.74321 1344.00278 5 b
#R> r3 2080.52513 2232.35939 2231.02060 2253.1420 2260.74737 2328.32908 5 c
#模拟数据
种子集(101)
非常好。由于OP看起来相对较新,也许可以添加一个后紧缩示例,例如all_slopesThanks。非常好的建议!我在回答中补充了这一点。谢谢@Greg,我不知道biglm
软件包。我更愿意避免循环,但您的解决方案仍然很优雅。@Grant,您需要某种类型的循环,无论是显式循环(使用for
)还是隐式循环(使用内部循环)。我添加了一些代码,显示了使用其他函数避免显式for
循环的方法。您还可以编写一个递归函数,用一行更新,然后用剩余的行(和当前对象)调用自己,直到数据帧为空。但相对于循环,这在R中通常不是很有效。
all_slopes<-unlist(sapply(1:31,function(j) rolling_lms[[j]]$coefficients[2]))
> all_slopes
X1 X1 X1 X1 X1 X1 X1 X1 X1 X1
-0.37833614 -0.23231852 -0.20465589 -0.20458938 -0.11796060 -0.14621369 -0.13861210 -0.11906724 -0.10149900 -0.14045509
X1 X1 X1 X1 X1 X1 X1 X1 X1 X1
-0.14331323 -0.14450837 -0.16214836 -0.15715630 -0.17388457 -0.11427933 -0.10624746 -0.09767893 -0.10111773 -0.06415914
X1 X1 X1 X1 X1 X1 X1 X1 X1 X1
-0.06432559 -0.04492075 -0.04122131 -0.06138768 -0.06287532 -0.06305953 -0.06491377 -0.01389334 -0.01703270 -0.03683358
X1
-0.02039574
library(biglm)
fit1 <- biglm( Sepal.Width ~ Sepal.Length + Species, data=iris[1:20,])
summary(fit1)
out <- list()
out[[1]] <- fit1
for(i in 1:130) {
out[[i+1]] <- update(out[[i]], iris[i+20,])
}
out2 <- lapply(out, function(x) summary(x)$mat)
out3 <- sapply(out2, function(x) x[2,2:3])
matplot(t(out3), type='l')
fit1 <- biglm( Sepal.Width ~ Sepal.Length + Species, data=iris[1:20,])
iris.split <- split(iris, c(rep(NA,20),1:130))
out4 <- Reduce(update, iris.split, init=fit1, accumulate=TRUE)
out5 <- lapply(out4, function(x) summary(x)$mat)
out6 <- sapply(out5, function(x) x[2,2:3])
all.equal(out3,out6)
# simulate data
set.seed(101)
n <- 1000
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(10 + X %*% runif(10)) + rnorm(n)
dat <- data.frame(y = y, X)
# assign wrapper for biglm
biglm_wrapper <- function(formula, data, min_window_size){
mf <- model.frame(formula, data)
X <- model.matrix(terms(mf), mf)
y - model.response(mf)
n <- nrow(X)
p <- ncol(X)
storage.mode(X) <- "double"
storage.mode(y) <- "double"
w <- 1
qr <- list(
d = numeric(p), rbar = numeric(choose(p, 2)),
thetab = numeric(p), sserr = 0, checked = FALSE, tol = numeric(p))
nrbar = length(qr$rbar)
beta. <- numeric(p)
out <- NULL
for(i in 1:n){
row <- X[i, ] # will be over written
qr[c("d", "rbar", "thetab", "sserr")] <- .Fortran(
"INCLUD", np = p, nrbar = nrbar, weight = w, xrow = row, yelem = y[i],
d = qr$d, rbar = qr$rbar, thetab = qr$thetab, sserr = qr$sserr, ier = i - 0L,
PACKAGE = "biglm")[
c("d", "rbar", "thetab", "sserr")]
if(i >= min_window_size){
tmp <- .Fortran(
"REGCF", np = p, nrbar = nrbar, d = qr$d, rbar = qr$rbar,
thetab = qr$thetab, tol = qr$tol, beta = beta., nreq = p, ier = i,
PACKAGE = "biglm")
Coef <- tmp$beta
# compute vcov. See biglm/R/vcov.biglm.R
R <- diag(p)
R[row(R) > col(R)] <- qr$rbar
R <- t(R)
R <- sqrt(qr$d) * R
ok <- qr$d != 0
R[ok, ok] <- chol2inv(R[ok, ok, drop = FALSE])
R[!ok, ] <- NA
R[ ,!ok] <- NA
out <- c(out, list(cbind(
coef = Coef,
SE = sqrt(diag(R * qr$sserr / (i - p + sum(!ok)))))))
}
}
out
}
# assign function to compare with
library(biglm)
f2 <- function(formula, data, min_window_size){
fit1 <- biglm(formula, data = data[1:min_window_size, ])
data.split <-
split(data, c(rep(NA,min_window_size),1:(nrow(data) - min_window_size)))
out4 <- Reduce(update, data.split, init=fit1, accumulate=TRUE)
lapply(out4, function(x) summary(x)$mat[, c("Coef", "SE")])
}
# show that the two gives the same
frm <- y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10
r1 <- biglm_wrapper(frm, dat, 25)
r2 <- f2(frm, dat, 25)
all.equal(r1, r2, check.attributes = FALSE)
#R> [1] TRUE
# show run time
microbenchmark::microbenchmark(
r1 = biglm_wrapper(frm, dat, 25),
r2 = f2(frm, dat, 25),
r3 = lapply(
25:nrow(dat), function(x) lm(frm, data = dat[1:x , ])),
times = 5)
#R> Unit: milliseconds
#R> expr min lq mean median uq max neval cld
#R> r1 43.72469 44.33467 44.79847 44.9964 45.33536 45.60124 5 a
#R> r2 1101.51558 1161.72464 1204.68884 1169.4580 1246.74321 1344.00278 5 b
#R> r3 2080.52513 2232.35939 2231.02060 2253.1420 2260.74737 2328.32908 5 c