R:在 segmented 包之外对分段 lm 进行预测
我有数百个分段线性模型的输出数组(使用 R 中的 segmented 包拟合)。我希望能够使用 predict 函数把这些输出应用到新数据上。需要说明的是,我的工作区中并没有分段线性模型对象;我只是保存并重新导入了相关输出(例如系数和断点)。因此,我不能直接使用 segmented 包中的 predict.segmented 函数。下面是一个玩具示例,它基于 Nabble 上的一个示例,看起来很有希望,但其结果与 predict.segmented 函数的输出不一致。标签:r, lm, predict
library(segmented)
# Simulate a response with true breakpoints at x = 35 and x = 70, an extra
# effect of z above 0.5, and Gaussian noise. The seed must be set before the
# runif()/rnorm() calls so the data are reproducible.
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5 * pmax(xx - 35, 0) - 1.5 * pmax(xx - 70, 0) +
  15 * pmax(zz - 0.5, 0) + rnorm(100, 0, 2)
dati <- data.frame(x = xx, y = yy, z = zz)

# Fit an ordinary lm first, then let segmented() estimate two breakpoints in x
# starting from the initial guesses 30 and 60.
out.lm <- lm(y ~ x, data = dati)
o <- segmented(out.lm, seg.Z = ~x, psi = list(x = c(30, 60)),
  control = seg.control(display = FALSE))

# Note that coefficients with U in the name are differences in slopes, not
# slopes. Compare the slopes reported by slope() with the cumulative sums:
slope(o)
coef(o)[2] + coef(o)[3]
coef(o)[2] + coef(o)[3] + coef(o)[4]

# Prediction: rebuild the segmented design columns by hand from the estimated
# breakpoints (second column of o$psi) and the fitted coefficients.
pred <- data.frame(x = 1:100)
# U columns: amount by which x exceeds each breakpoint (0 before it).
pred$dummy1 <- pmax(pred$x - o$psi[1, 2], 0)
pred$dummy2 <- pmax(pred$x - o$psi[2, 2], 0)
# psi columns: -1 past the breakpoint, scaled by that breakpoint's
# slope-DIFFERENCE coefficient (coefficients 3 and 4), not by the cumulative
# slope. This mirrors the construction taken from predict.segmented();
# NOTE(review): confirm against the installed segmented version.
pred$dummy3 <- -(pred$x > o$psi[1, 2]) * coef(o)[[3]]
pred$dummy4 <- -(pred$x > o$psi[2, 2]) * coef(o)[[4]]
# Rename the hand-built columns to the names used in the fitted model frame
# (dropping the response and x, which are the first two columns).
names(pred)[-1] <- names(model.frame(o))[-c(1, 2)]

# Compute the prediction with the standard predict() generic, adding
# confidence intervals; the breakpoints are treated as fixed.
pred <- data.frame(pred, predict(o, newdata = pred,
  interval = "confidence"))

# Compare with the package's own in-sample prediction. Calling the generic
# dispatches to the segmented method without relying on it being exported.
test <- predict(o)
plot(pred$fit, test, ylim = c(0, 100))
abline(0, 1, col = "red")
# Points lying on the red 1:1 line are predicted identically by both routes.
(代码片段在此处被截断:library(segmented); set.seed(12); xx …)
评论:为什么在预测中使用名为 xx 的列,而在原始 data.frame 中它叫 x?要使 predict() 正常工作,列名必须完全一致。
回复:谢谢 @MrFlick。我已修正 predict 中的列名,使其与原始 data.frame 一致。第一段的预测拟合现在看起来不错,但我仍在检查其他分段。
提醒任何想与启发此代码的 Nabble 链接进行对比的人:我更改了斜率的定义(因为根据文档,以“U”开头的系数是斜率之差,而不是斜率本身)。
library(segmented)
## Define function for making matrix of dummy variables (this is based on code from predict.segmented())
dummy.matrix <- function(x.values, x_names, psi.est = TRUE, nameU, nameV, diffSlope, est.psi) {
  # Build the design-matrix columns of a segmented (broken-line) lm for a set
  # of x values, using only saved breakpoints and slope-difference
  # coefficients (no fitted model object is needed). Works for any number of
  # breakpoints k >= 1.
  #
  # Inputs:
  #   x.values:  numeric vector of x values to build the columns for.
  #   x_names:   name (single string) given to the x column.
  #   psi.est:   if TRUE (default) the breakpoint ("psi") columns are appended;
  #              if FALSE only the x and U columns are returned.
  #   nameU:     k column names for the U terms, e.g. c("U1.x", "U2.x").
  #   nameV:     k column names for the psi terms, e.g. c("psi1.x", "psi2.x").
  #              Only used when psi.est is TRUE.
  #   diffSlope: k numeric slope-difference coefficients (the U coefficients).
  #              Only used when psi.est is TRUE.
  #   est.psi:   k numeric estimated breakpoints.
  #
  # Returns:
  #   A matrix with length(x.values) rows and columns
  #   x, U_1..U_k [, psi_1..psi_k], where
  #     U_j   = pmax(x - est.psi[j], 0)
  #     psi_j = -diffSlope[j] where x > est.psi[j], and 0 elsewhere.
  #   The x column is always kept.
  #
  # Errors:
  #   Stops if the per-breakpoint arguments do not all have length k, instead
  #   of silently recycling them into a wrong matrix.
  k <- length(est.psi)
  stopifnot(
    is.numeric(x.values),
    is.numeric(est.psi),
    k >= 1,
    length(x_names) == 1,
    length(nameU) == k
  )
  n <- length(x.values)

  # n x k matrices: column j holds breakpoint j / the x values, respectively.
  PSI <- matrix(rep(est.psi, each = n), nrow = n, ncol = k)
  newZ <- matrix(x.values, nrow = n, ncol = k)
  dummy1 <- pmax(newZ - PSI, 0)

  if (psi.est) {
    stopifnot(
      is.numeric(diffSlope),
      length(diffSlope) == k,
      length(nameV) == k
    )
    # -1 past each breakpoint, 0 before it.
    V <- -1 * (newZ > PSI)
    # Scale column j by diffSlope[j]. Element-wise scaling is used rather than
    # V %*% diag(diffSlope) because diag() of a length-one vector builds an
    # identity matrix, which would need a special case for k == 1.
    dummy2 <- V * matrix(rep(diffSlope, each = n), nrow = n, ncol = k)
    newd <- cbind(x.values, dummy1, dummy2)
    colnames(newd) <- c(x_names, nameU, nameV)
  } else {
    newd <- cbind(x.values, dummy1)
    colnames(newd) <- c(x_names, nameU)
  }
  newd
}
## Test dummy matrix function----------------------------------------------
# Simulate a response with true breakpoints at x = 35 and x = 70; the seed is
# set first so the random draws are reproducible.
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5 * pmax(xx - 35, 0) - 1.5 * pmax(xx - 70, 0) +
  15 * pmax(zz - 0.5, 0) + rnorm(100, 0, 2)
dati <- data.frame(x = xx, y = yy, z = zz)
out.lm <- lm(y ~ x, data = dati)
# 1 segmented variable, 2 breakpoints: you have to specify starting values
# (vector) for psi:
o <- segmented(out.lm, seg.Z = ~x, psi = c(30, 60),
  control = seg.control(display = FALSE))
slope(o)
# Use the generics so dispatch finds the segmented methods even when the
# methods themselves are not exported by the package.
plot(o)
summary(o)

# Test dummy matrix fn with the same dataset
newdata <- dati
nameU1 <- c("U1.x", "U2.x")
nameV1 <- c("psi1.x", "psi2.x")
# Coefficients 3 and 4 are the slope differences; coef() avoids relying on
# partial matching of `$coef` to the `coefficients` element.
diffSlope1 <- coef(o)[3:4]
# Estimated breakpoints are in the second column of o$psi.
est.psi1 <- c(o$psi[1, 2], o$psi[2, 2])
test <- dummy.matrix(x.values = newdata$x, x_names = "x", psi.est = TRUE,
  nameU = nameU1, nameV = nameV1, diffSlope = diffSlope1, est.psi = est.psi1)

# Predict response variable using matrix multiplication
# Prepend the intercept column of ones.
col1 <- matrix(1, nrow = nrow(test))
test <- cbind(col1, test) # Now test is the same as model.matrix(o)
# 1 x n row of fitted values: coefficients times the transposed design matrix.
predY <- coef(o) %*% t(test)
plot(predY[1, ])
lines(predict(o), col = "blue") # good, predict gives same answer