R：预测组的新值_R_Predict_Tidyverse

R：预测组的新值

R：预测组的新值,r,predict,tidyverse,R,Predict,Tidyverse,我在一个数据框中为每组计算了不同的回归： DF.L <- DF %>% group_by(Channel) %>% do(Fit = rlm(L ~ -1 + Y + I(Y^2), data = .)) 我做错了什么？我认为你的错误来自你打电话的方式。我无法修复您的确切代码，但这里有一个简单的方法，您可以从模型中获得预测。下面概述了使用purr和nest的更复杂方法：更新-purrr和nest方式只需添加此项，就可以显示它可以在tidyverse中使用predict轻松

我在一个数据框中为每组计算了不同的回归：

# Fit one model per Channel: L regressed on Y and Y^2 with no intercept (-1).
# do() stores each group's fitted model object in the list-column `Fit`.
# NOTE(review): `rlm` is presumably MASS::rlm, and `DF` is not shown here.
DF.L <- DF %>%
group_by(Channel) %>%
do(Fit = rlm(L ~ -1 + Y + I(Y^2), data = .))

我做错了什么？

我认为你的错误来自你打电话的方式。我无法修复您的确切代码，但这里有一个简单的方法，您可以从模型中获得预测。下面概述了使用

purr

和

nest

的更复杂方法：

更新-
purrr
和
nest
方式
只需添加此项，就可以显示它可以在
tidyverse
中使用
predict
轻松完成。有关更多详细信息，请参阅上面的链接

library(tidyverse)

# Shuffle the rows so the species are mixed up.
set.seed(1234)
myiris <- iris[sample(nrow(iris), replace = FALSE), ]

# First dataset: the first 50 rows, used to fit the models.
# One row per Species; that species' rows go in the list-column `myorigdata`.
iris_nested <- myiris[1:50, ] %>%
  nest(myorigdata = -Species)

# Second dataset: the other 100 rows, used for making predictions.
# Same layout, with the rows held in the list-column `mynewdata`.
new_iris_nested <- myiris[51:150, ] %>%
  nest(mynewdata = -Species)

# Fit a robust linear model of Sepal.Length on the two petal measurements.
#   df: a data frame with Sepal.Length, Petal.Length and Petal.Width columns.
#   Returns the fitted MASS::rlm object.
my_rlm <- function(df) {
  MASS::rlm(Sepal.Length ~ Petal.Length + Petal.Width, data = df)
}

# Fit one model per species, attach that species' new data by joining on
# Species, predict with the matching model, and reshape to long format so
# that `Type` distinguishes "modeled" from "measured" Sepal.Length values.
predictions_tall <- iris_nested %>%
  mutate(my_model = map(myorigdata, my_rlm)) %>%
  full_join(new_iris_nested, by = "Species") %>%
  mutate(my_new_pred = map2(my_model, mynewdata, predict)) %>%
  select(Species, mynewdata, my_new_pred) %>%
  # Unnest both list-columns together so rows stay aligned.
  unnest(c(mynewdata, my_new_pred)) %>%
  rename(modeled = my_new_pred, measured = Sepal.Length) %>%
  pivot_longer(
    c(modeled, measured),
    names_to = "Type",
    values_to = "Sepal.Length"
  )
最后，显示预测结果的曲线图：

# Line plot of Sepal.Length against Petal.Length: colour distinguishes the
# species, line type distinguishes the values of `Type`.
ggplot(predictions_tall, aes(x = Petal.Length, y = Sepal.Length)) +
  geom_line(aes(color = Species, linetype = Type))

原版-扫帚方式
我现在更新了它，只使用该组的模型计算每个组的预测
这种方式使用
扫帚
软件包（特别是
增强
功能）来添加拟合值。请参阅此处的更多信息：
因为您不提供数据，所以我在这里使用
iris

library(tidyverse)
library(broom)

# First shuffle around the rows of iris.
set.seed(1234)
myiris <- iris[sample(nrow(iris), replace = FALSE), ]

# First data: the first 25 rows, used for running the models.
# One row per Species; that species' rows go in the list-column `origdata`.
origiris <- myiris[1:25, ] %>%
  nest(origdata = -Species)

# Second data: the last 50 rows, used for predicting.
# Same layout, with the rows held in the list-column `preddata`.
prediris <- myiris[101:150, ] %>%
  nest(preddata = -Species)

# Estimate a separate robust linear model for each species on the first
# 25 rows; the fitted models are stored in the list-column `mod`.
iris_mod <- origiris %>%
  mutate(
    mod = map(
      origdata,
      ~ MASS::rlm(Sepal.Length ~ Petal.Length + Petal.Width, data = .x)
    )
  )
现在你真正想要的是——对新数据集进行预测：

# Get fitted values for the second dataset (preddata).
# The model and the new data are both taken from the joined table, so each
# model is applied to the nested data frame of the same Species. Indexing
# iris_mod$mod and prediris$preddata separately would pair them by row
# position, which can differ between the two nested tables.
prediris_aug <- iris_mod %>%
  inner_join(prediris, by = "Species") %>%
  {
    map2_df(.x = .$mod, .y = .$preddata, .f = ~ augment(.x, newdata = .y))
  } %>%
  as_tibble()

使用内置的
ChickWeight
数据：

# MASS is attached before dplyr so that dplyr::select() is not masked by
# MASS::select().
library(MASS)
library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)

# Peek at the built-in ChickWeight data.
head(ChickWeight)

适合一些型号现在，我们希望将模型的数据框架与要测试的新数据结合起来。首先，我们通过和tidyr:：nest对模拟数据进行分组。这将创建一个对象，该对象是一个包含四个组和一个名为data的列表列的数据帧，其中的每个元素都包含一个汇总的数据帧

# Group the simulated data by Diet, then nest the grouped data into a
# list-column.
ChickWeight_simulated %>%
  group_by(Diet) %>%
  nest()

Nice@meenaparam，但根据每行的物种值，每行应该只有一个预测。@医学物理学家在
do
之前使用
group_by
意味着我们为每一级别的物种拟合了一个模型，因此我们最终有三个模型可以预测。在您的数据中，您是否只希望基于基于相同分组值构建的模型对新观测值进行预测？是。在我的情况下，使用另一个渠道的模型是没有意义的。@MedicalPhysist没问题，没有意识到这一点。您现在有了答案，但我现在将更新此答案，以显示
purr
方式，我认为这会使问题变得更简单。谢谢！这是一个非常有趣的选择。太棒了！但是我想，整洁的人应该让事情变得更容易@hadley。
# Get fitted values for the first dataset (origdata).
# augment() is called on each fitted model; only Species and the augmented
# rows are kept before unnesting, so the other list-columns (origdata, mod)
# do not end up in the result.
origiris_aug <- iris_mod %>%
  mutate(origpred = map(mod, augment)) %>%
  select(Species, origpred) %>%
  unnest(origpred) %>%
  as_tibble()

origiris_aug # A tibble: 25 x 10 Species .rownames Sepal.Length Petal.Length Petal.Width .fitted .se.fit .resid <fctr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 setosa 18 5.1 1.4 0.3 5.002797 0.1514850 0.09720290 2 setosa 2 4.9 1.4 0.2 4.931824 0.1166911 -0.03182417 3 setosa 34 5.5 1.4 0.2 4.931824 0.1166911 0.56817583 4 setosa 40 5.1 1.5 0.2 4.981975 0.1095883 0.11802526 5 setosa 39 4.4 1.3 0.2 4.881674 0.1422123 -0.48167359 6 setosa 36 5.0 1.2 0.2 4.831523 0.1784156 0.16847698 7 setosa 25 4.8 1.9 0.2 5.182577 0.2357614 -0.38257703 8 setosa 31 4.8 1.6 0.2 5.032125 0.1241074 -0.23212531 9 setosa 42 4.5 1.3 0.3 4.952647 0.1760223 -0.45264653 10 setosa 21 5.4 1.7 0.2 5.082276 0.1542594 0.31772411 # ... with 15 more rows, and 2 more variables: .hat <dbl>, .sigma <dbl>

# Get fitted values for the second dataset (preddata).
# The model and the new data are both taken from the joined table, so each
# model is applied to the nested data frame of the same Species. Indexing
# iris_mod$mod and prediris$preddata separately would pair them by row
# position, which can differ between the two nested tables.
prediris_aug <- iris_mod %>%
  inner_join(prediris, by = "Species") %>%
  {
    map2_df(.x = .$mod, .y = .$preddata, .f = ~ augment(.x, newdata = .y))
  } %>%
  as_tibble()

prediris_aug # A tibble: 50 x 7 .rownames Sepal.Length Sepal.Width Petal.Length Petal.Width .fitted .se.fit <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 105 6.5 3.0 5.8 2.2 8.557908 3.570269 2 115 5.8 2.8 5.1 2.4 8.348800 3.666631 3 117 6.5 3.0 5.5 1.8 8.123565 3.005888 4 139 6.0 3.0 4.8 1.8 7.772511 2.812748 5 103 7.1 3.0 5.9 2.1 8.537086 3.475224 6 107 4.9 2.5 4.5 1.7 7.551086 2.611123 7 119 7.7 2.6 6.9 2.3 9.180537 4.000412 8 135 6.1 2.6 5.6 1.4 7.889823 2.611457 9 124 6.3 2.7 4.9 1.8 7.822661 2.838502 10 118 7.7 3.8 6.7 2.2 9.009263 3.825613 # ... with 40 more rows

# MASS is attached before dplyr so that dplyr::select() is not masked by
# MASS::select().
library(MASS)
library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)

# Peek at the built-in ChickWeight data.
head(ChickWeight)

weight Time Chick Diet 1 42 0 1 1 2 51 2 1 1 3 59 4 1 1 4 64 6 1 1 5 76 8 1 1 6 93 10 1 1

# Fit one robust regression per Diet: weight as a quadratic function of Time.
# do() yields one row per Diet with the fitted model in the list-column `fit`.
ChickWeight_models <- ChickWeight %>%
  group_by(Diet) %>%
  do(fit = MASS::rlm(weight ~ Time + I(Time^2), data = .))

# Print the table of fitted models.
ChickWeight_models

Source: local data frame [4 x 2] Groups: <by row> # A tibble: 4 x 2 Diet fit * <fctr> <list> 1 1 <S3: rlm> 2 2 <S3: rlm> 3 3 <S3: rlm> 4 4 <S3: rlm>

# Simulate "new" data by jittering the originals: uniform noise from
# runif() is added to Time and standard-normal noise from rnorm() to weight.
# No seed is set here, so the values differ from run to run.
ChickWeight_simulated <- ChickWeight %>%
  mutate(
    Time = Time + runif(length(Time)),
    weight = weight + rnorm(length(weight))
  )

# Print the simulated data.
ChickWeight_simulated

weight Time Chick Diet 1 42.72075 0.9786272 1 1 2 51.12669 2.8399631 1 1 3 58.64632 4.4576380 1 1 4 63.77617 6.1083591 1 1 5 75.40434 8.1051792 1 1 6 91.75830 10.7899030 1 1

# Group the simulated data by Diet, then nest the grouped data into a
# list-column.
ChickWeight_simulated %>%
  group_by(Diet) %>%
  nest()

# A tibble: 4 x 2 Diet data <fctr> <list> 1 1 <tibble [220 x 3]> 2 2 <tibble [120 x 3]> 3 3 <tibble [120 x 3]> 4 4 <tibble [118 x 3]>

# Nest the simulated data by Diet and attach the fitted models.
# The join key is spelled out so the match on Diet is explicit rather than
# inferred from the shared column names.
ChickWeight_simulated %>%
  group_by(Diet) %>%
  nest() %>%
  full_join(ChickWeight_models, by = "Diet")

# A tibble: 4 x 3 Diet data fit <fctr> <list> <list> 1 1 <tibble [220 x 3]> <S3: rlm> 2 2 <tibble [120 x 3]> <S3: rlm> 3 3 <tibble [120 x 3]> <S3: rlm> 4 4 <tibble [118 x 3]> <S3: rlm>

# Predict on the simulated data using, for each Diet, the model fitted to
# that same Diet. After the join each Diet row holds one nested data frame
# (`data`) and one model (`fit`); augment() is called on that pair, with
# the nested data passed as `newdata`.
ChickWeight_simulated_predicted <- ChickWeight_simulated %>%
  group_by(Diet) %>%
  nest() %>%
  full_join(ChickWeight_models, by = "Diet") %>%
  group_by(Diet) %>%
  do(augment(.$fit[[1]], newdata = .$data[[1]]))

# Show the first rows of the predictions.
head(ChickWeight_simulated_predicted)

# A tibble: 6 x 6 # Groups: Diet [1] Diet weight Time Chick .fitted .se.fit <fctr> <dbl> <dbl> <ord> <dbl> <dbl> 1 1 42.72075 0.9786272 1 43.62963 2.368838 2 1 51.12669 2.8399631 1 51.80855 1.758385 3 1 58.64632 4.4576380 1 59.67606 1.534051 4 1 63.77617 6.1083591 1 68.43218 1.534152 5 1 75.40434 8.1051792 1 80.00678 1.647612 6 1 91.75830 10.7899030 1 97.26450 1.726331

# One panel per Diet: the simulated weights as open circles, the fitted
# curve in red, and a band of +/- 1.96 * .se.fit around the fitted values.
ggplot(ChickWeight_simulated_predicted, aes(x = Time, y = weight)) +
  geom_point(shape = 1) +
  geom_ribbon(
    aes(
      x = Time,
      ymin = .fitted - 1.96 * .se.fit,
      ymax = .fitted + 1.96 * .se.fit
    ),
    alpha = 0.5,
    fill = "black"
  ) +
  geom_line(aes(x = Time, y = .fitted), size = 1, color = "red") +
  facet_wrap(~Diet)