扫帚组件-lm.fit中的错误(x,y,偏移=偏移,singular.ok=singular.ok,…):0(非NA)情况
我有一个学生属性和考试成绩的数据框架,我试图为每个年级(1到12)拟合一个线性模型。我正在使用扫帚包为每个年级有效地创建一个模型。下面是一个简化的示例数据集和我正在使用的代码扫帚组件-lm.fit中的错误(x,y,偏移=偏移,singular.ok=singular.ok,…):0(非NA)情况,r,linear-regression,broom,R,Linear Regression,Broom,我有一个学生属性和考试成绩的数据框架,我试图为每个年级(1到12)拟合一个线性模型。我正在使用扫帚包为每个年级有效地创建一个模型。下面是一个简化的示例数据集和我正在使用的代码 #start df creation grade <- rep(1:12, each = 40) attendance_rate <- round(runif(480, min=25, max=100), 1) test_growth <- round(runif(480, min = -12, ma
#start df creation
grade <- rep(1:12, each = 40)
attendance_rate <- round(runif(480, min=25, max=100), 1)
test_growth <- round(runif(480, min = -12, max = 38))
binary_flag <- round(runif(480, min = 0, max = 1))
score <- round(runif(480, min = 92, max = 370))
survey_response <- round(runif(480, min = 1, max = 4))
df <- data.frame(grade, attendance_rate, test_growth, binary_flag, score, survey_response)
df$survey_response[df$grade == 1] <- NA
# end df creation
#create train test split for each grade level
set.seed(123)
df_train <- lapply(split(seq(1:nrow(df)), df$grade), function(x) sample(x, floor(.6*length(x))))
df_test <- mapply(function(x,y) setdiff(x,y), x = split(seq(1:nrow(df)), df$grade), y = df_train)
df_train <- df[unlist(df_train),]
df_test <- df[unlist(df_test),]
#create models
models_nested <- df_train %>%
group_by(grade) %>% nest() %>%
mutate(
fit = map(data, ~ lm(score ~ attendance_rate + test_growth + binary_flag + survey_response, data = .x)),
tidied = map(fit, tidy),
augmented = map(fit, augment),
glanced = map(fit, glance)
)
我知道这是因为所有一年级的学生在调查问卷的回复栏中都有NA值。我不知道如何解决这个问题,而不为一年级运行单独的回归,完全删除调查响应列/变量。如果某个特定的等级子集只包含空值,有没有办法告诉lm函数忽略一个变量?我显然想在其他年级模型的回归中保留这个变量
我已尽力把这个问题说清楚,但如有必要,我很乐意在评论中澄清
编辑2020年6月9日:我不想为一年级模型返回NA,我只希望一年级线性模型在没有“调查响应”列的情况下运行。我希望在所有其他年级模型中都包含调查回复栏
我希望有人能帮忙 我们可以在
survey\u response
中检查NA
值,并相应地使用模型
library(broom)
library(dplyr)
library(tidyr)
library(purrr)
df_train %>%
group_by(grade) %>%
nest() %>%
mutate(fit = map(data, ~ if(all(is.na(.x$survey_response)))
lm(score ~ attendance_rate + test_growth + binary_flag, data = .x)
else lm(score ~ attendance_rate + test_growth + binary_flag + survey_response, data = .x)),
tidied = map(fit, tidy),
augmented = map(fit, augment),
glanced = map(fit, glance))
# grade data fit tidied augmented glanced
# <int> <list> <list> <list> <list> <list>
# 1 1 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 2 2 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 3 3 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 4 4 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 5 5 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 6 6 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 7 7 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 8 8 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
# 9 9 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
#10 10 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
#11 11 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
#12 12 <tibble [24 × 5]> <lm> <tibble [4 × 5]> <tibble [24 × 11]> <tibble [1 × 11]>
库(扫帚)
图书馆(dplyr)
图书馆(tidyr)
图书馆(purrr)
df_列车%>%
组别(职系)%>%
嵌套()%>%
mutate(fit=map(数据,~if(all(is.na(.x$survey_response)))
lm(分数~出勤率+测试增长+二进制标志,数据=.x)
else lm(分数~出勤率+测试增长+二进制标志+调查响应,数据=.x)),
整洁=地图(适合、整洁),
增强=映射(拟合,增强),
扫视=地图(适合,扫视))
#等级数据拟合整理增强扫描
#
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
# 9 9
#10 10
#11 11
#12 12
我们可以从purr
library(broom)
library(dplyr)
library(tidyr)
library(purrr)
poslm <- possibly(lm, otherwise = NA)
df_train %>%
group_by(grade) %>%
nest() %>%
mutate(fit = map(data, ~ poslm(score ~ attendance_rate + test_growth +
binary_flag + survey_response, data = .x)),
tidied = map(fit, possibly(tidy, otherwise = NA)),
augmented = map(fit, possibly(augment, otherwise = NA)),
glanced = map(fit, possibly(glance, otherwise = NA)))
# A tibble: 12 x 6
# Groups: grade [12]
# grade data fit tidied augmented glanced
# <int> <list> <list> <list> <list> <list>
# 1 1 <tibble [24 × 5]> <lgl [1]> <lgl [1]> <lgl [1]> <lgl [1]>
# 2 2 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 3 3 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 4 4 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 5 5 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 6 6 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 7 7 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 8 8 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 9 9 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
#10 10 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
#11 11 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
#12 12 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
库(扫帚)
图书馆(dplyr)
图书馆(tidyr)
图书馆(purrr)
poslm%
组别(职系)%>%
嵌套()%>%
变异(拟合=map(数据,~poslm(分数~出勤率+测试增长+
二进制标志+调查响应,数据=.x)),
整洁=地图(适合,可能(整洁,否则=NA)),
增强=映射(拟合,可能(增强,否则=NA)),
扫视=地图(适合,可能(扫视,否则=NA)))
#一个tibble:12x6
#分组:年级[12]
#等级数据拟合整理增强扫描
#
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
# 9 9
#10 10
#11 11
#12 12
谢谢您的帮助。我意识到我本可以把我的问题说得更清楚,我道歉。我不想简单地返回一年级的NA,我希望运行一个不包括调查响应列的备用线性模型。因此,我希望以下模型仅在一年级运行:lm(分数~出勤率+测试增长+二进制标志,数据=.x)注意,调查响应变量已删除,但同样仅在一年级运行。这可能吗?再次感谢您的时间和投入。@rachael_学习了,但您并不总是不知道,只有调查(response)
。会给你一个错误。有时test\u growth
an也会给你一个错误,对吗?你怎么会知道呢?总是调查结果导致了这个问题——我在处理教育数据,这只是这个数据集的一个特点,一年级的孩子不参加这项调查,而所有其他年级的孩子都参加调查。是的,这很有效!!非常感谢你。我希望有一天我会像你在R一样好谢谢你的回复。我意识到我的问题不够清晰。我不想为一年级模型返回NA,我只希望一年级的线性模型运行时不包含survey_response列。我希望在所有其他年级模型中都包含调查回复栏。这可能吗?
library(broom)
library(dplyr)
library(tidyr)
library(purrr)
poslm <- possibly(lm, otherwise = NA)
df_train %>%
group_by(grade) %>%
nest() %>%
mutate(fit = map(data, ~ poslm(score ~ attendance_rate + test_growth +
binary_flag + survey_response, data = .x)),
tidied = map(fit, possibly(tidy, otherwise = NA)),
augmented = map(fit, possibly(augment, otherwise = NA)),
glanced = map(fit, possibly(glance, otherwise = NA)))
# A tibble: 12 x 6
# Groups: grade [12]
# grade data fit tidied augmented glanced
# <int> <list> <list> <list> <list> <list>
# 1 1 <tibble [24 × 5]> <lgl [1]> <lgl [1]> <lgl [1]> <lgl [1]>
# 2 2 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 3 3 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 4 4 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 5 5 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 6 6 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 7 7 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 8 8 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
# 9 9 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
#10 10 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
#11 11 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>
#12 12 <tibble [24 × 5]> <lm> <tibble [5 × 5]> <tibble [24 × 12]> <tibble [1 × 11]>