用户定义的R函数在dplyr中无法给出正确答案
我试图执行一个与示例完全相同的分析,但使用不同的beta分布参数化。在分析开始时,我不确定想要什么样的参数化,所以定义了一个自定义函数,这样我就可以只在一个地方修改参数化,然后在代码的其余部分使用它。见下文。用户定义的R函数在dplyr中无法给出正确答案,r,statistics,data-science,R,Statistics,Data Science,我试图执行一个与示例完全相同的分析,但使用不同的beta分布参数化。在分析开始时,我不确定想要什么样的参数化,所以定义了一个自定义函数,这样我就可以只在一个地方修改参数化,然后在代码的其余部分使用它。见下文 f_beta <- function(x, elig, par) { return ( dbeta(x, exp(par[1] + par[2] * log(elig)), exp(par[3] + par[4] * log(elig)), log = TRUE)) }
# Log-density of a beta distribution whose two shape parameters are
# log-linear in `elig`:
#   shape1 = exp(par[1] + par[2] * log(elig))
#   shape2 = exp(par[3] + par[4] * log(elig))
#
# x:    values at which the density is evaluated (passed to dbeta)
# elig: values whose logarithm enters both shape parameters
# par:  coefficients; elements 1-4 are read
#
# Returns dbeta(..., log = TRUE), i.e. log-densities.
f_beta <- function(x, elig, par) {
  log_elig <- log(elig)
  shape_a <- exp(par[1] + par[2] * log_elig)
  shape_b <- exp(par[3] + par[4] * log_elig)
  dbeta(x, shape1 = shape_a, shape2 = shape_b, log = TRUE)
}
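assignments <- df %>%
  select(-cluster) %>%
  crossing(fits) %>%
  mutate(likelihood = prior * dbeta(enrpct, exp(a0 + b0 * log(elig)), exp(a1 + b1 * log(elig)), log = TRUE),
         likelihood2 = prior * f_beta(enrpct, elig, c(a0, b0, a1, b1))) %>%
  group_by(id) %>%
  top_n(1, likelihood) %>%
  ungroup()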
完整代码如下
library(tidyr)
library(dplyr)
# custom beta distribution parameterization
#
# Both shape parameters are log-linear in `elig`:
#   shape1 = exp(par[1] + par[2] * log(elig))
#   shape2 = exp(par[3] + par[4] * log(elig))
# Returns the log-density of `x` (dbeta with log = TRUE).
f_beta <- function(x, elig, par) {
  log_elig <- log(elig)
  shape_a <- exp(par[1] + par[2] * log_elig)
  shape_b <- exp(par[3] + par[4] * log_elig)
  dbeta(x, shape1 = shape_a, shape2 = shape_b, log = TRUE)
}
# negative log-likelihood: sums the log-densities returned by f_beta() and
# flips the sign
ll <- function(x, elig, par) {
  log_dens <- f_beta(x, elig, par)
  -sum(log_dens)
}
# optimizer
#
# Estimates the four coefficients of the custom beta parameterization by
# minimizing ll() with optim(), starting from `init`.
#
# x:    values forwarded to ll() as `x`
# elig: values forwarded to ll() as `elig`
# init: starting values for the four coefficients
#
# Returns a one-row tibble with columns a0, b0, a1, b1 (the fitted
# coefficients, in the order of `par`) and number (length of `x`).
# Emits a warning when optim() reports a non-zero convergence code.
fit_beta <- function(x, elig, init = c(-0.5, 0.2, -1.1, 0.3)) {
  m <- optim(par = init, fn = ll, elig = elig, x = x)
  # optim() signals failure only through this code; surface it instead of
  # silently returning possibly unconverged estimates
  if (m$convergence != 0) {
    warning(
      "optim() did not converge (convergence code ", m$convergence, ")",
      call. = FALSE
    )
  }
  est <- m$par
  # tibble() replaces the deprecated data_frame()
  tibble(
    a0 = est[1],
    b0 = est[2],
    a1 = est[3],
    b1 = est[4],
    number = length(x)
  )
}
####### generate data
# number of rows drawn from each of the two mixture components
n <- 10000
n2 <- 5000

# mixture 1 parameters
a0 <- -1
b0 <- 0.3
a1 <- -2
b1 <- 1

# mixture 2 parameters
a01 <- -1
b01 <- 0.5
a11 <- -1.5
b11 <- 0.8

# generate data: component 1 gets ids 1..n, component 2 gets ids n+1..n+n2;
# elig is a random multiple of 100 between 100 and 10000
df <- data.frame(
  id = seq_len(n),
  elig = 100 * sample(1:100, size = n, replace = TRUE)
)
df[["enrpct"]] <- with(
  df,
  rbeta(n, shape1 = exp(a0 + b0 * log(elig)), shape2 = exp(a1 + b1 * log(elig)))
)

df2 <- data.frame(
  id = (n + 1):(n + n2),
  elig = 100 * sample(1:100, size = n2, replace = TRUE)
)
df2[["enrpct"]] <- with(
  df2,
  rbeta(n2, shape1 = exp(a01 + b01 * log(elig)), shape2 = exp(a11 + b11 * log(elig)))
)

df <- rbind(df, df2)

# assign to clusters randomly like in example
df[["cluster"]] <- factor(sample(c("A", "B"), size = nrow(df), replace = TRUE))
# maximization step of E-M algorithm:
# fit one coefficient set per cluster with fit_beta(), then convert the
# per-cluster row counts into mixing proportions (`prior`)
fits <- df %>%
  group_by(cluster) %>%
  do(fit_beta(x = .[["enrpct"]], elig = .[["elig"]])) %>%
  ungroup() %>%
  mutate(prior = number / sum(number))
# expectation step of E-M algorithm:
# pair every observation with every fitted cluster, score each pairing, and
# keep the highest-scoring cluster per id
assignments <- df %>%
  select(-cluster) %>%
  crossing(fits) %>%
  # Inside mutate(), a0/b0/a1/b1 are whole columns, so c(a0, b0, a1, b1) is
  # one long concatenated vector and f_beta() would read par[1..4] from the
  # first four elements of a0. After crossing(), the coefficients are constant
  # within a cluster, so evaluate per cluster and pass one scalar of each.
  group_by(cluster) %>%
  mutate(
    # NOTE(review): with log = TRUE these are log-densities; combining a prior
    # with a log-density is normally log(prior) + log-density -- confirm intent.
    likelihood = prior * dbeta(
      enrpct,
      exp(a0 + b0 * log(elig)),
      exp(a1 + b1 * log(elig)),
      log = TRUE
    ),
    likelihood2 = prior * f_beta(enrpct, elig, c(a0[1], b0[1], a1[1], b1[1]))
  ) %>%
  # regrouping by id replaces the per-cluster grouping used above
  group_by(id) %>%
  top_n(1, likelihood) %>%
  ungroup()

head(assignments)
library(tidyr)
library(dplyr)
#自定义beta分布参数化
f_beta …
这是因为在计算 likelihood2 时使用了 c(a0, b0, a1, b1)。但这些变量中的每一个都是数据框中的一整列,因此 c() 只会把它们首尾拼接成一个长向量,f_beta 最终用到的是错误的值。
这样修改之后就可以正常工作:
f_beta …
这是因为在计算 likelihood2 时使用了 c(a0, b0, a1, b1)。但这些变量中的每一个都是数据框中的一整列,因此 c() 只会把它们首尾拼接成一个长向量,f_beta 最终用到的是错误的值。
这样修改之后就可以正常工作:
f_beta …
library(tidyr)
library(dplyr)
# custom beta distribution parameterization
#
# Both shape parameters are log-linear in `elig`:
#   shape1 = exp(par[1] + par[2] * log(elig))
#   shape2 = exp(par[3] + par[4] * log(elig))
# Returns the log-density of `x` (dbeta with log = TRUE).
f_beta <- function(x, elig, par) {
  log_elig <- log(elig)
  shape_a <- exp(par[1] + par[2] * log_elig)
  shape_b <- exp(par[3] + par[4] * log_elig)
  dbeta(x, shape1 = shape_a, shape2 = shape_b, log = TRUE)
}
# negative log-likelihood: sums the log-densities returned by f_beta() and
# flips the sign
ll <- function(x, elig, par) {
  log_dens <- f_beta(x, elig, par)
  -sum(log_dens)
}
# optimizer
#
# Estimates the four coefficients of the custom beta parameterization by
# minimizing ll() with optim(), starting from `init`.
#
# x:    values forwarded to ll() as `x`
# elig: values forwarded to ll() as `elig`
# init: starting values for the four coefficients
#
# Returns a one-row tibble with columns a0, b0, a1, b1 (the fitted
# coefficients, in the order of `par`) and number (length of `x`).
# Emits a warning when optim() reports a non-zero convergence code.
fit_beta <- function(x, elig, init = c(-0.5, 0.2, -1.1, 0.3)) {
  m <- optim(par = init, fn = ll, elig = elig, x = x)
  # optim() signals failure only through this code; surface it instead of
  # silently returning possibly unconverged estimates
  if (m$convergence != 0) {
    warning(
      "optim() did not converge (convergence code ", m$convergence, ")",
      call. = FALSE
    )
  }
  est <- m$par
  # tibble() replaces the deprecated data_frame()
  tibble(
    a0 = est[1],
    b0 = est[2],
    a1 = est[3],
    b1 = est[4],
    number = length(x)
  )
}
####### generate data
# number of rows drawn from each of the two mixture components
n <- 10000
n2 <- 5000

# mixture 1 parameters
a0 <- -1
b0 <- 0.3
a1 <- -2
b1 <- 1

# mixture 2 parameters
a01 <- -1
b01 <- 0.5
a11 <- -1.5
b11 <- 0.8

# generate data: component 1 gets ids 1..n, component 2 gets ids n+1..n+n2;
# elig is a random multiple of 100 between 100 and 10000
df <- data.frame(
  id = seq_len(n),
  elig = 100 * sample(1:100, size = n, replace = TRUE)
)
df[["enrpct"]] <- with(
  df,
  rbeta(n, shape1 = exp(a0 + b0 * log(elig)), shape2 = exp(a1 + b1 * log(elig)))
)

df2 <- data.frame(
  id = (n + 1):(n + n2),
  elig = 100 * sample(1:100, size = n2, replace = TRUE)
)
df2[["enrpct"]] <- with(
  df2,
  rbeta(n2, shape1 = exp(a01 + b01 * log(elig)), shape2 = exp(a11 + b11 * log(elig)))
)

df <- rbind(df, df2)

# assign to clusters randomly like in example
df[["cluster"]] <- factor(sample(c("A", "B"), size = nrow(df), replace = TRUE))
# maximization step of E-M algorithm:
# fit one coefficient set per cluster with fit_beta(), then convert the
# per-cluster row counts into mixing proportions (`prior`)
fits <- df %>%
  group_by(cluster) %>%
  do(fit_beta(x = .[["enrpct"]], elig = .[["elig"]])) %>%
  ungroup() %>%
  mutate(prior = number / sum(number))
# expectation step of E-M algorithm:
# pair every observation with every fitted cluster, score each pairing, and
# keep the highest-scoring cluster per id
assignments <- df %>%
  select(-cluster) %>%
  crossing(fits) %>%
  # Inside mutate(), a0/b0/a1/b1 are whole columns, so c(a0, b0, a1, b1) is
  # one long concatenated vector and f_beta() would read par[1..4] from the
  # first four elements of a0. After crossing(), the coefficients are constant
  # within a cluster, so evaluate per cluster and pass one scalar of each.
  group_by(cluster) %>%
  mutate(
    # NOTE(review): with log = TRUE these are log-densities; combining a prior
    # with a log-density is normally log(prior) + log-density -- confirm intent.
    likelihood = prior * dbeta(
      enrpct,
      exp(a0 + b0 * log(elig)),
      exp(a1 + b1 * log(elig)),
      log = TRUE
    ),
    likelihood2 = prior * f_beta(enrpct, elig, c(a0[1], b0[1], a1[1], b1[1]))
  ) %>%
  # regrouping by id replaces the per-cluster grouping used above
  group_by(id) %>%
  top_n(1, likelihood) %>%
  ungroup()

head(assignments)
# Fragment, not runnable on its own: the likelihood2 term of a mutate() call,
# written for an f_beta() that takes the four coefficients as separate
# arguments instead of a single `par` vector.
# NOTE(review): the f_beta() defined in this file takes (x, elig, par) --
# confirm which signature is intended before using this line.
likelihood2 = prior * f_beta(enrpct, elig, a0, b0, a1, b1)