R:来自经过训练的GLM模型的分类公式[提供了可复制的示例]

R:来自经过训练的GLM模型的分类公式[提供了可复制的示例],r,classification,logistic-regression,R,Classification,Logistic Regression,问题 >coef(model1) #(Intercept) PetalLength PetalWidth #-31.938998 -7.501714 63.670583 >exp(coef(model1)) #(Intercept) PetalLength PetalWidth #1.346075e-14 5.521371e-04 4.485211e+27 (1) 下面名为“model1”的示例代码中的拟合模型的分类公式是什么?(这是公式A、B还是两者都不

问题

>coef(model1)
#(Intercept) PetalLength  PetalWidth 
#-31.938998   -7.501714   63.670583 

>exp(coef(model1))
#(Intercept)  PetalLength   PetalWidth 
#1.346075e-14 5.521371e-04 4.485211e+27 
(1) 下面名为“model1”的示例代码中的拟合模型的分类公式是什么?(这是公式A、B还是两者都不是?)

(2) “model1”如何确定class==1和class==2

  • 公式A: 类别(物种{1:2})=(-31.938998)+(-7.501714*[PetalLength])+(63.670583*[PetalWidth])

  • 公式B: 类别(物种{1:2})=1.346075e-14+(5.521371e-04*[PetalLength])+(4.485211e+27*[PetalWidth])

用例

# Load the iris dataset from a Google Drive mirror (the original uci.edu
# URL is kept below, commented out, because it was unreachable at the time).
#iris <- read.csv(url("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"), header = FALSE)
iris <- read.csv(url("https://docs.google.com/spreadsheets/d/1ovz31Y6PrV5OwpqFI_wvNHlMTf9IiPfVy1c3fiQJMcg/pub?gid=811038462&single=true&output=csv"), header = FALSE)
dataSet <- iris

# Assign descriptive column names (the raw CSV has no header row).
names(dataSet) <- c("SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species")

# Column names as a character vector.
dsColNames <- as.character(names(dataSet))

# Number of columns and rows in the dataset.
dsColCount <- as.integer(ncol(dataSet))
dsRowCount <- as.integer(nrow(dataSet))

# Position and name of the class (target) column.
classColumn <- 5 
classColumnName <- dsColNames[classColumn]
y_col_pos <- classColumn

# Positions of the feature (predictor) columns.
x_col_start_pos <- 1
x_col_end_pos <- 4

# Fraction of [dataSet] reserved for training/test vs. validation.
# set.seed fixes the RNG state so both random splits below are reproducible;
# do not reorder these statements or the sampled rows change.
set.seed(10)
sampleAmt <- 0.25
mainSplit <- sample(2, dsRowCount, replace=TRUE, prob=c(sampleAmt, 1-sampleAmt))

# Split [dataSet] into two sets: ~25% training/test, ~75% validation.
dsTrainingTest <- dataSet[mainSplit==1, 1:5] 
dsValidation <- dataSet[mainSplit==2, 1:5]
nrow(dsTrainingTest);nrow(dsValidation);

# Fraction of [dsTrainingTest] reserved for training (a 50/50 split).
sampleAmt <- 0.5
secondarySplit <- sample(2, nrow(dsTrainingTest), replace=TRUE, prob=c(sampleAmt, 1-sampleAmt))

# Split [dsTrainingTest] into training and test sets.
dsTraining <- dsTrainingTest[secondarySplit==1, 1:5]
dsTest <- dsTrainingTest[secondarySplit==2, 1:5]
nrow(dsTraining);nrow(dsTest);

# Sanity checks: each pair of splits exactly partitions its parent set.
nrow(dataSet) == nrow(dsTrainingTest)+nrow(dsValidation)
nrow(dsTrainingTest) == nrow(dsTraining)+nrow(dsTest)

library(randomGLM)

# Collapse the 3-class Species label into a binary class for logistic
# regression: 1 = Iris-setosa, 2 = Iris-versicolor OR Iris-virginica.
dataSetEnum <- dsTraining[,1:5]
dataSetEnum[,5] <- as.character(dataSetEnum[,5])
dataSetEnum[,5][dataSetEnum[,5]=="Iris-setosa"] <- 1 
dataSetEnum[,5][dataSetEnum[,5]=="Iris-versicolor"] <- 2 
dataSetEnum[,5][dataSetEnum[,5]=="Iris-virginica"] <- 2 
dataSetEnum[,5] <- as.integer(dataSetEnum[,5])

# Feature matrix and response factor for model fitting.
# NOTE(review): [,5:5] is an unusual way to select a single column —
# it behaves like [,5] here.
x <- as.matrix(dataSetEnum[,1:4])
y <- as.factor(dataSetEnum[,5:5])

# number of features
N <- ncol(x)

# Misclassification rate of a confusion matrix: 1 - accuracy, rounded
# to 3 significant digits.
#
# tab: a square contingency table (e.g. table(actual, predicted)) whose
#      diagonal holds the correctly classified counts.
# Returns: the proportion of off-diagonal (misclassified) observations.
#
# The original `if (exists(...)) rm(...)` guard was removed: assignment
# simply overwrites any previous definition, so the guard was redundant.
misclassification.rate <- function(tab) {
  correct <- sum(diag(tab))
  total <- sum(tab)
  signif(1 - correct / total, 3)
}

# Fit a randomGLM ensemble: a bagged predictor built from many individual
# forward-selected generalized linear models (randomSeed makes it reproducible).
RGLM <- randomGLM(x, y, classify=TRUE, keepModels=TRUE,randomSeed=1002)

# Probability threshold the ensemble uses to assign class labels.
RGLM$thresholdClassProb

# Out-of-bag confusion matrix: actual class vs. OOB-predicted class.
tab1 <- table(y, RGLM$predictedOOB)
tab1
# y  1  2
# 1  2  0
# 2  0 12

# accuracy
1-misclassification.rate(tab1)

# Variable importance: how many times each feature was selected by the
# forward regression across the ensemble's member models.
varImp = RGLM$timesSelectedByForwardRegression
sum(varImp>=0)

table(varImp)

# Keep only features selected at least 5 times.
impF = colnames(x)[varImp>=5]
impF

# Fit a single logistic-regression GLM on the important features.
# Coefficients are on the log-odds scale.
model1 = glm(y~., data=as.data.frame(x[, impF]), family = binomial(link='logit'))

coef(model1)
#(Intercept) PetalLength  PetalWidth 
#-31.938998   -7.501714   63.670583 

# Exponentiated coefficients = odds ratios.
exp(coef(model1))
#(Intercept)  PetalLength   PetalWidth 
#1.346075e-14 5.521371e-04 4.485211e+27 

# Wald confidence intervals. NOTE(review): the enormous intervals and the
# perfect OOB confusion matrix above suggest (quasi-)complete separation
# in this training subset — confirm before trusting these coefficients.
confint.default(model1)
#                2.5 %   97.5 %
#(Intercept) -363922.5 363858.6
#PetalLength -360479.0 360464.0
#PetalWidth  -916432.0 916559.4
使用R拟合/训练二元分类模型,然后解读该模型,以便在Excel中(而不是在R中)手动计算分类

模型系数

>coef(model1)
#(Intercept) PetalLength  PetalWidth 
#-31.938998   -7.501714   63.670583 

>exp(coef(model1))
#(Intercept)  PetalLength   PetalWidth 
#1.346075e-14 5.521371e-04 4.485211e+27 
R代码示例

# Load the iris dataset from a Google Drive mirror; the original uci.edu
# URL is left commented out because it was unreachable at the time.
#iris <- read.csv(url("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"), header = FALSE)
iris <- read.csv(url("https://docs.google.com/spreadsheets/d/1ovz31Y6PrV5OwpqFI_wvNHlMTf9IiPfVy1c3fiQJMcg/pub?gid=811038462&single=true&output=csv"), header = FALSE)
dataSet <- iris

# Give the header-less CSV descriptive column names.
names(dataSet) <- c("SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species")

# Column names as a character vector.
dsColNames <- as.character(names(dataSet))

# Dataset dimensions.
dsColCount <- as.integer(ncol(dataSet))
dsRowCount <- as.integer(nrow(dataSet))

# Index and name of the class (target) column.
classColumn <- 5 
classColumnName <- dsColNames[classColumn]
y_col_pos <- classColumn

# Index range of the feature columns.
x_col_start_pos <- 1
x_col_end_pos <- 4

# Fraction of [dataSet] reserved for training/test vs. validation.
# The fixed seed makes both sample() calls below reproducible; keep the
# statement order unchanged or the sampled rows will differ.
set.seed(10)
sampleAmt <- 0.25
mainSplit <- sample(2, dsRowCount, replace=TRUE, prob=c(sampleAmt, 1-sampleAmt))

# First split: ~25% training/test, ~75% validation.
dsTrainingTest <- dataSet[mainSplit==1, 1:5] 
dsValidation <- dataSet[mainSplit==2, 1:5]
nrow(dsTrainingTest);nrow(dsValidation);

# Second split: half of [dsTrainingTest] for training.
sampleAmt <- 0.5
secondarySplit <- sample(2, nrow(dsTrainingTest), replace=TRUE, prob=c(sampleAmt, 1-sampleAmt))

# Split [dsTrainingTest] into training and test sets.
dsTraining <- dsTrainingTest[secondarySplit==1, 1:5]
dsTest <- dsTrainingTest[secondarySplit==2, 1:5]
nrow(dsTraining);nrow(dsTest);

# Sanity checks: the splits exactly partition their parents.
nrow(dataSet) == nrow(dsTrainingTest)+nrow(dsValidation)
nrow(dsTrainingTest) == nrow(dsTraining)+nrow(dsTest)

library(randomGLM)

# Recode the three species into a binary class:
# 1 = Iris-setosa, 2 = either of the other two species.
dataSetEnum <- dsTraining[,1:5]
dataSetEnum[,5] <- as.character(dataSetEnum[,5])
dataSetEnum[,5][dataSetEnum[,5]=="Iris-setosa"] <- 1 
dataSetEnum[,5][dataSetEnum[,5]=="Iris-versicolor"] <- 2 
dataSetEnum[,5][dataSetEnum[,5]=="Iris-virginica"] <- 2 
dataSetEnum[,5] <- as.integer(dataSetEnum[,5])

# Build the feature matrix and the response factor.
# NOTE(review): [,5:5] is equivalent to [,5] here.
x <- as.matrix(dataSetEnum[,1:4])
y <- as.factor(dataSetEnum[,5:5])

# number of features
N <- ncol(x)

# Misclassification rate of a confusion matrix, i.e. 1 - accuracy,
# reported to 3 significant digits.
#
# tab: square contingency table (e.g. table(actual, predicted)); its
#      diagonal counts the correct classifications.
# Returns: fraction of observations falling off the diagonal.
#
# The redundant `if (exists(...)) rm(...)` guard was dropped — a plain
# assignment already replaces any earlier definition.
misclassification.rate <- function(tab) {
  hits <- sum(diag(tab))
  n_obs <- sum(tab)
  signif(1 - hits / n_obs, 3)
}

# Fit the randomGLM ensemble (bagged, forward-selected GLM members);
# randomSeed fixes the bootstrap sampling for reproducibility.
RGLM <- randomGLM(x, y, classify=TRUE, keepModels=TRUE,randomSeed=1002)

# Class-probability threshold used for label assignment.
RGLM$thresholdClassProb

# Out-of-bag confusion matrix: rows = actual, columns = OOB prediction.
tab1 <- table(y, RGLM$predictedOOB)
tab1
# y  1  2
# 1  2  0
# 2  0 12

# accuracy
1-misclassification.rate(tab1)

# Importance = number of times each feature was chosen by forward
# regression across the ensemble members.
varImp = RGLM$timesSelectedByForwardRegression
sum(varImp>=0)

table(varImp)

# Restrict to features selected at least 5 times.
impF = colnames(x)[varImp>=5]
impF

# Single logistic-regression GLM on the selected features; the
# coefficients below are log-odds.
model1 = glm(y~., data=as.data.frame(x[, impF]), family = binomial(link='logit'))

coef(model1)
#(Intercept) PetalLength  PetalWidth 
#-31.938998   -7.501714   63.670583 

# exp(coef) converts log-odds to odds ratios.
exp(coef(model1))
#(Intercept)  PetalLength   PetalWidth 
#1.346075e-14 5.521371e-04 4.485211e+27 

# Wald confidence intervals. NOTE(review): the astronomically wide
# intervals plus the perfect OOB table above hint at (quasi-)complete
# separation — the fitted coefficients are unstable; confirm.
confint.default(model1)
#                2.5 %   97.5 %
#(Intercept) -363922.5 363858.6
#PetalLength -360479.0 360464.0
#PetalWidth  -916432.0 916559.4
#加载数据(使用谷歌硬盘上的iris数据集,因为uci.edu的链接今天无法访问)

GLM模型具有链接函数和线性预测器。您尚未在上面指定链接函数

设 Y={0,1},X 是 n×p 矩阵。(使用伪LaTeX记号)这将导致
\phi(\hat{Y}) = X\hat{B} = \eta

其中
-
\eta
是线性预测值
-
\phi()
是链接函数


线性预测器就是
X %*% \hat{B}
,而分类归结为
p(Y=1 \mid X) = \phi^{-1}(\eta)
——即反链接函数。反链接函数显然取决于链接的选择。对于logit链接,其反函数为逆logit
P(Y=1 \mid X) = \frac{\exp(\eta)}{1 + \exp(\eta)}

GLM模型有一个链接函数和一个线性预测器。您尚未在上面指定链接函数

设 Y={0,1},X 是 n×p 矩阵。(使用伪LaTeX记号)这将导致
\phi(\hat{Y}) = X\hat{B} = \eta

其中
-
\eta
是线性预测值
-
\phi()
是链接函数


线性预测器就是
X %*% \hat{B}
,而分类归结为
p(Y=1 \mid X) = \phi^{-1}(\eta)
——即反链接函数。反链接函数显然取决于链接的选择。对于logit链接,其反函数为逆logit
P(Y=1 \mid X) = \frac{\exp(\eta)}{1 + \exp(\eta)}

您的模型定义为

model1 <- glm(y~., data=as.data.frame(x[, impF]), family=binomial(link='logit'))
我们可以检查这是否与
fitted.values的输出相同

> fitted.values(model1)
          20           50           65           66           87          105 
3.448852e-11 1.253983e-13 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         106          107          111          112          116          118 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         129          130 
1.000000e+00 1.000000e+00 
最后,根据p(Y=1)是否高于或低于某个阈值,可以将响应分为两类。比如说,

> ifelse(fitted.values(model1) > 0.5, 1, 0)
 20  50  65  66  87 105 106 107 111 112 116 118 129 130 
  0   0   1   1   1   1   1   1   1   1   1   1   1   1 

您的模型定义为

model1 <- glm(y~., data=as.data.frame(x[, impF]), family=binomial(link='logit'))
我们可以检查这是否与
fitted.values的输出相同

> fitted.values(model1)
          20           50           65           66           87          105 
3.448852e-11 1.253983e-13 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         106          107          111          112          116          118 
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
         129          130 
1.000000e+00 1.000000e+00 
最后,根据p(Y=1)是否高于或低于某个阈值,可以将响应分为两类。比如说,

> ifelse(fitted.values(model1) > 0.5, 1, 0)
 20  50  65  66  87 105 106 107 111 112 116 118 129 130 
  0   0   1   1   1   1   1   1   1   1   1   1   1   1 

感谢您对@Alex的评论,但是link函数是在OP的示例代码中提供的。请参见第95行
model1 = glm(y~., data=as.data.frame(x[,impF]), family=binomial(link='logit'))
@Webby-fair。。。我略读了你的问题。我没看见。我的建议是你回顾GLM模型,因为你应该了解你使用的模型。我同意,但这并不能回答我的任何一个问题。我不理解你的答案。是公式A、B还是C(考虑到OP中提供的模型系数)?谢谢@Alex的评论,但是OP的示例代码中提供了链接函数。请参见第95行
model1 = glm(y~., data=as.data.frame(x[,impF]), family=binomial(link='logit'))
@Webby-fair。。。我略读了你的问题。我没看见。我的建议是你回顾GLM模型,因为你应该了解你使用的模型。我同意,但这并不能回答我的任何一个问题。我不理解你的答案。是公式A、B还是C(给定OP中提供的模型系数)?