
Python: How to calculate logistic regression accuracy


I am a complete beginner in machine learning and Python coding, and I have been tasked with coding logistic regression from scratch to understand what happens behind the scenes. So far I have coded the hypothesis function, the cost function, and gradient descent, and then coded the logistic regression itself. However, with the code for printing the accuracy I get a low output (0.69) that does not change with more iterations or a different learning rate. My question is: is there a problem with my accuracy code below? Any help pointing me in the right direction would be appreciated.

X = data[['radius_mean', 'texture_mean', 'perimeter_mean',
   'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
   'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
   'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
   'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
   'fractal_dimension_se', 'radius_worst', 'texture_worst',
   'perimeter_worst', 'area_worst', 'smoothness_worst',
   'compactness_worst', 'concavity_worst', 'concave points_worst',
   'symmetry_worst', 'fractal_dimension_worst']]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
Y = data["diagnosis"].map({'M':1,'B':0})
Y = np.array(Y)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25)

X = data["diagnosis"].map(lambda x: float(x))

def Sigmoid(z):
    if z < 0:
        return 1 - 1/(1 + math.exp(z))
    else:
        return 1/(1 + math.exp(-z))

def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i]*theta[i]
    return Sigmoid(z)

def Cost_Function(X,Y,theta,m):
    sumOfErrors = 0
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta,xi)
        error = Y[i] * math.log(hi if  hi >0 else 1)
        if Y[i] == 1:
            error = Y[i] * math.log(hi if  hi >0 else 1)
        elif Y[i] == 0:
            error = (1-Y[i]) * math.log(1-hi  if  1-hi >0 else 1)
        sumOfErrors += error

    constant = -1/m
    J = constant * sumOfErrors
    #print ('cost is: ', J ) 
    return J

def Cost_Function_Derivative(X,Y,theta,j,m,alpha):
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta,X[i])
        error = (hi - Y[i])*xij
        sumErrors += error
    m = len(Y)
    constant = float(alpha)/float(m)
    J = constant * sumErrors
    return J

def Gradient_Descent(X,Y,theta,m,alpha):
    new_theta = []
    constant = alpha/m
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X,Y,theta,j,m,alpha)
        new_theta_value = theta[j] - CFDerivative
        new_theta.append(new_theta_value)
    return new_theta


def Accuracy(theta):
    correct = 0
    length = len(X_test, Hypothesis(X,theta))
    for i in range(length):
        prediction = round(Hypothesis(X[i],theta))
        answer = Y[i]
    if prediction == answer.all():
            correct += 1
    my_accuracy = (correct / length)*100
    print ('LR Accuracy %: ', my_accuracy)



def Logistic_Regression(X,Y,alpha,theta,num_iters):
    theta = np.zeros(X.shape[1])
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X,Y,theta,m,alpha)
        theta = new_theta
        if x % 100 == 0:
            Cost_Function(X,Y,theta,m)
            print ('theta: ', theta)    
            print ('cost: ', Cost_Function(X,Y,theta,m))
    Accuracy(theta)

initial_theta = [0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]  
alpha = 0.0001
iterations = 1000
Logistic_Regression(X,Y,alpha,initial_theta,iterations)

This uses data from the Wisconsin breast cancer dataset (), in which I weigh up 30 features. Changing the features to ones known to be correlated also does not change my accuracy, though.

I'm not sure how you arrived at an alpha of 0.0001, but I think it is too low. Running your code on the cancer data shows that the cost is decreasing with every iteration; it is just doing so very slowly.
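To see this for yourself, here is a minimal sketch (reusing Gradient_Descent and Cost_Function from the question, plus the X_train/Y_train split) that sweeps a few learning rates and prints the cost each one reaches after the same number of iterations:

m = len(Y_train)
for alpha in (0.0001, 0.01, 0.5):
    # Start each run from all-zero weights so the comparison is fair.
    theta = [0.0] * X_train.shape[1]
    for _ in range(100):
        theta = Gradient_Descent(X_train, Y_train, theta, m, alpha)
    print('alpha:', alpha, 'cost:', Cost_Function(X_train, Y_train, theta, m))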

When I raised it to 0.5 I still got a decreasing cost, but at a more reasonable rate. After 1000 iterations it reported:

cost:  0.23668000993020666
After fixing the Accuracy function, I get 92% on the test segment of the data.
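For reference, a minimal sketch of what such a fix might look like: score on the held-out test split, pass the arguments to Hypothesis in the order it expects, and keep the comparison inside the loop.

def Accuracy(theta):
    correct = 0
    length = len(X_test)
    for i in range(length):
        # Hypothesis takes (theta, x); round the probability to a 0/1 label.
        prediction = round(Hypothesis(theta, X_test[i]))
        if prediction == Y_test[i]:  # compare per sample, inside the loop
            correct += 1
    my_accuracy = (correct / length) * 100
    print('LR Accuracy %: ', my_accuracy)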

You have Numpy installed, as shown by X = np.array(X). You should really consider using it for these operations; it will be orders of magnitude faster. Here is a vectorized version that gives results immediately instead of making you wait:

import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("cancerdata.csv")
X = df.values[:,2:-1].astype('float64')
X = (X - np.mean(X, axis =0)) /  np.std(X, axis = 0)

## Add a bias column to the data
X = np.hstack([np.ones((X.shape[0], 1)),X])
X = MinMaxScaler().fit_transform(X)
Y = df["diagnosis"].map({'M':1,'B':0})
Y = np.array(Y)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25)


def Sigmoid(z):
    return 1/(1 + np.exp(-z))

def Hypothesis(theta, x):   
    return Sigmoid(x @ theta) 

def Cost_Function(X,Y,theta,m):
    hi = Hypothesis(theta, X)
    _y = Y.reshape(-1, 1)
    J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
    return J

def Cost_Function_Derivative(X,Y,theta,m,alpha):
    hi = Hypothesis(theta,X)
    _y = Y.reshape(-1, 1)
    J = alpha/float(m) * X.T @ (hi - _y)
    return J

def Gradient_Descent(X,Y,theta,m,alpha):
    new_theta = theta - Cost_Function_Derivative(X,Y,theta,m,alpha)
    return new_theta

def Accuracy(theta):
    correct = 0
    length = len(X_test)
    prediction = (Hypothesis(theta, X_test) > 0.5)
    _y = Y_test.reshape(-1, 1)
    correct = prediction == _y
    my_accuracy = (np.sum(correct) / length)*100
    print ('LR Accuracy %: ', my_accuracy)

def Logistic_Regression(X,Y,alpha,theta,num_iters):
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X,Y,theta,m,alpha)
        theta = new_theta
        if x % 100 == 0:
            #print ('theta: ', theta)    
            print ('cost: ', Cost_Function(X,Y,theta,m))
    Accuracy(theta)

ep = .012

initial_theta = np.random.rand(X_train.shape[1],1) * 2 * ep - ep
alpha = 0.5
iterations = 2000
Logistic_Regression(X_train,Y_train,alpha,initial_theta,iterations)

I think I may have a different version of scikit, because I had to change the MinMaxScaler line to make it work. The result is that I can do 10K iterations in the blink of an eye, and applying the model to the test set gives an accuracy of about 97%.

Accuracy is one of the most intuitive performance measures: it is simply the ratio of correctly predicted observations to total observations. Higher accuracy means the model is performing better.

Accuracy = (TP + TN) / (TP + FP + FN + TN)

TP = True positives
TN = True negatives
FP = False positives
FN = False negatives
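As a quick numeric illustration (the counts below are made up):

TP, TN, FP, FN = 50, 40, 5, 5  # hypothetical confusion-matrix counts
accuracy = (TP + TN) / (TP + TN + FP + FN)
print(accuracy)  # 0.9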
Accuracy is only a good measure when the costs of your false positives and false negatives are similar. A better metric in that case is the F1 score, given by

F1-score = 2 * (Precision * Recall) / (Precision + Recall), where

Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
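A small sketch with toy labels (made up purely for illustration) showing how these metrics come out of sklearn.metrics:

from sklearn.metrics import precision_score, recall_score, f1_score

# Toy labels giving TP = 3, FP = 1, FN = 1 for the positive class.
y_true = [1, 1, 0, 1, 0, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]
precision = precision_score(y_true, y_pred)  # TP / (TP + FP) = 0.75
recall = recall_score(y_true, y_pred)        # TP / (TP + FN) = 0.75
print(precision, recall, f1_score(y_true, y_pred))  # F1 = 0.75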
Read more here.

The nice thing about machine learning in Python is that important modules like scikit-learn are open source, so you can always look at the actual code. See the scikit-learn metrics source code, which will give you an idea of how scikit-learn computes the accuracy score:

from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)
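For example, with a pair of toy label vectors (made up here), it returns the fraction of matching entries:

from sklearn.metrics import accuracy_score

y_true = [1, 0, 1, 1]
y_pred = [1, 0, 0, 1]  # 3 of the 4 predictions match
print(accuracy_score(y_true, y_pred))  # 0.75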

Python gives us this scikit-learn library to make our work easier; this is what worked for me:

from sklearn.metrics import accuracy_score
y_pred = log.predict(x_test)  # log is the fitted logistic regression model
score = accuracy_score(y_test, y_pred)

Consider using sklearn's accuracy_score to check whether it produces the same accuracy.

What is the all in answer.all() for? Why not simply if prediction == answer?