Python 2.7 Python中的随机森林

Python 2.7 Python中的随机森林,python-2.7,scipy,random-forest,Python 2.7,Scipy,Random Forest,我用python运行了一个随机森林模型,能够看到分类表。但我希望能得到全面的代码,从python中的数据准备、模型运行、模型验证和准确性检查代码开始,涵盖所有方面? 我的模型中有很多假阳性。任何有助于改进这一点的帮助都将非常有用。请参阅 import urllib2 import numpy from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import ac

我用 Python 运行了一个随机森林模型,并且能够看到分类表(混淆矩阵)。不过我希望得到一份完整的代码,涵盖 Python 中从数据准备、模型训练、模型验证到准确率检查的所有环节。我的模型中有很多假阳性,任何有助于改进这一点的建议都将非常有用。

请参阅以下代码:

import urllib2
import numpy
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random
from math import sqrt
import matplotlib.pyplot as plot


# Define function confusion matrix
def confusionMatrix(predicted, actual, threshold):
    """Tally binary confusion-matrix counts for thresholded scores.

    A score counts as a positive prediction when it is strictly greater
    than ``threshold``; every other score (including one exactly equal to
    ``threshold``) counts as a negative prediction. A label counts as a
    positive example when it is greater than 0.5.

    Args:
        predicted: sequence of numeric scores, one per example.
        actual: sequence of numeric labels, same length as ``predicted``.
        threshold: cut-off applied to each score.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length.
    """
    # Sentinel return (rather than an exception) kept for existing callers.
    if len(predicted) != len(actual):
        return -1

    tp = fp = tn = fn = 0.0
    for score, label in zip(predicted, actual):
        # Decide the predicted class once, independently of the true label,
        # so a score sitting exactly on the threshold is classified the same
        # way for positive and negative examples.
        predictedPositive = score > threshold
        if label > 0.5:  # labels that are 1.0 (positive examples)
            if predictedPositive:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # incorrectly predicted negative
        else:  # labels that are 0.0 (negative examples)
            if predictedPositive:
                fp += 1.0  # incorrectly predicted positive
            else:
                tn += 1.0  # correctly predicted negative
    return [tp, fn, fp, tn]



# Location of the comma-separated data set to download
target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra"
data = urllib2.urlopen(target_url)

xList = []
labels = []
names = []
firstline = True

for line in data:
    # Trim surrounding whitespace, then break the record on ","
    row = line.strip().split(",")
    # The last field is the label: take it off the row and store it as a float
    labels.append(float(row.pop()))
    # Whatever is left is the numeric feature vector for this record
    floatRow = list(map(float, row))
    xList.append(floatRow)


nrows = len(xList)
ncols = len(xList[0])

# Split Data for Test and Train: hold out a reproducible random 30% of the
# rows for testing and keep every remaining row for training.
random.seed(1)
nSample = int(nrows * 0.30)
idxTest = random.sample(range(nrows), nSample)
idxTest.sort()
# Set membership is O(1); scanning the idxTest list once per row made this
# step quadratic in the number of rows.
testIdxSet = set(idxTest)
idxTrain = [idx for idx in range(nrows) if idx not in testIdxSet]

xTrain = [xList[r] for r in idxTrain]
xTest = [xList[r] for r in idxTest]
yTrain = [labels[r] for r in idxTrain]
yTest = [labels[r] for r in idxTest]



# Ensemble settings
numTreesMax = 30   # number of trees to grow
treeDepth = 12     # max_depth passed to every DecisionTreeClassifier
nAttr = 4          # number of attribute columns drawn for each tree

modelList = []   # fitted trees, in training order
indexList = []   # attribute indices used by the tree at the same position
predList = []    # test-set predictions of the tree at the same position
nTrainRows = len(yTrain)

# NOTE(review): labels are loaded as raw floats from the last data column,
# while confusionMatrix treats any label > 0.5 as "positive" - confirm the
# labels really are binary (0/1) before trusting the error figures.

# Loop invariants, computed once instead of on every bootstrap draw:
# each tree trains on half as many rows as the training set holds, drawn
# from the pool of all training-row indices.
nBootstrap = int(0.5 * nTrainRows)
trainRowPool = range(len(xTrain))

for iTrees in range(numTreesMax):
    # Random subset of attribute columns for this tree
    idxAttr = random.sample(range(ncols), nAttr)
    idxAttr.sort()
    indexList.append(idxAttr)

    # Training rows for this tree, drawn with replacement
    idxRows = [random.choice(trainRowPool) for _ in range(nBootstrap)]
    idxRows.sort()

    # Restrict the sampled rows to the chosen attribute columns
    xRFTrain = [[xTrain[r][j] for j in idxAttr] for r in idxRows]
    yRFTrain = [yTrain[r] for r in idxRows]

    modelList.append(DecisionTreeClassifier(max_depth=treeDepth))
    modelList[-1].fit(xRFTrain, yRFTrain)

    # The test rows must be reduced to the same attribute columns
    xRFTest = [[xx[j] for j in idxAttr] for xx in xTest]

    latestOutSAmplePrediction = modelList[-1].predict(xRFTest)
    predList.append(list(latestOutSAmplePrediction))



# Score every ensemble prefix: the first tree alone, the first two trees, ...
classerror = []       # test-set class error for each ensemble size
allPredictions = []   # averaged test-set predictions for each ensemble size
for nUsed in range(1, len(modelList) + 1):
    # Per test row, the mean output of the first nUsed trees
    usedPreds = predList[:nUsed]
    prediction = [sum([treePreds[iRow] for treePreds in usedPreds]) / nUsed
                  for iRow in range(len(xTest))]
    allPredictions.append(prediction)

    # Error = 1 - (correct predictions / all predictions) at threshold 0.5
    conMatTest = confusionMatrix(prediction, yTest, 0.5)
    tp, fn, fp, tn = conMatTest
    errors = 1.0 - ((tp + tn) / (tp + fn + fp + tn))
    classerror.append(errors)





# Plot test-set class error against ensemble size (1 .. number of trees)
nModels = list(range(1, len(modelList) + 1))

plot.plot(nModels, classerror)
plot.axis('tight')
plot.xlabel('Number of Trees in Ensamble')
plot.ylabel('Class Error')
# Start the y axis at zero and end it at the largest observed error
plot.ylim((0.0, max(classerror)))
plot.show()
导入urllib2
进口numpy
从sklearn导入树
从sklearn.tree导入DecisionTreeClassifier
从sklearn.metrics导入准确性\u分数
随机输入
从数学导入sqrt
将matplotlib.pyplot导入为绘图
#定义函数混淆矩阵
def混淆矩阵(预测、实际、阈值):
如果len(预测)!=len(实际):返回-1
tp=0.0
fp=0.0
tn=0.0
fn=0.0
对于范围内的i(len(实际)):
如果实际[i]>0.5:#标签为1.0(正面示例)
如果预测[i]>阈值:
tp+=1.0#正确预测为正
其他:
fn+=1.0#错误预测为负
其他:#0.0的标签(负面示例)
如果预测[i]<阈值:
tn+=1.0#正确预测为负
其他:
fp+=1.0#错误预测为正
rtn=[tp,fn,fp,tn]
返回rtn
#Python的超链接
目标url=(“https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data=urlib2.urlopen(目标url)
xList=[]
标签=[]
名称=[]
firstline=True
对于行输入数据:
#按“,”号排成一行
行=行.strip().split(“,”)
#将标签指定为最后一列
labels.append(float(行[-1]))
#从行中删除标签
row.pop()
#特征向量
floatRow=[行中num的float(num)]
#在xList上追加
xList.append(floatRow)
nrows=len(xList)
ncols=len(xList[0])
#用于测试和训练的分割数据
随机种子(1)
nSample=int(nrows*0.30)
idxTest=随机样本(范围(nrows),nSample)
idxTest.sort()
idxTrain=[idx代表范围内的idx(nrows),如果不是(idx代表范围内的idx)]
xTrain=[xList[r]表示idxTrain中的r]
xTest=[xList[r]表示idxTest中的r]
yTrain=[idxTrain中r的标签[r]
yTest=[idxTest中r的标签[r]
numTreesMax=30
树顶=12
nAttr=4
模型列表=[]
索引列表=[]
predList=[]
nTrainRows=len(yTrain)
对于范围内的iTrees(numTreesMax):
idxAttr=随机样本(范围(ncols),nAttr)
idxAttr.sort()
indexList.append(idxAttr)
idxRows=[]
对于范围内的i(int(0.5*nTrainRows)):
追加(random.choice(范围(len(xTrain)))
idxRows.sort()
xRFTrain=[]
yRFTrain=[]
对于范围内的i(len(idxRows)):
temp=[xTrain[idxRows[i]][j]表示idxAttr中的j]
XRF列车附加(温度)
附加(yTrain[idxRows[i]]
追加(DecisionTreeClassifier(最大深度=树深度))
模型列表[-1].fit(XRF列车,yRFTrain)
xRFTest=[]
对于xTest中的xx:
temp=[xx[i]表示idxAttr中的i]
xRFTest.append(临时)
latestOutSAmplePrediction=modelList[-1]。预测(xRFTest)
append(list(latestOutSAmplePrediction))
classerror=[]
所有预测=[]
对于范围内的iModels(len(modelList)):
预测=[]
对于范围内的IPED(len(xTest)):
prediction.append(范围内i的和([predList[i][iPred])/(iModels+1))
allPredictions.append(预测)
conMatTest=混淆矩阵(预测,Y测试,0.5)
错误=1.0-((conMatTest[0]+conMatTest[3])/(conMatTest[0]+conMatTest[1]+conMatTest[2]+conMatTest[3]))
classerror.append(错误)
nModels=[i+1表示范围内的i(len(modelList))]
plot.plot(nModels,classerror)
绘图轴(“紧”)
plot.xlabel(‘Ensamble’中的树数)
plot.ylabel('类错误')
plot.ylim((0.0,最大值(类错误)))
plot.show()

您需要在问题中补充更多信息。如果我说“您需要修改第五行的代码并添加一个 if”——这类具体的问题才是您需要提出并得到解答的。以下是一些链接: