Python 2.7 Python中的随机森林
我用python运行了一个随机森林模型,能够看到分类表。但我希望能得到全面的代码,从python中的数据准备、模型运行、模型验证和准确性检查代码开始,涵盖所有方面? 我的模型中有很多假阳性。任何有助于改进这一点的帮助都将非常有用。请参阅Python 2.7 python中的随机林,python-2.7,scipy,random-forest,Python 2.7,Scipy,Random Forest,我用python运行了一个随机森林模型,能够看到分类表。但我希望能得到全面的代码,从python中的数据准备、模型运行、模型验证和准确性检查代码开始,涵盖所有方面? 我的模型中有很多假阳性。任何有助于改进这一点的帮助都将非常有用。请参阅 import urllib2 import numpy from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import ac
import urllib2
import numpy
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random
from math import sqrt
import matplotlib.pyplot as plot
# Define function confusion matrix
def confusionMatrix(predicted, actual, threshold):
    """Count the four confusion-matrix cells for thresholded predictions.

    An example is an actual positive when its label is greater than 0.5.
    It is a predicted positive when its score is strictly greater than
    ``threshold``; a score equal to ``threshold`` is a predicted negative
    for both classes (previously a tie was counted as an error whichever
    class the example belonged to).

    Args:
        predicted: sequence of numeric scores, one per example.
        actual: sequence of numeric labels, aligned with ``predicted``.
        threshold: cut-off applied to each score.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length (error code kept for existing callers).
    """
    if len(predicted) != len(actual):
        return -1
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for score, label in zip(predicted, actual):
        predictedPositive = score > threshold
        if label > 0.5:
            # actual positive example
            if predictedPositive:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # incorrectly predicted negative
        else:
            # actual negative example
            if predictedPositive:
                fp += 1.0  # incorrectly predicted positive
            else:
                tn += 1.0  # correctly predicted negative
    return [tp, fn, fp, tn]
# Download the comma-separated data file and parse it into feature rows
# (xList) and labels (labels); the label is the last column of each line.
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data = urllib2.urlopen(target_url)
xList = []
labels = []
names = []
firstline = True
try:
    for line in data:
        # skip blank lines (e.g. a trailing newline) instead of failing in float('')
        if not line.strip():
            continue
        # split the row on the "," separator
        row = line.strip().split(",")
        # the label is the last column
        labels.append(float(row[-1]))
        # remove the label from the row
        row.pop()
        # remaining columns form the feature vector
        floatRow = [float(num) for num in row]
        xList.append(floatRow)
finally:
    # release the HTTP response even when parsing fails
    data.close()
if not xList:
    # fail with a clear message rather than an IndexError on xList[0]
    raise ValueError("no data rows read from " + target_url)
nrows = len(xList)
ncols = len(xList[0])
# Split the data: a fixed-seed random 30% of the rows becomes the test set,
# the remaining rows (in their original order) become the training set.
random.seed(1)
nSample = int(nrows * 0.30)
idxTest = random.sample(range(nrows), nSample)
idxTest.sort()
# set lookup keeps the membership test O(1) per row instead of scanning the list
testIdxSet = set(idxTest)
idxTrain = [idx for idx in range(nrows) if idx not in testIdxSet]
xTrain = [xList[r] for r in idxTrain]
xTest = [xList[r] for r in idxTest]
yTrain = [labels[r] for r in idxTrain]
yTest = [labels[r] for r in idxTest]
# Train the ensemble: each tree is fit on a random subset of nAttr columns
# and a with-replacement sample of half the training rows, then used to
# predict the test set.
numTreesMax = 30
treeDepth = 12
nAttr = 4
modelList = []   # fitted trees, in training order
indexList = []   # column indices used by each tree
predList = []    # test-set predictions of each tree
nTrainRows = len(yTrain)
# built once; every bootstrap draw below picks from the same index range
trainRowRange = range(len(xTrain))
for iTrees in range(numTreesMax):
    # random subset of attributes for this tree
    idxAttr = random.sample(range(ncols), nAttr)
    idxAttr.sort()
    indexList.append(idxAttr)

    # sample training rows with replacement
    idxRows = []
    for i in range(int(0.5 * nTrainRows)):
        idxRows.append(random.choice(trainRowRange))
    idxRows.sort()

    # restrict the sampled rows to the selected attributes
    xRFTrain = []
    yRFTrain = []
    for i in range(len(idxRows)):
        temp = [xTrain[idxRows[i]][j] for j in idxAttr]
        xRFTrain.append(temp)
        yRFTrain.append(yTrain[idxRows[i]])

    modelList.append(DecisionTreeClassifier(max_depth = treeDepth))
    modelList[-1].fit(xRFTrain, yRFTrain)

    # the test rows must be restricted to the same attributes
    xRFTest = []
    for xx in xTest:
        temp = [xx[i] for i in idxAttr]
        xRFTest.append(temp)

    latestOutSAmplePrediction = modelList[-1].predict(xRFTest)
    predList.append(list(latestOutSAmplePrediction))
# Evaluate the ensemble as trees are added one at a time: average the
# predictions of the first k trees, threshold at 0.5 and record the error.
# NOTE(review): confusionMatrix treats labels > 0.5 as positive and all
# others as negative. If the labels in this data file take more than two
# distinct values, averaging predicted labels and thresholding at 0.5 does
# not measure classification error -- confirm the label encoding.
classerror = []
allPredictions = []
for iModels in range(len(modelList)):
    nTreesUsed = iModels + 1
    prediction = []
    for iPred in range(len(xTest)):
        votes = [predList[i][iPred] for i in range(nTreesUsed)]
        # float divisor guarantees true division whatever the vote type is
        prediction.append(sum(votes) / float(nTreesUsed))
    allPredictions.append(prediction)
    conMatTest = confusionMatrix(prediction, yTest, 0.5)
    # conMatTest is [tp, fn, fp, tn]; correct predictions are tp + tn
    nCorrect = conMatTest[0] + conMatTest[3]
    nTotal = conMatTest[0] + conMatTest[1] + conMatTest[2] + conMatTest[3]
    errors = 1.0 - (nCorrect / nTotal)
    classerror.append(errors)
# Plot the test-set error against the number of trees in the ensemble.
nModels = list(range(1, len(modelList) + 1))
plot.plot(nModels, classerror)
plot.axis('tight')
plot.xlabel('Number of Trees in Ensemble')  # label typo "Ensamble" corrected
plot.ylabel('Class Error')
plot.ylim((0.0, max(classerror)))
plot.show()
导入urllib2
进口numpy
从sklearn导入树
从sklearn.tree导入DecisionTreeClassifier
从sklearn.metrics导入准确性\u分数
随机输入
从数学导入sqrt
将matplotlib.pyplot导入为绘图
#定义函数混淆矩阵
def混淆矩阵(预测、实际、阈值):
如果len(预测)!=len(实际):返回-1
tp=0.0
fp=0.0
tn=0.0
fn=0.0
对于范围内的i(len(实际)):
如果实际[i]>0.5:#标签为1.0(正面示例)
如果预测[i]>阈值:
tp+=1.0#正确预测为正
其他:
fn+=1.0#错误预测为负
其他:#0.0的标签(负面示例)
如果预测[i]<阈值:
tn+=1.0#正确预测为负
其他:
fp+=1.0#错误预测为正
rtn=[tp,fn,fp,tn]
返回rtn
#Python的超链接
目标url=(“https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data=urlib2.urlopen(目标url)
xList=[]
标签=[]
名称=[]
firstline=True
对于行输入数据:
#按“,”号排成一行
行=行.strip().split(“,”)
#将标签指定为最后一列
labels.append(float(行[-1]))
#从行中删除标签
row.pop()
#特征向量
floatRow=[行中num的float(num)]
#在xList上追加
xList.append(floatRow)
nrows=len(xList)
ncols=len(xList[0])
#用于测试和训练的分割数据
随机种子(1)
nSample=int(nrows*0.30)
idxTest=随机样本(范围(nrows),nSample)
idxTest.sort()
idxTrain=[idx代表范围内的idx(nrows),如果不是(idx代表范围内的idx)]
xTrain=[xList[r]表示idxTrain中的r]
xTest=[xList[r]表示idxTest中的r]
yTrain=[idxTrain中r的标签[r]
yTest=[idxTest中r的标签[r]
numTreesMax=30
树顶=12
nAttr=4
模型列表=[]
索引列表=[]
predList=[]
nTrainRows=len(yTrain)
对于范围内的iTrees(numTreesMax):
idxAttr=随机样本(范围(ncols),nAttr)
idxAttr.sort()
indexList.append(idxAttr)
idxRows=[]
对于范围内的i(int(0.5*nTrainRows)):
追加(random.choice(范围(len(xTrain)))
idxRows.sort()
xRFTrain=[]
yRFTrain=[]
对于范围内的i(len(idxRows)):
temp=[xTrain[idxRows[i]][j]表示idxAttr中的j]
XRF列车附加(温度)
附加(yTrain[idxRows[i]]
追加(DecisionTreeClassifier(最大深度=树深度))
模型列表[-1].fit(XRF列车,yRFTrain)
xRFTest=[]
对于xTest中的xx:
temp=[xx[i]表示idxAttr中的i]
xRFTest.append(临时)
latestOutSAmplePrediction=modelList[-1]。预测(xRFTest)
append(list(latestOutSAmplePrediction))
classerror=[]
所有预测=[]
对于范围内的iModels(len(modelList)):
预测=[]
对于范围内的IPED(len(xTest)):
prediction.append(范围内i的和([predList[i][iPred])/(iModels+1))
allPredictions.append(预测)
conMatTest=混淆矩阵(预测,Y测试,0.5)
错误=1.0-((conMatTest[0]+conMatTest[3])/(conMatTest[0]+conMatTest[1]+conMatTest[2]+conMatTest[3]))
classerror.append(错误)
nModels=[i+1表示范围内的i(len(modelList))]
plot.plot(nModels,classerror)
绘图轴(“紧”)
plot.xlabel(‘Ensamble’中的树数)
plot.ylabel('类错误')
plot.ylim((0.0,最大值(类错误)))
plot.show()
您需要在问题中补充更多信息。如果我只说“您需要修改第五行的代码并添加一个 if”——这是您想要的那类回答吗?请先明确您具体想问什么。以下是一些链接: