R 决策树:类别(二元)噪声为 100% 时分类近乎完美,而噪声为 99% 时预测准确率几乎为零(在 R 中尝试了 2 种算法)。为什么?
我正在使用一个具有二元类别的数据集,并用 R 测试噪声如何影响 RWeka 中决策树 J48 的性能。我向数据中添加了噪声,并在实验过程中使用了 0.01 到 0.5 的置信因子。由于类别是二元的,训练集的最差准确率为 50% 是完全合理的。现在……在验证时,当噪声达到 100% 时,有一些点的准确率几乎完美。我该如何解释这一点?红色表示训练集;蓝色表示验证集(80/20 划分)。准确率 =(正确预测的类别数 / 测试类别总数)× 100%,即被正确分类实例的百分比 (TP+TN)/(TP+TN+FP+FN),其中 TP、FN、FP 和 TN 分别表示真阳性、假阴性、假阳性和真阴性的数量。我使用的数据集是这个数据集的随机样本(10%)。这是我的代码:R 决策树:类别(二元)噪声为 100% 时分类近乎完美,而噪声为 99% 时预测准确率几乎为零(在 R 中尝试了 2 种算法)。为什么?,r,machine-learning,weka,cart,decision-tree,R,Machine Learning,Weka,Cart,Decision Tree,我正在使用一个具有二元类别的数据集,并用 R 测试噪声如何影响 RWeka 中决策树 J48 的性能。我向数据中添加了噪声,并在实验过程中使用了 0.01 到 0.5 的置信因子。由于类别是二元的,训练集的最差准确率为 50% 是完全合理的。现在……在验证时,当噪声达到 100% 时,有一些点的准确率几乎完美。我该如何解释这一点?红色表示训练集;蓝色表示验证集(80/20 划分)。准确率 =(正确预测的类别数 / 测试类别总数)× 100%,即被正确分类实例的百分比 (TP+TN)/(TP+TN+FP+FN)。其中TP、F
library(arules)
library(RWeka)
library(readr)
library(caret)
library(caTools)
# Resolve every relative path below against this folder. The location is
# machine-specific and must be adapted before running elsewhere.
setwd("C:\\Users\\Lucas\\Desktop\\AA")
# CSV file loaded as the experiment's input dataset.
inputFileName = "hr.csv"
# Inject label noise ("ruido"): flip the binary `left` column for `percentage`
# percent of the rows of a CSV file and write the result to a new CSV file.
#
# Args:
#   datasetName: path of a CSV file with a `left` column that `!` can negate
#                (0/1 or logical). The path is split on "." to build the
#                output name, so it is expected to contain exactly one dot.
#   percentage:  share of rows whose label is flipped, from 0 to 100.
#
# Returns:
#   Path of the written file, "<name>_ruido_<percentage>.<extension>".
ruido <- function(datasetName, percentage) {
  hr <- read_csv(datasetName)

  # sample.split() yields a logical mask; TRUE marks the rows to flip.
  flip_mask <- sample.split(hr$left, SplitRatio = percentage / 100)
  to_modify <- hr[flip_mask, ]
  not_modify <- hr[!flip_mask, ]

  # `!` returns logicals. Restore the column's original storage mode so the
  # written labels keep a single encoding even when no unmodified rows are
  # left to coerce them back (e.g. at 100 % noise), and so an empty selection
  # (0 % noise) stays a plain vector.
  flipped <- !to_modify$left
  storage.mode(flipped) <- storage.mode(hr$left)
  to_modify$left <- flipped

  # The original referenced an undefined `toNotModify` here.
  new_hr <- rbind(to_modify, not_modify)

  name_parts <- strsplit(datasetName, "[.]")[[1]]
  outputFileName <- paste0(
    name_parts[1], "_ruido_", percentage, ".", name_parts[2]
  )
  write.csv(new_hr, file = outputFileName, row.names = FALSE)
  outputFileName
}
# Experiment driver: for every noise level (0-100 %) and every J48 confidence
# factor, train a tree on the noisy training set and record its size, its
# accuracy on that training set and its accuracy on the clean hold-out set.
set.seed(101)
percentages <- seq(0, 100)
confidenceFactors <- seq(0.05, 0.5, 0.05)

# 80/20 split; rows flagged TRUE by sample.split() form the training set.
dataset <- read.csv(inputFileName, sep = ",")
split_mask <- sample.split(dataset$left, SplitRatio = 0.8)
train <- subset(dataset, split_mask == TRUE)
test <- subset(dataset, split_mask == FALSE)

# recursive = TRUE creates the parent "datasets" folder when it is missing;
# showWarnings = FALSE keeps re-runs quiet when the folder already exists.
data_dir <- file.path("datasets", "5")
dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
train_path <- file.path(data_dir, "HR_train.csv")
write.csv(train, file = train_path, row.names = FALSE)
write.csv(test, file = file.path(data_dir, "HR_test.csv"), row.names = FALSE)

# Predictors that are modelled as categorical rather than numeric.
factor_cols <- c("number_project", "time_spend_company")

# Collect one row per run and bind once at the end instead of growing the
# data frames inside the loop.
n_runs <- length(percentages) * length(confidenceFactors)
size_rows <- vector("list", n_runs)
accuracy_rows <- vector("list", n_runs)
run <- 0L

for (noise_pct in percentages) {
  # Results store the noise level as a fraction in [0, 1].
  percentage <- noise_pct / 100

  # Load the training set with `noise_pct` percent of its labels flipped.
  outputFile <- ruido(train_path, noise_pct)
  training <- read.csv(outputFile)

  # Give the hold-out set the same factor columns (and levels) as the training
  # set; otherwise the tree is evaluated on differently typed predictors.
  test_aligned <- test
  for (col in factor_cols) {
    training[[col]] <- as.factor(training[[col]])
    test_aligned[[col]] <- factor(
      test_aligned[[col]],
      levels = levels(training[[col]])
    )
  }

  for (cf in confidenceFactors) {
    run <- run + 1L

    # Create tree
    print(paste0("Decision tree for: ", "percentage=", percentage, ", CF=", cf))
    tree <- J48(
      as.factor(left) ~ .,
      training,
      control = Weka_control(M = 2, C = cf)
    )
    treeSize <- tree$classifier$numElements()
    size_rows[[run]] <- data.frame(
      CF = cf, Percentage = percentage, Value = treeSize
    )

    # Test tree
    pred <- predict(tree, test_aligned, type = "class")
    # First entry of the evaluation details of summary(tree), used here as the
    # training accuracy in percent.
    trainingAccuracy <- summary(tree)$details[[1]]
    # caret expects predictions in rows and the reference in columns.
    testAccuracy <- confusionMatrix(table(pred, test_aligned$left))

    accuracy_rows[[run]] <- rbind(
      data.frame(
        CF = cf, Percentage = percentage,
        Method = "Entrenamiento", Value = trainingAccuracy
      ),
      data.frame(
        CF = cf, Percentage = percentage,
        Method = "Validación", Value = testAccuracy$overall[[1]] * 100
      )
    )
  }
}

sizeResults <- do.call(rbind, size_rows)
accuracyResults <- do.call(rbind, accuracy_rows)

# NOTE(review): ggplot2 is never loaded explicitly in this file; presumably it
# is attached as a dependency of caret - confirm.
# Tree size (number of nodes) against the noise level.
ggplot(data = sizeResults, aes(x = Percentage, y = Value)) +
  geom_point() +
  ylab("Nodes") # Set axis label

# Training vs. validation accuracy against the noise level.
ggplot(
  data = accuracyResults,
  aes(x = Percentage, y = Value, group = Method, color = Method)
) +
  geom_point() +
  ylab("Accuracy") + # Set axis label
  scale_colour_hue(name = "") # Blank legend title
库(阿鲁莱斯)
图书馆(鲁韦卡)
图书馆(readr)
图书馆(插入符号)
图书馆(caTools)
setwd(“C:\\Users\\Lucas\\Desktop\\AA”)
inputFileName=“hr.csv”
ruido=函数(数据集名称,百分比){
hr=读取\u csv(数据集名称)
样本=样本.分割(hr$左,分割比率=百分比/100)
toModify=子集(hr,样本==真)
NotModify=子集(hr,sample==FALSE)
toModify$left=sapply(toModify$left,函数(x)!x)
新建\u hr=rbind(toModify,toNotModify)
输出=strsplit(datasetName,“[.]”)
outputFileName=paste0(输出[[1]][1],'百分比','输出[[1]][2])
write.csv(new_hr,file=outputFileName,row.names=FALSE)
输出文件名
}
种子集(101)
百分比=序号(0,100)
信心系数=序号(0.05,0.5,0.05)
dataset=read.csv(inputFileName,sep=“,”)
sample=sample.split(数据集$left,SplitRatio=.8)
序列=子集(数据集,样本==真)
测试=子集(数据集,样本==FALSE)
目录创建(文件路径(“C:\\Users\\Lucas\\Desktop\\AA”,“数据集\\5”))
write.csv(train,file=“datasets\\5\\HR\u train.csv”,row.names=FALSE)
write.csv(test,file=“datasets\\5\\HR\u test.csv”,row.names=FALSE)
sizeResults=data.frame(“CF”=double(),“Percentage”=double(),“Value”=integer(),stringsAsFactors=FALSE)
accuracyResults=data.frame(“CF”=double(),“Method”=character(),“Percentage”=double(),“Value”=integer(),stringsAsFactors=FALSE)
用于(百分比中的百分比){
百分比=百分比/100
#负荷训练数据集
outputFile=ruido(“数据集\\5\\HR\u train.csv”,百分比*100)
培训=读取.csv(输出文件)
培训$number\u项目=as.factor(培训$number\u项目)
培训$time\u-spend\u-company=as.factor(培训$time\u-spend\u-company)
对于(c)信心因素{
#创建树
打印(粘贴0(“决策树:”,“百分比=”,百分比,”,CF=,c))
tree=J48(同因子(左)~,训练,控制=Weka_控制(M=2,C=C))
treeSize=树$classifier$numElements()
sizeResults=rbind(sizeResults,data.frame(CF=c,Percentage=Percentage,Value=treeSize))
#测试树
你可能想问这个问题,而完全相同的问题并不是真的被嘲笑