Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/66.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 决策树:完美分类(二分法)类噪声为100%,但几乎为零预测噪声为99%。(在右侧尝试了2个alg)。为什么?_R_Machine Learning_Weka_Cart_Decision Tree - Fatal编程技术网

R 决策树:完美分类(二分法)类噪声为100%,但几乎为零预测噪声为99%。(在右侧尝试了2个alg)。为什么?

R 决策树:完美分类(二分法)类噪声为100%,但几乎为零预测噪声为99%。(在右侧尝试了2个alg)。为什么?,r,machine-learning,weka,cart,decision-tree,R,Machine Learning,Weka,Cart,Decision Tree,我正在使用一个具有二分法类的数据集,并使用R-performance测试噪声如何影响来自Rweka的决策树j48。我添加了噪声,并在实验过程中使用了0.01到0.5的置信因子 由于班级是二分法的,训练集的最差表现为50%是班级二分法是完全合理的 现在。。。随着验证的进行,当噪声为总噪声时,我有一些精度几乎完美的点。我怎么解释呢 红色表示训练集;蓝色用于验证(80/20) 准确度=(正确预测等级/总测试等级)×100%或正确分类实例的百分比(TP+TN)/(TP+TN+FP+FN)。其中TP、F

我正在使用一个具有二分法类的数据集,并使用R-performance测试噪声如何影响来自Rweka的决策树j48。我添加了噪声,并在实验过程中使用了0.01到0.5的置信因子

由于班级是二分法的,训练集的最差表现为50%是班级二分法是完全合理的

现在。。。随着验证的进行,当噪声为总噪声时,我有一些精度几乎完美的点。我怎么解释呢

红色表示训练集;蓝色用于验证(80/20)

准确度=(正确预测等级/总测试等级)×100%或正确分类实例的百分比(TP+TN)/(TP+TN+FP+FN)。其中TP、FN、FP和TN分别表示真阳性、假阴性、假阳性和真阴性的数量

我使用的数据集是这个数据集的随机样本(10%)

这是我的代码:

library(arules)
library(RWeka)
library(readr)
library(caret)
library(caTools)


setwd("C:\\Users\\Lucas\\Desktop\\AA")
inputFileName = "hr.csv"

ruido = function( datasetName, percentage ) {
  hr = read_csv(datasetName)

  sample = sample.split(hr$left, SplitRatio = percentage / 100)
  toModify = subset(hr, sample == TRUE)
  NotModify = subset(hr, sample == FALSE)

  toModify$left = sapply(toModify$left, function(x) !x )

  new_hr = rbind(toModify, toNotModify)

  output = strsplit(datasetName, "[.]")
  outputFileName = paste0(output[[1]][1], '_ruido_', percentage, '.', output[[1]][2])

  write.csv(new_hr, file = outputFileName, row.names=FALSE)
  outputFileName
}

set.seed(101) 
percentages = seq(0, 100 )
confidenceFactors = seq(0.05, 0.5, 0.05)

dataset = read.csv( inputFileName, sep = "," )

sample = sample.split(dataset$left, SplitRatio = .8)
train = subset(dataset, sample == TRUE)
test  = subset(dataset, sample == FALSE)

dir.create(file.path("C:\\Users\\Lucas\\Desktop\\AA", "datasets\\5"))
write.csv(train, file = "datasets\\5\\HR_train.csv", row.names=FALSE)
write.csv(test, file = "datasets\\5\\HR_test.csv", row.names=FALSE)

sizeResults = data.frame( "CF" = double(), "Percentage" = double(), "Value" = integer(), stringsAsFactors=FALSE)
accuracyResults = data.frame( "CF" = double(), "Method" = character(), "Percentage" = double(), "Value" = integer(), stringsAsFactors=FALSE)

for( percentage in percentages ){

    percentage = percentage / 100

    # load training dataset
    outputFile = ruido( "datasets\\5\\HR_train.csv", percentage * 100 )
    training = read.csv( outputFile )
    training$number_project = as.factor( training$number_project )
    training$time_spend_company = as.factor( training$time_spend_company )


    for( c in confidenceFactors ){
      # create tree
      print( paste0("Decision tree for: ", "percentage=", percentage, ", CF=", c) )
      tree = J48( as.factor(left) ~ ., training, control = Weka_control(M=2, C=c) )
      treeSize = tree$classifier$numElements()
      sizeResults = rbind(sizeResults, data.frame(CF=c, Percentage=percentage, Value=treeSize))

      # test tree
      pred <- predict(tree, test, type='class')
      trainingAccuracy = summary(tree)$details[[1]]
      testAccuracy = confusionMatrix(table(test$left, pred))
      accuracyResults = rbind(accuracyResults, data.frame(CF=c, Percentage=percentage, Method="Entrenamiento", Value=trainingAccuracy))
      accuracyResults = rbind(accuracyResults, data.frame(CF=c, Percentage=percentage, Method="Validaci�n", Value=testAccuracy$overall[[1]] * 100))
    }
}


ggplot(data=sizeResults, aes(x=Percentage, y=Value)) +
  geom_point() +
  ylab("Nodes") + # Set axis labels
  scale_colour_hue(name="")       # Set legend title  

ggplot(data=accuracyResults, aes(x=Percentage, y=Value, group=Method, color=Method)) +
  geom_point() +
  ylab("Accuracy") + # Set axis labels
  scale_colour_hue(name="") 
库(阿鲁莱斯)
图书馆(鲁韦卡)
图书馆(readr)
图书馆(插入符号)
图书馆(caTools)
setwd(“C:\\Users\\Lucas\\Desktop\\AA”)
inputFileName=“hr.csv”
ruido=函数(数据集名称,百分比){
hr=读取\u csv(数据集名称)
样本=样本.分割(hr$左,分割比率=百分比/100)
toModify=子集(hr,样本==真)
NotModify=子集(hr,sample==FALSE)
toModify$left=sapply(toModify$left,函数(x)!x)
新建\u hr=rbind(toModify,toNotModify)
输出=strsplit(datasetName,“[.]”)
outputFileName=paste0(输出[[1]][1],'百分比','输出[[1]][2])
write.csv(new_hr,file=outputFileName,row.names=FALSE)
输出文件名
}
种子集(101)
百分比=序号(0,100)
信心系数=序号(0.05,0.5,0.05)
dataset=read.csv(inputFileName,sep=“,”)
sample=sample.split(数据集$left,SplitRatio=.8)
序列=子集(数据集,样本==真)
测试=子集(数据集,样本==FALSE)
目录创建(文件路径(“C:\\Users\\Lucas\\Desktop\\AA”,“数据集\\5”))
write.csv(train,file=“datasets\\5\\HR\u train.csv”,row.names=FALSE)
write.csv(test,file=“datasets\\5\\HR\u test.csv”,row.names=FALSE)
sizeResults=data.frame(“CF”=double(),“Percentage”=double(),“Value”=integer(),stringsAsFactors=FALSE)
accuracyResults=data.frame(“CF”=double(),“Method”=character(),“Percentage”=double(),“Value”=integer(),stringsAsFactors=FALSE)
用于(百分比中的百分比){
百分比=百分比/100
#负荷训练数据集
outputFile=ruido(“数据集\\5\\HR\u train.csv”,百分比*100)
培训=读取.csv(输出文件)
培训$number\u项目=as.factor(培训$number\u项目)
培训$time\u-spend\u-company=as.factor(培训$time\u-spend\u-company)
对于(c)信心因素{
#创建树
打印(粘贴0(“决策树:”,“百分比=”,百分比,”,CF=,c))
tree=J48(同因子(左)~,训练,控制=Weka_控制(M=2,C=C))
treeSize=树$classifier$numElements()
sizeResults=rbind(sizeResults,data.frame(CF=c,Percentage=Percentage,Value=treeSize))
#测试树

你可能想问这个问题,而完全相同的问题并不是真的被嘲笑