C# 如何在Accord.Net中同步训练和测试码本
问题:是否有将训练集和测试集分开的随机森林示例?我在 Accord.NET 机器学习测试项目中找到的现有示例使用完全相同的数据进行训练和测试。
标签:c#, tree, random-forest, accord.net
显然,我遇到的问题是如何让测试集和训练集生成的标签(int)保持同步。我按如下方式生成训练标签:
int[] trainOutputs = trainCodebook.Translate("Output", trainLabels);
And the test labels similarly:
int[] testOutputs = testCodebook.Translate("Output", testLabels);
Finally I train with the train data and test with the test data:
var forest = teacher.Learn(trainVectors, trainOutputs);
int[] predicted = forest.Decide(testVectors);
除非训练集和测试集的前三行相同,否则两个码本为同一类别生成的整数标签并不一致,因此会产生非常高的错误率。
我尝试用三个类别字符串手动创建码本:
new Codification("-1","0","1");
不幸的是,这会产生一个运行时错误,提示给定的键不在字典中。我确信有办法让两个独立的码本以相同的方式生成键。如果我在测试数据的顶部添加三行包含全部三个键的训练数据,下面的代码就可以正常工作。但这不是我喜欢的解决方案 ;=)
以下是我正在运行的整个测试:
/// <summary>
/// Trains a random forest on the training resource and checks its zero-one
/// loss on the separate test resource.
/// </summary>
/// <remarks>
/// Both the training and the test labels are encoded with the SAME codebook
/// (the one built from the training labels). Building a second codebook from
/// the test labels can assign different integers to the same label text, so
/// the forest's predictions and the expected test outputs would no longer be
/// comparable and the measured error would be meaningless.
/// </remarks>
[Test]
public void test_learn()
{
    Accord.Math.Random.Generator.Seed = 1;

    /////////// TRAINING SET ///////////
    // Load the TRAINING set into an array of text that we can process.
    string[][] text = Resources.train.Split(new[] { "\r\n" },
        StringSplitOptions.RemoveEmptyEntries).Apply(x => x.Split(','));

    // Every column except the first one is a feature column.
    int length = text[0].Length;
    List<int> columns = new List<int>();
    for (int i = 1; i < length; i++)
    {
        columns.Add(i);
    }
    int[] featureColumns = columns.ToArray();

    double[][] trainVectors = text.GetColumns(featureColumns).To<double[][]>();

    // The first column contains the expected ternary category (i.e. -1, 0, or 1).
    string[] trainLabels = text.GetColumn(0);
    var trainCodebook = new Codification("Output", trainLabels);
    int[] trainOutputs = trainCodebook.Translate("Output", trainLabels);

    ////////// TEST SET ////////////
    text = Resources.test.Split(new[] { "\r\n" },
        StringSplitOptions.RemoveEmptyEntries).Apply(x => x.Split(','));
    double[][] testVectors = text.GetColumns(featureColumns).To<double[][]>();
    string[] testLabels = text.GetColumn(0);

    // Reuse the training codebook so a given label text maps to the same
    // integer in both sets. NOTE(review): a test label that never occurs in
    // the training data is presumably rejected by Translate - confirm.
    int[] testOutputs = trainCodebook.Translate("Output", testLabels);

    var teacher = new RandomForestLearning()
    {
        NumberOfTrees = 10,
    };
    var forest = teacher.Learn(trainVectors, trainOutputs);
    int[] predicted = forest.Decide(testVectors);

    int lineNum = 1;
    foreach (int prediction in predicted)
    {
        Console.WriteLine("Prediction " + lineNum + ": "
            + trainCodebook.Translate("Output", prediction));
        lineNum++;
    }

    // The error rate is computed on the test vectors, reusing the
    // predictions obtained above instead of classifying them a second time.
    double error = new ZeroOneLoss(testOutputs).Loss(predicted);
    Console.WriteLine("Error term is " + error);
    Assert.IsTrue(error < 0.20); // humble expectations ;-)
}
[测试]
公共无效测试_learn()
{
Accord.Math.Random.Generator.Seed=1;
///////////训练集///////////
//首先,让我们将训练集加载到可以处理的文本数组中
字符串[][]text=Resources.train.Split(新[]{“\r\n”},
StringSplitOptions.RemoveEmptyEntries.Apply(x=>x.Split(',');
int length=文本[0]。长度;
列表列=新列表();
for(int i=1;ix.Split(',');
double[]testVectors=text.GetColumns(columns.ToArray()).To();
string[]testLabels=text.GetColumn(0);
var testCodebook=新编码(“输出”,testLabels);
int[]testOutputs=testCodebook.Translate(“输出”,testLabels);
var teacher=new RandomForestLearning()
{
NumberOfTrees=10,
};
var forest=教师学习(培训向量、培训输出);
int[]predicted=forest.decise(testVectors);
int lineNum=1;
foreach(预测中的整数预测)
{
Console.WriteLine(“预测”+lineNum+”:“
+trainCodebook.Translate(“输出”,预测));
lineNum++;
}
//我用测试向量来计算错误率
双重错误=新的ZeroOneLoss(testOutputs).Loss(forest.Decise(testVectors));
Console.WriteLine(“错误项为”+错误);
Assert.IsTrue(错误<0.20);//谦逊的期望;-)
}
好吧,我想出来了。请参阅下面的代码:
好吧,我想我已经把它修好了。问题出在 DecisionTree 中序列化的实现有误。幸运的是,我们有源代码——修复后的代码见下文:
namespace Accord.MachineLearning.DecisionTrees
{
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Data;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.IO;
using Accord.Statistics.Filters;
using Accord.Math;
using AForge;
using Accord.Statistics;
using System.Threading;
/// <summary>
///   Random Forest.
/// </summary>
///
/// <remarks>
/// <para>
///   Represents a random forest of <see cref="DecisionTree"/>s. For
///   sample usage and example of learning, please see the documentation
///   page for <see cref="RandomForestLearning"/>.</para>
/// <para>
///   The parallelization options are excluded from serialization; a fresh
///   <see cref="System.Threading.Tasks.ParallelOptions"/> instance is assigned
///   by the deserialization callback of this class.</para>
/// </remarks>
///
/// <seealso cref="DecisionTree"/>
/// <seealso cref="RandomForestLearning"/>
///
[Serializable]
public class RandomForest : MulticlassClassifierBase, IParallel
{
    private DecisionTree[] trees;

    // Excluded from serialization; OnDeserializingMethod assigns a new
    // instance whenever a forest is deserialized.
    [NonSerialized]
    private ParallelOptions parallelOptions;

    /// <summary>
    ///   Gets the trees in the random forest.
    /// </summary>
    ///
    public DecisionTree[] Trees
    {
        get { return trees; }
    }

    /// <summary>
    ///   Gets the number of classes that can be recognized
    ///   by this random forest.
    /// </summary>
    ///
    [Obsolete("Please use NumberOfOutputs instead.")]
    public int Classes { get { return NumberOfOutputs; } }

    /// <summary>
    ///   Gets or sets the parallelization options for this algorithm.
    /// </summary>
    ///
    public ParallelOptions ParallelOptions
    {
        get { return parallelOptions; }
        set { parallelOptions = value; }
    }

    /// <summary>
    ///   Gets or sets a cancellation token that can be used
    ///   to cancel the algorithm while it is running. The token
    ///   is stored in <see cref="ParallelOptions"/>.
    /// </summary>
    ///
    public CancellationToken Token
    {
        get { return ParallelOptions.CancellationToken; }
        set { ParallelOptions.CancellationToken = value; }
    }

    /// <summary>
    ///   Creates a new random forest.
    /// </summary>
    ///
    /// <param name="trees">The number of trees in the forest.</param>
    /// <param name="classes">The number of classes in the classification problem.</param>
    ///
    public RandomForest(int trees, int classes)
    {
        this.trees = new DecisionTree[trees];
        this.NumberOfOutputs = classes;
        this.ParallelOptions = new ParallelOptions();
    }

    /// <summary>
    ///   Computes the decision output for a given input vector.
    /// </summary>
    ///
    /// <param name="data">The input vector.</param>
    ///
    /// <returns>The forest decision for the given vector.</returns>
    ///
    [Obsolete("Please use Decide() instead.")]
    public int Compute(double[] data)
    {
        return Decide(data);
    }

    /// <summary>
    ///   Computes a class-label decision for a given <paramref name="input" />.
    /// </summary>
    /// <param name="input">The input vector that should be classified into
    ///   one of the <see cref="ITransform.NumberOfOutputs" /> possible classes.</param>
    /// <returns>A class-label that best described <paramref name="input" /> according
    ///   to this classifier: every tree casts one vote and the result of
    ///   <c>ArgMax</c> over the vote counts is returned.</returns>
    public override int Decide(double[] input)
    {
        // One vote counter per class.
        int[] responses = new int[NumberOfOutputs];

        Parallel.For(0, trees.Length, ParallelOptions, i =>
        {
            int j = trees[i].Decide(input);

            // Trees are evaluated concurrently, so the shared
            // counters must be incremented atomically.
            Interlocked.Increment(ref responses[j]);
        });

        return responses.ArgMax();
    }

    /// <summary>
    ///   Serialization callback that assigns a new <see cref="ParallelOptions"/>
    ///   instance, since the backing field is marked as non-serialized.
    /// </summary>
    ///
    /// <param name="context">The streaming context supplied by the serializer (unused).</param>
    ///
    [OnDeserializing()]
    internal void OnDeserializingMethod(StreamingContext context)
    {
        this.ParallelOptions = new ParallelOptions();
    }
}
}
namespace Accord.MachineLearning.DecisionTrees
{
使用制度;
使用System.Collections.Generic;
使用System.Linq;
使用系统文本;
使用System.Threading.Tasks;
使用系统数据;
使用System.Runtime.Serialization;
使用System.Runtime.Serialization.Formatters.Binary;
使用System.IO;
使用Accord.Statistics.Filters;
使用Accord.Math;
使用冲锋枪;
采用一致性统计;
使用系统线程;
///
///随机森林。
///
///
///
///
///表示s.的随机林
///示例用法和学习示例,请参阅文档
///第页,共页。
///
///
///
///
///
[可序列化]
公共类RandomForest:多类分类数据库,IParallel
{
私有决策树[]树;
**[非串行化]
私人平行期权**
///
///获取随机林中的树。
///
///
公共决策树
{
获取{返回树;}
}
///
///获取可以识别的类的数目
///在这片随机的森林旁。
///
///
[过时(“请改用NumberOfOutputs。”)]
公共int类{get{returnnumberofoutputs;}}
///
///获取或设置此算法的并行化选项。
///
///
**公共ParallelOptions ParallelOptions{get{return ParallelOptions;}set{ParallelOptions=value;}}**
///
///获取或设置可使用的取消令牌
///在算法运行时取消该算法。
///
///
公共取消令牌
{
获取{return ParallelOptions.CancellationToken;}
设置{ParallelOptions.CancellationToken=value;}
}
///
///创建一个新的随机林。
///
///
///森林中树木的数量。
///分类问题中的类数。
///
公共林(int树,int类)
{
this.trees=新决策树[树];
this.NumberOfOutputs=类;
this.ParallelOptions=新的ParallelOptions();
}
///
///计算给定输入向量的决策输出。
///
///
///输入向量。
///
///给定向量的森林决策。
///
[过时(“请改用decise())]
公共整数计算(双[]数据)
{
返回决定(数据);
}
///
///计算给定对象的类标签决策。
///
///应分类为的输入向量
///一个可能的类。
///最好根据以下内容描述的类标签:
///这个分类器。
公共覆盖整数决定(双[]输入)
{
int[]responses=新的int[NumberOfOutputs];
Parallel.For(0,trees.Length,ParallelOptions,i)