C# 如何在Accord.Net中同步训练和测试码本_C#_Tree_Random Forest_Accord.net

C# 如何在Accord.Net中同步训练和测试码本

c# tree

C# 如何在Accord.Net中同步训练和测试码本,c#,tree,random-forest,accord.net,C#,Tree,Random Forest,Accord.net,问题：是否有将训练集和测试集分开的随机森林示例？我在 Accord.NET 机器学习测试项目中找到的现有示例使用完全相同的数据进行训练和测试。显然，我遇到的问题是如何让测试集和训练集生成的标签（int）保持一致。我按如下方式生成训练标签： int[] trainOutputs = trainCodebook.Translate("Output", trainLabels); And the test labels similarly: int[] testOutputs = testCodebook

问题：是否有将训练集和测试集分开的随机森林示例？我在 Accord.NET 机器学习测试项目中找到的现有示例使用完全相同的数据进行训练和测试。

显然，我遇到的问题是如何让测试集和训练集生成的标签（int）保持一致。我按如下方式生成训练标签：

int[] trainOutputs = trainCodebook.Translate("Output", trainLabels);

And the test labels similarly:

int[] testOutputs = testCodebook.Translate("Output", testLabels);

Finally I train with the train data and test with the test data:

var forest = teacher.Learn(trainVectors, trainOutputs);

int[] predicted = forest.Decide(testVectors);

除非训练集和测试集的前三行相同，否则两者生成的标签编码就会不同，从而导致非常高的错误率。

我尝试用三个类别的字符串手动创建码本：

new Codification("-1","0","1");

不幸的是，这会产生一个运行时错误，提示给定的键不在字典中。我确信有办法让两个独立的码本以一致的方式生成键。如果我在测试数据的开头添加三行包含全部三个键的训练数据，下面的代码就能正常工作。但这不是我想要的解决方案 ;=)

以下是我正在运行的整个测试：

 /// <summary>
 ///   Trains a random forest on the training resource and measures its
 ///   zero-one loss on the separate test resource.
 /// </summary>
 /// <remarks>
 ///   The label-to-integer mapping is learned once, from the training labels,
 ///   and that same codebook is used to encode the test labels. A second
 ///   codebook built from the test labels can assign different integers to
 ///   the same label string, which makes the expected and predicted class
 ///   indices incomparable and inflates the error rate.
 /// </remarks>
 [Test]
 public void test_learn()
 {
    // Fix the seed so the randomly grown forest is the same on every run.
    Accord.Math.Random.Generator.Seed = 1;

    /////////// TRAINING SET ///////////
    // Load the training set as rows of comma-separated text fields.
    string[][] text = Resources.train.Split(new[] { "\r\n" },
        StringSplitOptions.RemoveEmptyEntries).Apply(x => x.Split(','));

    // Column 0 holds the label; every remaining column is an input feature.
    int length = text[0].Length;
    List<int> columns = new List<int>();
    for (int i = 1; i < length; i++)
    {
        columns.Add(i);
    }
    int[] featureColumns = columns.ToArray();

    double[][] trainVectors = text.GetColumns(featureColumns).To<double[][]>();

    // The first column contains the expected ternary category (i.e. -1, 0, or 1)
    string[] trainLabels = text.GetColumn(0);
    var trainCodebook = new Codification("Output", trainLabels);
    int[] trainOutputs = trainCodebook.Translate("Output", trainLabels);

    ////////// TEST SET ////////////
    text = Resources.test.Split(new[] { "\r\n" },
        StringSplitOptions.RemoveEmptyEntries).Apply(x => x.Split(','));

    double[][] testVectors = text.GetColumns(featureColumns).To<double[][]>();
    string[] testLabels = text.GetColumn(0);

    // Encode the test labels with the TRAINING codebook so that both sets
    // share one label -> integer mapping.
    // NOTE(review): presumably this throws if the test set contains a label
    // that never occurs in the training set - confirm against the data.
    int[] testOutputs = trainCodebook.Translate("Output", testLabels);

    var teacher = new RandomForestLearning()
    {
        NumberOfTrees = 10,
    };

    var forest = teacher.Learn(trainVectors, trainOutputs);
    int[] predicted = forest.Decide(testVectors);

    // Print each prediction decoded back to its original label string.
    int lineNum = 1;
    foreach (int prediction in predicted)
    {
        Console.WriteLine("Prediction " + lineNum + ": "
            + trainCodebook.Translate("Output", prediction));
        lineNum++;
    }

    // Error rate on the held-out test vectors; reuses the predictions
    // computed above instead of classifying the test set a second time.
    double error = new ZeroOneLoss(testOutputs).Loss(predicted);

    Console.WriteLine("Error term is " + error);

    Assert.IsTrue(error < 0.20); // humble expectations ;-)
 }

[测试]
公共无效测试_learn（）
{
Accord.Math.Random.Generator.Seed=1；
///////////训练集///////////
//首先，让我们将训练集加载到可以处理的文本数组中
字符串[][]text=Resources.train.Split（新[]{“\r\n”}，
StringSplitOptions.RemoveEmptyEntries.Apply（x=>x.Split（'，'）；
int length=文本[0]。长度；
列表列=新列表（）；
for（int i=1；ix.Split（'，'）；
double[]testVectors=text.GetColumns（columns.ToArray（））.To（）；
string[]testLabels=text.GetColumn（0）；
var testCodebook=新编码（“输出”，testLabels）；
int[]testOutputs=testCodebook.Translate（“输出”，testLabels）；
var teacher=new RandomForestLearning（）
{
NumberOfTrees=10，
};
var forest=教师学习（培训向量、培训输出）；
int[]predicted=forest.decise（testVectors）；
int lineNum=1；
foreach（预测中的整数预测）
{
Console.WriteLine（“预测”+lineNum+”：“
+trainCodebook.Translate（“输出”，预测））；
lineNum++；
}
//我用测试向量来计算错误率
双重错误=新的ZeroOneLoss（testOutputs）.Loss（forest.Decise（testVectors））；
Console.WriteLine（“错误项为”+错误）；
Assert.IsTrue（错误<0.20）；//谦逊的期望；-）
}

好吧，我想出来了。请参阅下面的代码：

好吧，我想我能修好它。问题是DecisionTree中序列化的错误实现。幸运的是，我们有代码-请参阅下面的修复程序：

namespace Accord.MachineLearning.DecisionTrees
{
  using System;
  using System.Collections.Generic;
  using System.Linq;
  using System.Text;
  using System.Threading.Tasks;
  using System.Data;
  using System.Runtime.Serialization;
  using System.Runtime.Serialization.Formatters.Binary;
  using System.IO;
  using Accord.Statistics.Filters;
  using Accord.Math;
  using AForge;
  using Accord.Statistics;
  using System.Threading;


/// <summary>
///   Random Forest.
/// </summary>
/// 
/// <remarks>
/// <para>
///   Represents a random forest of <see cref="DecisionTree"/>s. For 
///   sample usage and example of learning, please see the documentation
///   page for <see cref="RandomForestLearning"/>.</para>
/// <para>
///   The parallelization options are not serialized; a fresh
///   <see cref="System.Threading.Tasks.ParallelOptions"/> instance is
///   assigned when the forest is being deserialized.</para>
/// </remarks>
/// 
/// <seealso cref="DecisionTree"/>
/// <seealso cref="RandomForestLearning"/>
/// 
[Serializable]
public class RandomForest : MulticlassClassifierBase, IParallel
{
    private DecisionTree[] trees;

    // Excluded from serialization; re-created in OnDeserializingMethod so
    // that a deserialized forest does not end up with a null options object.
    [NonSerialized]
    private ParallelOptions parallelOptions;


    /// <summary>
    ///   Gets the trees in the random forest.
    /// </summary>
    /// 
    public DecisionTree[] Trees
    {
        get { return trees; }
    }

    /// <summary>
    ///   Gets the number of classes that can be recognized
    ///   by this random forest.
    /// </summary>
    /// 
    [Obsolete("Please use NumberOfOutputs instead.")]
    public int Classes { get { return NumberOfOutputs; } }

    /// <summary>
    ///   Gets or sets the parallelization options for this algorithm.
    /// </summary>
    ///
    public ParallelOptions ParallelOptions
    {
        get { return parallelOptions; }
        set { parallelOptions = value; }
    }

    /// <summary>
    /// Gets or sets a cancellation token that can be used
    /// to cancel the algorithm while it is running.
    /// </summary>
    /// 
    public CancellationToken Token
    {
        get { return ParallelOptions.CancellationToken; }
        set { ParallelOptions.CancellationToken = value; }
    }

    /// <summary>
    ///   Creates a new random forest.
    /// </summary>
    /// 
    /// <param name="trees">The number of trees in the forest.</param>
    /// <param name="classes">The number of classes in the classification problem.</param>
    /// 
    public RandomForest(int trees, int classes)
    {
        this.trees = new DecisionTree[trees];
        this.NumberOfOutputs = classes;
        this.ParallelOptions = new ParallelOptions();
    }

    /// <summary>
    ///   Computes the decision output for a given input vector.
    /// </summary>
    /// 
    /// <param name="data">The input vector.</param>
    /// 
    /// <returns>The forest decision for the given vector.</returns>
    /// 
    [Obsolete("Please use Decide() instead.")]
    public int Compute(double[] data)
    {
        return Decide(data);
    }


    /// <summary>
    /// Computes a class-label decision for a given <paramref name="input" />.
    /// </summary>
    /// <param name="input">The input vector that should be classified into
    /// one of the <see cref="ITransform.NumberOfOutputs" /> possible classes.</param>
    /// <returns>A class-label that best described <paramref name="input" /> according
    /// to this classifier.</returns>
    public override int Decide(double[] input)
    {
        // One vote counter per class; each tree adds one vote for the class it picks.
        int[] responses = new int[NumberOfOutputs];
        Parallel.For(0, trees.Length, ParallelOptions, i =>
        {
            int j = trees[i].Decide(input);

            // Trees are evaluated concurrently, so the shared counter
            // must be incremented atomically.
            Interlocked.Increment(ref responses[j]);
        });

        // The class with the most votes is the forest's decision.
        return responses.ArgMax();
    }

    /// <summary>
    ///   Serialization callback that assigns a new
    ///   <see cref="System.Threading.Tasks.ParallelOptions"/> instance,
    ///   since the backing field is marked <see cref="NonSerializedAttribute"/>
    ///   and is therefore not restored from the serialized data.
    /// </summary>
    /// 
    /// <param name="context">The streaming context supplied by the serializer (unused).</param>
    /// 
    [OnDeserializing()]
    internal void OnDeserializingMethod(StreamingContext context)
    {
        this.ParallelOptions = new ParallelOptions();
    }
}
}

namespace Accord.MachineLearning.DecisionTrees
{
使用制度；
使用System.Collections.Generic；
使用System.Linq；
使用系统文本；
使用System.Threading.Tasks；
使用系统数据；
使用System.Runtime.Serialization；
使用System.Runtime.Serialization.Formatters.Binary；
使用System.IO；
使用Accord.Statistics.Filters；
使用Accord.Math；
使用冲锋枪；
采用一致性统计；
使用系统线程；
/// 
///随机森林。
/// 
/// 
/// 
/// 
///表示s.的随机林
///示例用法和学习示例，请参阅文档
///第页，共页。
/// 
/// 
/// 
/// 
/// 
[可序列化]
公共类RandomForest：多类分类数据库，IParallel
{
私有决策树[]树；
**[非串行化]
私人平行期权**
/// 
///获取随机林中的树。
/// 
/// 
公共决策树
{
获取{返回树；}
}
/// 
///获取可以识别的类的数目
///在这片随机的森林旁。
/// 
/// 
[过时（“请改用NumberOfOutputs。”）]
公共int类{get{returnnumberofoutputs；}}
/// 
///获取或设置此算法的并行化选项。
/// 
///
**公共ParallelOptions ParallelOptions{get{return ParallelOptions；}set{ParallelOptions=value；}}**
/// 
///获取或设置可使用的取消令牌
///在算法运行时取消该算法。
/// 
/// 
公共取消令牌
{
获取{return ParallelOptions.CancellationToken；}
设置{ParallelOptions.CancellationToken=value；}
}
/// 
///创建一个新的随机林。
/// 
/// 
///森林中树木的数量。
///分类问题中的类数。
/// 
公共林（int树，int类）
{
this.trees=新决策树[树]；
this.NumberOfOutputs=类；
this.ParallelOptions=新的ParallelOptions（）；
}
/// 
///计算给定输入向量的决策输出。
/// 
/// 
///输入向量。
/// 
///给定向量的森林决策。
/// 
[过时（“请改用decise（））]
公共整数计算（双[]数据）
{
返回决定（数据）；
}
/// 
///计算给定对象的类标签决策。
/// 
///应分类为的输入向量
///一个可能的类。
///最好根据以下内容描述的类标签：
///这个分类器。
公共覆盖整数决定（双[]输入）
{
int[]responses=新的int[NumberOfOutputs]；
Parallel.For（0，trees.Length，ParallelOptions，i）