Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/csharp/308.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C# 基于贝叶斯的文本分类_C#_Machine Learning_Naivebayes_Accord.net - Fatal编程技术网

C# 基于贝叶斯的文本分类

C# 基于贝叶斯的文本分类,c#,machine-learning,naivebayes,accord.net,C#,Machine Learning,Naivebayes,Accord.net,我试图按类别对一系列文本示例新闻进行分类。我有一个巨大的新闻文本数据集,数据库中有分类。机器应经过培训并确定新闻类别 public static string[] Tokenize(string text) { StringBuilder sb = new StringBuilder(text); char[] invalid = "!-;':'\",.?\n\r\t".ToCharArray(); for (int i = 0

我想按类别对一系列文本(例如新闻)进行分类。我有一个庞大的新闻文本数据集,数据库中每条新闻都已标注类别。模型应先用这些数据进行训练,然后预测新闻所属的类别。

    /// <summary>
    /// Splits <paramref name="text"/> into tokens, treating the space character,
    /// common punctuation marks, and line-break/tab characters as delimiters.
    /// </summary>
    /// <param name="text">Raw text to tokenize. <c>null</c> or empty yields no tokens.</param>
    /// <returns>The tokens in order of appearance; never contains empty entries.</returns>
    public static string[] Tokenize(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return new string[0];
        }

        // Every character that separates tokens. Splitting on the whole set in one
        // pass yields the same tokens as first replacing each punctuation character
        // with a space and then splitting on spaces.
        char[] delimiters = " !-;':\",.?\n\r\t".ToCharArray();

        return text.Split(delimiters, System.StringSplitOptions.RemoveEmptyEntries);
    }
    /// <summary>
    /// Form load handler: reads the NewsRepository table from the Access database,
    /// codifies the NewsTitle and Category columns, and passes the resulting
    /// arrays to <c>bagOfWords</c> for training.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Form1_Load(object sender, EventArgs e)
    {
        string strDSN = "Provider=Microsoft.ACE.OLEDB.12.0;Data Source = c:\\users\\158820\\Documents\\Database4.accdb";
        string strSQL = "SELECT * FROM NewsRepository";

        DataSet dtSet = new DataSet();

        // Dispose the connection and adapter even when Fill throws. Fill opens a
        // closed connection itself and closes it again when it is done, so no
        // explicit Open/Close calls are needed.
        using (OleDbConnection myConn = new OleDbConnection(strDSN))
        using (OleDbDataAdapter adapter = new OleDbDataAdapter(strSQL, myConn))
        {
            adapter.Fill(dtSet, "NewsRepository");
        }

        DataTable dTable = dtSet.Tables[0];

        // NOTE(review): each NewsTitle value is codified as a whole, so the model
        // is trained on one symbol per title rather than on individual words.
        // Tokenize() output is not part of the training input — confirm whether
        // per-word features were intended.
        Codification codebook = new Codification(dTable, new string[] { "NewsTitle", "Category" });
        DataTable symbols = codebook.Apply(dTable);
        int[][] inputs = symbols.ToJagged<int>(new string[] { "NewsTitle" });
        int[] outputs = symbols.ToArray<int>("Category");

        bagOfWords(inputs, outputs);
    }


    /// <summary>
    /// Learns a bag-of-words model from <paramref name="inputs"/>, trains a
    /// multiclass SVM with a ChiSquare kernel on the resulting histograms, and
    /// saves both objects as files under <c>Application.StartupPath</c>.
    /// </summary>
    /// <param name="inputs">Integer-coded input sequences, one array per sample.</param>
    /// <param name="outputs">Integer class label for each sample, parallel to <paramref name="inputs"/>.</param>
    private static void bagOfWords(int[][] inputs, int[] outputs)
    {
        // Learn the bag-of-words model and persist it so the same model can be
        // loaded again later.
        var bow = new BagOfWords<int>();
        var quantizer = bow.Learn(inputs);
        string filenamebow = Path.Combine(Application.StartupPath, "News_BOW.accord");
        Serializer.Save(obj: bow, path: filenamebow);

        // Convert every input sequence into its histogram feature vector.
        double[][] histograms = quantizer.Transform(inputs);

        // Multiclass SVM learner; each underlying machine is trained with SMO
        // using a ChiSquare kernel and the learner's default settings.
        var teacher = new MulticlassSupportVectorLearning<ChiSquare, double[]>()
        {
            Learner = (p) => new SequentialMinimalOptimization<ChiSquare, double[]>()
        };

        // Train the multiclass SVM on the histograms.
        var svm = teacher.Learn(histograms, outputs);

        // Predict the training samples themselves. The accuracy below is therefore
        // an in-sample figure and says nothing about unseen data.
        int[] predicted = svm.Decide(histograms);

        // Named arguments are kept on purpose so the two int[] parameters cannot
        // be swapped by accident.
        var cm = new GeneralConfusionMatrix(predicted: predicted, expected: outputs);
        double accuracy = cm.Accuracy;

        // Surface the metric instead of computing and discarding it.
        System.Diagnostics.Debug.WriteLine(
            "Training-set accuracy: " + accuracy.ToString("P2", System.Globalization.CultureInfo.InvariantCulture));

        string filename = Path.Combine(Application.StartupPath, "News_SVM.accord");
        Serializer.Save(obj: svm, path: filename);
    }
公共静态字符串[]标记化(字符串文本)
{
StringBuilder sb=新的StringBuilder(文本);
字符[]无效=“!-;”:“\”,.?\n\r\t“.ToCharArray();
for(int i=0;i新的顺序最小优化()
{
//复杂性=100//创建一个硬SVM
}
};
//使用教师学习多标签SVM
var svm=教师学习(直方图、输出);
//获取输入的预测
int[]predicted=svm.decision(直方图);
//创建混淆矩阵以检查预测的质量:
var cm=新的GeneralConfusionMatrix(预测:预测,预期:输出);
//检查测量的准确性:
双精度=厘米精度;
字符串文件名=Path.Combine(Application.StartupPath,“News_SVM.accord”);
Save(obj:svm,路径:filename);
}
我对如何训练 Accord.NET 的对象有点困惑。我已经能够序列化训练好的模型(训练数据为 9 个类别、3600 条不重复的新闻,模型文件大小约为 106 MB)。


如何使用该模型预测一组新的新闻文本的类别?

对不在训练集中的数据使用模型非常简单,只需再次调用 svm 的 Decide 方法即可:

svm.Decide(outofSampleData)

由于您已经序列化了经过训练的模型,您可以使用
Serializer.Load
来实例化 svm 对象,该方法在官方文档中有说明。

对不在训练集中的数据使用您的模型就像调用您的svm来做另一个决定一样简单:

svm.Decide(outofSampleData)
由于您已经序列化了经过训练的模型,因此可以使用文档化的
Serializer.Load
实例化svm对象