Java 简单的Mahout分类示例

Java 简单的Mahout分类示例,java,mahout,document-classification,Java,Mahout,Document Classification,我想训练Mahout进行分类。对我来说,文档来自数据库,我并不想把它们先存储到文件中再供Mahout训练。我查看了MIA(Mahout in Action)的源代码,并修改了下面的代码,以完成非常基础的训练任务。Mahout示例的常见问题是,它们要么演示如何在命令行提示符下用20 Newsgroups数据集使用Mahout,要么代码严重依赖Hadoop、Zookeeper等。如果有人能看看我的代码,或者给我介绍一个非常简单的教程,说明如何训练模型然后使用它,我将非常感激。到目前为止,在下面的代码中,我永远无法通过 if (best != null) 这一判断,因为 learningAlgorithm.getBest() 总是返回 null。

我想训练驯服员进行分类。对我来说,这篇文章来自数据库,我真的不想把它们存储到文件中,以供mahout培训。我检查了MIA源代码,并更改了以下代码,以完成非常基本的培训任务。 mahout示例的常见问题是,它们显示了如何使用20新闻组从cmd提示符使用mahout,或者代码非常依赖Hadoop Zookeeper等。如果有人能看看我的代码,或者给我介绍一个非常简单的教程,说明如何训练模型,然后使用它,我将非常感激

到目前为止,在下面的代码中,我永远无法通过 if (best != null) 这一判断,因为 learningAlgorithm.getBest() 总是返回 null。

很抱歉发布了整个代码,但没有看到任何其他选项

public class Classifier {

    /** Dimensionality of the hashed feature vectors. */
    private static final int FEATURES = 10000;
    /** Hashes free text into the feature vector under the "body" field. */
    private static final TextValueEncoder encoder = new TextValueEncoder("body");
    /** Constant intercept (bias) term added to every feature vector. */
    private static final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    private static final String[] LEAK_LABELS = {"none", "month-year", "day-month-year"};

    /**
     * Trains an {@link AdaptiveLogisticRegression} classifier on a small
     * in-memory corpus of "good"/"bad" review snippets, printing training
     * diagnostics on a logarithmic schedule and periodically serializing the
     * best model found so far.
     *
     * @param args the command line arguments (unused)
     * @throws Exception if model serialization fails
     */
    public static void main(String[] args) throws Exception {
        int leakType = 0;
        AdaptiveLogisticRegression learningAlgorithm =
                new AdaptiveLogisticRegression(20, FEATURES, new L1());
        // FIX: by default the adaptive learner only evaluates its candidate
        // models every few hundred training examples, so with only six
        // documents getBest() always returns null. Force an evaluation after
        // every single example so a best model is available immediately.
        learningAlgorithm.setInterval(1);
        learningAlgorithm.setAveragingWindow(1);

        Dictionary newsGroups = new Dictionary();
        ListMultimap<String, String> noteBySection = LinkedListMultimap.create();
        noteBySection.put("good", "I love this product, the screen is a pleasure to work with and is a great choice for any business");
        noteBySection.put("good", "What a product!! Really amazing clarity and works pretty well");
        noteBySection.put("good", "This product has good battery life and is a little bit heavy but I like it");

        noteBySection.put("bad", "I am really bored with the same UI, this is their 5th version(or fourth or sixth, who knows) and it looks just like the first one");
        noteBySection.put("bad", "The phone is bulky and useless");
        noteBySection.put("bad", "I wish i had never bought this laptop. It died in the first year and now i am not able to return it");

        encoder.setProbes(2);
        double step = 0;
        // Reporting schedule: report at k = 1, 2, 5, 10, 20, 50, 100, ...
        int[] bumps = {1, 2, 5};
        double averageCorrect = 0;
        double averageLL = 0;
        int k = 0;
        for (String key : noteBySection.keySet()) {
            System.out.println(key);
            for (String note : noteBySection.get(key)) {
                // Map the section label to a stable integer category id.
                int actual = newsGroups.intern(key);
                Vector v = encodeFeatureVector(note);
                learningAlgorithm.train(actual, v);

                k++;
                int bump = bumps[(int) Math.floor(step) % bumps.length];
                int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
                State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
                double maxBeta;
                double nonZeros;
                double positive;
                double norm;

                double lambda = 0;
                double mu = 0;
                if (best != null) {
                    CrossFoldLearner state = best.getPayload().getLearner();
                    averageCorrect = state.percentCorrect();
                    averageLL = state.logLikelihood();

                    OnlineLogisticRegression model = state.getModels().get(0);
                    // finish off pending regularization before inspecting coefficients
                    model.close();

                    Matrix beta = model.getBeta();
                    maxBeta = beta.aggregate(Functions.MAX, Functions.ABS);
                    // Count coefficients that are effectively non-zero.
                    nonZeros = beta.aggregate(Functions.PLUS, new DoubleFunction() {
                        @Override
                        public double apply(double coefficient) {
                            return Math.abs(coefficient) > 1.0e-6 ? 1 : 0;
                        }
                    });
                    // Count strictly positive coefficients.
                    positive = beta.aggregate(Functions.PLUS, new DoubleFunction() {
                        @Override
                        public double apply(double coefficient) {
                            return coefficient > 0 ? 1 : 0;
                        }
                    });
                    norm = beta.aggregate(Functions.PLUS, Functions.ABS);

                    // Reuse the State fetched above rather than calling getBest() again.
                    lambda = best.getMappedParams()[0];
                    mu = best.getMappedParams()[1];
                } else {
                    maxBeta = 0;
                    nonZeros = 0;
                    positive = 0;
                    norm = 0;
                }
                if (k % (bump * scale) == 0) {
                    if (best != null) {
                        System.out.println("----------------------------");
                        ModelSerializer.writeBinary("c:/tmp/news-group-" + k + ".model",
                                best.getPayload().getLearner().getModels().get(0));
                    }

                    step += 0.25;
                    System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\t%.8g\t%.8g\t", maxBeta, nonZeros, positive, norm, lambda, mu);
                    System.out.printf("%d\t%.3f\t%.2f\t%s\n",
                            k, averageLL, averageCorrect * 100, LEAK_LABELS[leakType % 3]);
                }
            }

        }
        learningAlgorithm.close();
    }

    /**
     * Encodes a free-text document as a sparse hashed feature vector of size
     * {@link #FEATURES}, including the constant intercept term.
     */
    private static Vector encodeFeatureVector(String text) {
        encoder.addText(text.toLowerCase());
        Vector v = new RandomAccessSparseVector(FEATURES);
        bias.addToVector((byte[]) null, 1, v);
        encoder.flush(1, v);
        return v;
    }
}
public class Classifier {

    private static final int FEATURES = 10000;
    private static final TextValueEncoder encoder = new TextValueEncoder("body");
    private static final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    private static final String[] LEAK_LABELS = {"none", "month-year", "day-month-year"};

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws Exception {
        int leakType = 0;
        AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1());
        Dictionary newsGroups = new Dictionary();
        ListMultimap<String, String> noteBySection = LinkedListMultimap.create();
        noteBySection.put("good", "I love this product, the screen is a pleasure to work with and is a great choice for any business");
        noteBySection.put("good", "What a product!! Really amazing clarity and works pretty well");
        noteBySection.put("good", "This product has good battery life and is a little bit heavy but I like it");

        noteBySection.put("bad", "I am really bored with the same UI, this is their 5th version(or fourth or sixth, who knows) and it looks just like the first one");
        noteBySection.put("bad", "The phone is bulky and useless");
        noteBySection.put("bad", "I wish i had never bought this laptop. It died in the first year and now i am not able to return it");

        encoder.setProbes(2);
        double step = 0;
        int[] bumps = {1, 2, 5};
        double averageCorrect = 0;
        double averageLL = 0;
        int k = 0;
        for (String key : noteBySection.keySet()) {
            System.out.println(key);
            List<String> notes = noteBySection.get(key);
            for (Iterator<String> it = notes.iterator(); it.hasNext();) {
                String note = it.next();

                int actual = newsGroups.intern(key);
                Vector v = encodeFeatureVector(note);
                learningAlgorithm.train(actual, v);

                k++;
                int bump = bumps[(int) Math.floor(step) % bumps.length];
                int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
                State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
                double maxBeta;
                double nonZeros;
                double positive;
                double norm;

                double lambda = 0;
                double mu = 0;
                if (best != null) {
                    CrossFoldLearner state = best.getPayload().getLearner();
                    averageCorrect = state.percentCorrect();
                    averageLL = state.logLikelihood();

                    OnlineLogisticRegression model = state.getModels().get(0);
                    // finish off pending regularization
                    model.close();

                    Matrix beta = model.getBeta();
                    maxBeta = beta.aggregate(Functions.MAX, Functions.ABS);
                    nonZeros = beta.aggregate(Functions.PLUS, new DoubleFunction() {
                        @Override
                        public double apply(double v) {
                            return Math.abs(v) > 1.0e-6 ? 1 : 0;
                        }
                    });
                    positive = beta.aggregate(Functions.PLUS, new DoubleFunction() {
                        @Override
                        public double apply(double v) {
                            return v > 0 ? 1 : 0;
                        }
                    });
                    norm = beta.aggregate(Functions.PLUS, Functions.ABS);

                    lambda = learningAlgorithm.getBest().getMappedParams()[0];
                    mu = learningAlgorithm.getBest().getMappedParams()[1];
                } else {
                    maxBeta = 0;
                    nonZeros = 0;
                    positive = 0;
                    norm = 0;
                }
                if (k % (bump * scale) == 0) {
                    if (learningAlgorithm.getBest() != null) {
                        System.out.println("----------------------------");
                        ModelSerializer.writeBinary("c:/tmp/news-group-" + k + ".model",
                                learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
                    }

                    step += 0.25;
                    System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\t%.8g\t%.8g\t", maxBeta, nonZeros, positive, norm, lambda, mu);
                    System.out.printf("%d\t%.3f\t%.2f\t%s\n",
                            k, averageLL, averageCorrect * 100, LEAK_LABELS[leakType % 3]);
                }
            }

        }
        learningAlgorithm.close();
    }

    private static Vector encodeFeatureVector(String text) {
        encoder.addText(text.toLowerCase());
        Vector v = new RandomAccessSparseVector(FEATURES);
        bias.addToVector((byte[]) null, 1, v);
        encoder.flush(1, v);
        return v;
    }
}

解决方法(答案中的关键修复):在构造 learningAlgorithm 之后加入

learningAlgorithm.setInterval(1);
learningAlgorithm.setAveragingWindow(1);