Java 简单的 Mahout 分类示例

我想用 Mahout 训练一个分类器。我的文档来自数据库，我并不想先把它们存成文件再交给 Mahout 去训练。我参考了 MIA（Mahout in Action）的源代码，并改写出下面这段完成最基本训练任务的代码。Mahout 示例的常见问题在于：它们要么演示如何在命令行下用 20 Newsgroups 数据集操作 Mahout，要么代码严重依赖 Hadoop、ZooKeeper 等组件。如果有人能看看我的代码，或者给我推荐一个讲解"如何训练模型、然后使用它"的简单教程，我将非常感激。

目前的问题是：在下面的代码中，`if (best != null)` 这个判断永远无法通过，因为 `learningAlgorithm.getBest()` 总是返回 null。

很抱歉贴出了整段代码，但我没有看到其他办法。
public class Classifier {

    /** Dimensionality of the hashed feature vector. */
    private static final int FEATURES = 10000;
    /** Hashed text encoder for the body of each note. */
    private static final TextValueEncoder encoder = new TextValueEncoder("body");
    /** Constant bias (intercept) term added to every feature vector. */
    private static final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    private static final String[] LEAK_LABELS = {"none", "month-year", "day-month-year"};

    /**
     * Trains an {@code AdaptiveLogisticRegression} classifier on a small
     * in-memory set of "good"/"bad" product notes and periodically writes the
     * best model found so far to disk.
     *
     * @param args the command line arguments (unused)
     * @throws Exception if model serialization fails
     */
    public static void main(String[] args) throws Exception {
        int leakType = 0;
        AdaptiveLogisticRegression learningAlgorithm =
                new AdaptiveLogisticRegression(20, FEATURES, new L1());
        // FIX: with the default settings the adaptive learner only promotes a
        // "best" candidate after a large evaluation interval (on the order of
        // 1000 training examples), so with only six examples getBest() always
        // returned null and the (best != null) branch was never taken.
        // Shrinking the interval and averaging window makes a best learner
        // available almost immediately.
        learningAlgorithm.setInterval(1);
        learningAlgorithm.setAveragingWindow(1);

        Dictionary newsGroups = new Dictionary();
        ListMultimap<String, String> noteBySection = LinkedListMultimap.create();
        noteBySection.put("good", "I love this product, the screen is a pleasure to work with and is a great choice for any business");
        noteBySection.put("good", "What a product!! Really amazing clarity and works pretty well");
        noteBySection.put("good", "This product has good battery life and is a little bit heavy but I like it");
        noteBySection.put("bad", "I am really bored with the same UI, this is their 5th version(or fourth or sixth, who knows) and it looks just like the first one");
        noteBySection.put("bad", "The phone is bulky and useless");
        noteBySection.put("bad", "I wish i had never bought this laptop. It died in the first year and now i am not able to return it");

        encoder.setProbes(2);

        double step = 0;
        int[] bumps = {1, 2, 5};   // 1-2-5-10-20-50... reporting schedule
        double averageCorrect = 0;
        double averageLL = 0;
        int k = 0;                 // number of examples trained so far

        for (String key : noteBySection.keySet()) {
            System.out.println(key);
            int actual = newsGroups.intern(key);   // numeric label for this section
            for (String note : noteBySection.get(key)) {
                Vector v = encodeFeatureVector(note);
                learningAlgorithm.train(actual, v);
                k++;

                // Checkpoint/report cadence follows a logarithmic 1-2-5 schedule.
                int bump = bumps[(int) Math.floor(step) % bumps.length];
                int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

                State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best =
                        learningAlgorithm.getBest();
                double maxBeta;
                double nonZeros;
                double positive;
                double norm;
                double lambda = 0;
                double mu = 0;
                if (best != null) {
                    CrossFoldLearner learner = best.getPayload().getLearner();
                    averageCorrect = learner.percentCorrect();
                    averageLL = learner.logLikelihood();

                    OnlineLogisticRegression model = learner.getModels().get(0);
                    // finish off pending regularization before inspecting beta
                    model.close();
                    Matrix beta = model.getBeta();
                    maxBeta = beta.aggregate(Functions.MAX, Functions.ABS);
                    // number of coefficients that are effectively non-zero
                    nonZeros = beta.aggregate(Functions.PLUS, new DoubleFunction() {
                        @Override
                        public double apply(double v) {
                            return Math.abs(v) > 1.0e-6 ? 1 : 0;
                        }
                    });
                    // number of strictly positive coefficients
                    positive = beta.aggregate(Functions.PLUS, new DoubleFunction() {
                        @Override
                        public double apply(double v) {
                            return v > 0 ? 1 : 0;
                        }
                    });
                    norm = beta.aggregate(Functions.PLUS, Functions.ABS);   // L1 norm
                    lambda = best.getMappedParams()[0];
                    mu = best.getMappedParams()[1];
                } else {
                    maxBeta = 0;
                    nonZeros = 0;
                    positive = 0;
                    norm = 0;
                }

                System.out.println(k % (bump * scale));
                if (k % (bump * scale) == 0) {
                    if (best != null) {
                        System.out.println("----------------------------");
                        ModelSerializer.writeBinary("c:/tmp/news-group-" + k + ".model",
                                best.getPayload().getLearner().getModels().get(0));
                    }
                    step += 0.25;
                    System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\t%.8g\t%.8g\t", maxBeta, nonZeros, positive, norm, lambda, mu);
                    System.out.printf("%d\t%.3f\t%.2f\t%s\n",
                            k, averageLL, averageCorrect * 100, LEAK_LABELS[leakType % 3]);
                }
            }
        }
        learningAlgorithm.close();
    }

    /**
     * Encodes a text note as a hashed sparse feature vector with a constant
     * bias term.
     *
     * <p>NOTE(review): the shared static {@code encoder} accumulates text
     * until {@code flush} is called, so this method is not thread-safe.
     */
    private static Vector encodeFeatureVector(String text) {
        encoder.addText(text.toLowerCase());
        Vector v = new RandomAccessSparseVector(FEATURES);
        bias.addToVector((byte[]) null, 1, v);
        encoder.flush(1, v);
        return v;
    }
}
（注：此处原本是上面那段 Java 代码的一份机器翻译副本——标识符被逐词译成了中文，如 "数学地板" 对应 `Math.floor`，且在结尾处被截断，已无法阅读，故略去。副本末尾残留的两行恰好就是该问题的解决办法：在构造 `AdaptiveLogisticRegression` 之后调用

    learningAlgorithm.setInterval(1);
    learningAlgorithm.setAveragingWindow(1);

默认的评估间隔远大于这里仅有的 6 条训练样本，因此 `getBest()` 一直返回 null；把间隔和平均窗口调小之后，`getBest()` 很快就会返回非 null 的最佳学习器。）