Nlp Weka-朴素贝叶斯总是给出边界结果_Nlp_Weka_Text Classification

Nlp Weka-朴素贝叶斯总是给出边界结果

nlp

Nlp Weka-朴素贝叶斯总是给出边界结果,nlp,weka,text-classification,Nlp,Weka,Text Classification,我正试图用朴素贝叶斯（naivebayes）在Weka中编写一个文本分类器。我收集了一组Foursquare提示作为培训数据，其中有近500个提示在excel文件中标记为正，几乎相同的提示标记为负。输入文件有两列，第一列是提示文本，第二列是标记的极性。我正在使用AFINN-111.txt添加一个属性来增强输出。它计算该提示中的所有极性单词，并给出所有单词的最终分数。这是我的全部代码： public class DataReader { static Map<String,

我正试图用朴素贝叶斯（naivebayes）在Weka中编写一个文本分类器。我收集了一组Foursquare提示作为培训数据，其中有近500个提示在excel文件中标记为正，几乎相同的提示标记为负。输入文件有两列，第一列是提示文本，第二列是标记的极性。我正在使用AFINN-111.txt添加一个属性来增强输出。它计算该提示中的所有极性单词，并给出所有单词的最终分数。这是我的全部代码：

    public class DataReader {

    static Map<String, Integer> affinMap = new HashMap<String, Integer>();

    public List<List<Object>> createAttributeList() {
        ClassLoader classLoader = getClass().getClassLoader();
        initializeAFFINMap(classLoader);
        File inputWorkbook = new File(classLoader
                .getResource("Tip_dataset2.xls").getFile());
        Workbook w;
        Sheet sheet = null;
        try {
            w = Workbook.getWorkbook(inputWorkbook);
            // Get the first sheet
            sheet = w.getSheet(0);
        } catch (Exception e) {
            e.printStackTrace();
        }
        List<List<Object>> attributeList = new ArrayList<List<Object>>();
        for (int i = 1; i < sheet.getRows(); i++) {
            String tip = sheet.getCell(0, i).getContents();

            tip = tip.replaceAll("'", "");
            tip = tip.replaceAll("\"", "");
            tip = tip.replaceAll("%", " percent");
            tip = tip.replaceAll("@", " ATAUTHOR");
            String polarity = getPolarity(sheet.getCell(1, i).getContents());
            int affinScore = 0;
            String[] arr = tip.split(" ");
            for (int j = 0; j < arr.length; j++) {
                if (affinMap.containsKey(arr[j].toLowerCase())) {
                    affinScore = affinScore
                            + affinMap.get(arr[j].toLowerCase());
                }
            }
            List<Object> attrs = new ArrayList<Object>();
            attrs.add(tip);
            attrs.add(affinScore);
            attrs.add(polarity);

            attributeList.add(attrs);
        }
        return attributeList;
    }

    private String getPolarity(String cell) {
        if (cell.equalsIgnoreCase("positive")) {
            return "positive";
        } else {
            return "negative";
        }
    }

    private void initializeAFFINMap(ClassLoader classLoader) {
        try {
            InputStream stream = classLoader
                    .getResourceAsStream("AFINN-111.txt");
            DataInputStream in = new DataInputStream(stream);
            BufferedReader br = new BufferedReader(new InputStreamReader(in));
            String str;
            while ((str = br.readLine()) != null) {
                String[] array = str.split("\t");
                affinMap.put(array[0], Integer.parseInt(array[1]));
            }
            in.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        List<List<Object>> attrList=new DataReader().createAttributeList();
        new CreateTrainedModel().createTrainingData(attrList);
    }

}

公共类数据读取器{
静态映射仿射映射=新HashMap（）；
公共列表createAttributeList（）{
ClassLoader ClassLoader=getClass（）.getClassLoader（）；
初始化AffinMap（类加载器）；
文件输入工作簿=新文件（类加载器
.getResource（“Tip_dataset2.xls”）.getFile（）；
工作手册w；
Sheet=null；
试一试{
w=Workbook.getWorkbook（inputWorkbook）；
//拿到第一张
图纸=w.getSheet（0）；
}捕获（例外e）{
e、 printStackTrace（）；
}
List attributeList=新建ArrayList（）；
对于（int i=1；i


以下是实际的分类器类：
public class CreateTrainedModel {

    public void createTrainingData(List<List<Object>> attrList)
            throws Exception {

        Attribute tip = new Attribute("tip", (FastVector) null);
        Attribute affin = new Attribute("affinScore");

        FastVector pol = new FastVector(2);
        pol.addElement("positive");
        pol.addElement("negative");
        Attribute polaritycl = new Attribute("polarity", pol);

        FastVector inputDataDesc = new FastVector(3);
        inputDataDesc.addElement(tip);
        inputDataDesc.addElement(affin);
        inputDataDesc.addElement(polaritycl);

        Instances dataSet = new Instances("dataset", inputDataDesc,
                attrList.size());
        // Set class index
        dataSet.setClassIndex(2);

        for (List<Object> onList : attrList) {
            Instance in = new Instance(3);
            in.setValue((Attribute) inputDataDesc.elementAt(0), onList.get(0)
                    .toString());
            in.setValue((Attribute) inputDataDesc.elementAt(1),
                    Integer.parseInt(onList.get(1).toString()));
            in.setValue((Attribute) inputDataDesc.elementAt(2), onList.get(2)
                    .toString());

            dataSet.add(in);
        }

        Filter f = new StringToWordVector();
        f.setInputFormat(dataSet);
        dataSet = Filter.useFilter(dataSet, f);

        Classifier model = (Classifier) new NaiveBayes();
        try {
            model.buildClassifier(dataSet);
        } catch (Exception e1) { // TODO Auto-generated catch block
            e1.printStackTrace();
        }

        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(
                "FS-TipsNaiveBayes.model"));
        oos.writeObject(model);
        oos.flush();
        oos.close();

        FastVector fvWekaAttributes1 = new FastVector(3);
        fvWekaAttributes1.addElement(tip);
        fvWekaAttributes1.addElement(affin);

        Instance in = new Instance(3);
        in.setValue((Attribute) fvWekaAttributes1.elementAt(0),
                "burger here is good");
        in.setValue((Attribute) fvWekaAttributes1.elementAt(1), 0);

        Instances testSet = new Instances("dataset", fvWekaAttributes1, 1);
        in.setDataset(testSet);

        double[] fDistribution = model.distributionForInstance(in);
        System.out.println(fDistribution);

    }

}

public类CreateTrainedModel{
public void createTrainingData（列表属性列表）
抛出异常{
属性tip=新属性（“tip”，（FastVector）null）；
属性仿射=新属性（“仿射分数”）；
FastVector pol=新的FastVector（2）；
pol.addElement（“正数”）；
pol.addElement（“负数”）；
属性极性周期=新属性（“极性”，pol）；
FastVector inputDataDesc=新的FastVector（3）；
输入数据描述添加元素（tip）；
inputDataDesc.addElement（仿射）；
输入数据描述加法器（polaritycl）；
实例数据集=新实例（“数据集”，inputDataDesc，
attrList.size（））；
//集合类索引
dataSet.setClassIndex（2）；
对于（仅列表：属性列表）{
实例in=新实例（3）；
在.setValue（（属性）inputDataDesc.elementAt（0），onList.get（0）
.toString（））；
在.setValue（（属性）inputDataDesc.elementAt（1）中，
Integer.parseInt（onList.get（1.toString（））；
在.setValue（（属性）inputDataDesc.elementAt（2），onList.get（2）
.toString（））；
dataSet.add（in）；
}
过滤器f=新的StringToOrdVector（）；
f、 setInputFormat（数据集）；
dataSet=Filter.useFilter（数据集，f）；
分类器模型=（分类器）新朴素贝叶斯（）；
试一试{
构建分类器（数据集）；
}catch（异常e1）{//TODO自动生成的catch块
e1.printStackTrace（）；
}
ObjectOutputStream oos=新的ObjectOutputStream（新文件OutputStream(
“FS Tipsbayes.model”）；
oos.writeObject（模型）；
oos.flush（）；
oos.close（）；
FastVector fvWekaAttributes1=新的FastVector（3）；
fvWekaAttributes1.附加元素（tip）；
fvWekaAttributes1.加法（仿射）；
实例in=新实例（3）；
在.setValue（（属性）fvWekaAttributes1.elementAt（0）中，
“这里的汉堡很好”）；
in.setValue（（属性）fvWekaAttributes1.elementAt（1），0）；
实例测试集=新实例（“数据集”，fvWekaAttributes1，1）；
in.setDataset（testSet）；
double[]fddistribution=model.distribution例如（in）；
System.out.println（fddistribution）；
}
}

我面临的问题是，对于任何输入，输出分布总是在[0.52314376998377，0.47685623001622995]的范围内
。而且它总是更倾向于正数而不是负数。这些数字变化不大。你知道我做错了什么吗？
我没有读你的代码，但有一点我可以说，仿射分数是在一定范围内标准化的。如果y