Java 如何为支持向量机训练数据?
我想使用java ml来训练我的数据,以便对一些文档进行分类,我在做什么: 我有两个类别,每个类别有11000个文档。我总共有92199个功能,用于Java 如何为支持向量机训练数据?,java,classification,svm,libsvm,Java,Classification,Svm,Libsvm,我想使用java ml来训练我的数据,以便对一些文档进行分类,我在做什么: 我有两个类别,每个类别有11000个文档。我总共有92199个功能,用于信息增益-卡方检验-互信息-基尼,我使用其中20000个2列 所以我有22000个文档和20000个特性来训练数据,我发现每个文档和特性的交叉点,所以我有: 每个文档和功能的交集 不同:要素中存在但文档中不存在的数据 因此,我在一个文档2序列中发送交叉点,其tf_idf,与th_idf=0不同 我是如何做到这一点的: public void buil
信息增益-卡方检验-互信息-基尼
,我使用其中20000个2列
所以我有22000个文档和20000个特性来训练数据,我发现每个文档和特性的交叉点,所以我有:
每个文档和功能的交集
不同:要素中存在但文档中不存在的数据
因此,我在一个文档2序列中发送交叉点,其tf_idf
,与tf_idf=0
不同
我是如何做到这一点的:
/**
 * Builds the java-ml training dataset: for every document listed in the
 * metadata table, creates one SparseInstance whose values are the tf-idf
 * weights of the selected feature words that occur in the document (feature
 * words absent from the document get an explicit weight of 0.0), then
 * exports the dataset to "save.data".
 *
 * NOTE(review): assumes the fields dataset, neighbors, dbWeight and
 * features are initialised elsewhere in this class — confirm before use.
 */
public void buildDataset() {
    DBDocMeta dbDocMeta; // the table that contains documents
    dataset = new DefaultDataset();
    // Each neighbor contains a Document list for one category.
    // Neighbor<Integer>: document {index, tf_idf}; Neighbor<String>: {word, tf_idf}
    neighbors.add(new Neighbor<Integer>("cat1"));
    neighbors.add(new Neighbor<Integer>("cat2"));
    try {
        dbDocMeta = new DBDocMeta();
        Map<Long, String> docInfo = dbDocMeta.getDocInfo();
        documents:
        for (Long id : docInfo.keySet()) {
            String cat = docInfo.get(id);
            System.out.println("***********************************************");
            System.out.println("Available processors (cores): " + Runtime.getRuntime().availableProcessors());
            long freeMemory = Runtime.getRuntime().freeMemory();
            System.out.println("Free memory (bytes): " + freeMemory);
            if (freeMemory <= 500000000) {
                // Not enough headroom to keep building instances: save what we
                // have so far and stop. exportDataset may throw IOException,
                // which is handled by the catch clause below.
                System.out.println("memory problem occurred !!!");
                net.sf.javaml.tools.data.FileHandler.exportDataset(dataset, new File("dataset.data"));
                break; // directly inside the labelled loop, plain break suffices
            }
            long maxMemory = Runtime.getRuntime().maxMemory();
            System.out.println("Maximum memory (bytes): " + (maxMemory == Long.MAX_VALUE ? "no limit" : maxMemory));
            System.out.println("Total memory available to JVM (bytes): " + Runtime.getRuntime().totalMemory());
            System.out.println("category : " + cat);
            System.out.println("***********************************************");
            Document<String> doc1 = dbWeight.getNeighbors(id);
            Instance instance = new SparseInstance();
            instance.setClassValue(cat);
            if (doc1.getAttributes().isEmpty()) {
                continue; // document has no words at all: nothing to encode
            }
            for (Neighbor<Integer> neighbor : neighbors) {
                // Only the neighbor matching this document's category builds the instance.
                if (!neighbor.getCategory().equalsIgnoreCase(cat)) {
                    continue;
                }
                Set<String> intersectionWords = intersection(features, doc1.getAttributes().keySet());
                if (intersectionWords.isEmpty()) {
                    // Document shares no feature words: skip the whole document.
                    continue documents;
                }
                // Feature words present in the document keep their tf-idf weight ...
                HashSet<String> missing = new HashSet<String>(features);
                for (String word : intersectionWords) {
                    instance.put(dbWeight.getIndex(word), doc1.getAttributes().get(word));
                    missing.remove(word);
                }
                // ... feature words absent from the document get an explicit 0.0.
                for (String word : missing) {
                    instance.put(dbWeight.getIndex(word), 0.0);
                }
                dataset.add(instance);
                break; // at most one neighbor matches a document's category
            }
        }
    } catch (InterruptedException e) {
        // BUG FIX: restore the interrupt flag so callers can still observe
        // the interruption instead of having it silently swallowed.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    try {
        net.sf.javaml.tools.data.FileHandler.exportDataset(dataset, new File("save.data"));
        System.out.println("dataset has exported successfully");
    } catch (Exception e) {
        System.out.println("failed to export dataset");
        e.printStackTrace();
    }
}
/**
 * Returns a new set containing exactly the elements common to both argument
 * sets. Neither input set is modified. Copies the smaller set and retains
 * only members of the larger one, so the work is proportional to the size
 * of the smaller set.
 */
private static <A> Set<A> intersection(final Set<A> xs, final Set<A> ys) {
    final Set<A> smaller = (xs.size() <= ys.size()) ? xs : ys;
    final Set<A> larger = (smaller == xs) ? ys : xs;
    final HashSet<A> common = new HashSet<A>(smaller);
    common.retainAll(larger);
    return common;
}
public void buildDataset(){
DBDocMeta DBDocMeta;//包含文档的表
dataset=新的DefaultDataset();
add(新邻居(“cat1”);//每个邻居包含一个文档列表
add(新邻居(“cat2”);//邻居整数:文档{index,tf_idf}邻居字符串:{word,tf_idf}
试一试{
dbDocMeta=新的dbDocMeta();
Map docInfo=dbDocMeta.getDocInfo();
整数计数=1;
身份证件:
for(长id:docInfo.keySet()){
计数++;
字符串cat=docInfo.get(id);
System.out.println(“**********************************************************************”);
System.out.println("可用处理器(核心): " + Runtime.getRuntime().availableProcessors());
Long freeMemory = Runtime.getRuntime().freeMemory();
System.out.println("可用内存(字节): " + freeMemory);
如果 (freeMemory <= 500000000) { ...（此处代码在抓取时被截断）... }

我的尝试：
publicstaticvoidmain(字符串…arg){
BagoWords=prepareBOW(数据集);//提供数据集
编制意向书意向书清单(negData,“-1”);
编制实体实体清单(posData,“+1”);
}
公共列表prepareBOW(列表数据集){
BagoWords=新的ArrayList();
//迭代每一组数据/句子。
用于(字符串s:数据集){
字符串[]字=s.split(“”);
添加(“*&^(0”);
//在列表中添加句子/数据的每个单词。
for (int i = 0; i < words.length; i++) { ...（此处代码在抓取时被截断）... }
为什么不使用Instance.put
?它的研究目的..但是你能告诉我我应该把Instance.put放在哪里而不是arrayList吗?在PrepareSenticationsEntencesList
上你可以使用Instance.put(key,value
),它key
是索引,value
是权重`
// Entry point: builds the bag-of-words vocabulary from the full dataset,
// then prints every sentence in libsvm format ("<label> <index>:1 ..."),
// labelling negative sentences "-1" and positive ones "+1".
// NOTE(review): bagOfWords, dataSet, negData and posData are declared
// elsewhere in this file — prepareBOW must run first, because
// prepareSentimentalSentencesList resolves word indexes against bagOfWords.
public static void main(String... arg){
bagOfWords = prepareBOW(dataSet); // Provide dataset
prepareSentimentalSentencesList(negData, "-1 ");
prepareSentimentalSentencesList(posData, "+1 ");
}
/**
 * Builds the bag-of-words vocabulary for the given sentences.
 * Each word is normalised (commas, stray spaces and dots removed,
 * lower-cased) before being collected; duplicates are then removed while
 * preserving first-seen order, and the empty token is dropped.
 *
 * @param dataSet sentences, one string each, words separated by spaces
 * @return the de-duplicated vocabulary (also stored in the bagOfWords field)
 */
public List<String> prepareBOW(List<String> dataSet) {
    bagOfWords = new ArrayList<String>();
    // Iterate each sentence of the dataset.
    for (String s : dataSet) {
        String[] words = s.split(" ");
        // Sentinel token marking a sentence boundary.
        // NOTE(review): after de-duplication only one copy of this sentinel
        // survives in the whole vocabulary — confirm that is intended.
        bagOfWords.add("*&^(0");
        // Normalise and collect every word of the sentence.
        for (int i = 0; i < words.length; i++) {
            words[i] = words[i].replaceAll(",", "");
            words[i] = words[i].replaceAll(" ", "");
            words[i] = words[i].replaceAll("\\.", "");
            words[i] = words[i].toLowerCase();
            bagOfWords.add(words[i]);
        }
    }
    // Remove duplicates while keeping first-seen order.
    bagOfWords = new ArrayList<String>(new LinkedHashSet<String>(bagOfWords));
    // BUG FIX: remove the empty token AFTER de-duplication. The original
    // called remove("") before de-duplicating, which deleted only the first
    // of possibly many "" entries (split on consecutive spaces produces
    // them), letting one empty string survive into the vocabulary.
    bagOfWords.remove("");
    return bagOfWords;
}
/**
 * Prints each sentence of dataSet as a libsvm-style line:
 * "<label> <index>:1 <index>:1 ...". Words are normalised the same way as in
 * prepareBOW (commas, stray spaces and dots removed, lower-cased) and looked
 * up via getIndex; words not found in the vocabulary (index -1) are skipped.
 *
 * @param dataSet sentences to convert
 * @param label   class label prefix, e.g. "-1 " or "+1 " (callers pass a
 *                trailing space — preserved as-is)
 */
public void prepareSentimentalSentencesList(List<String> dataSet, String label) {
    List<String> list = new ArrayList<String>();
    for (String data : dataSet) {
        // PERF FIX: StringBuilder instead of repeated String concatenation
        // inside the loop (the original rebuilt the string per word).
        StringBuilder wordsIndex = new StringBuilder(label);
        for (String word : data.split(" ")) {
            word = word.replaceAll(",", "");
            word = word.replaceAll(" ", "");
            word = word.replaceAll("\\.", "");
            word = word.toLowerCase();
            int index = getIndex(word);
            if (index != -1) {
                wordsIndex.append(index).append(":1 ");
            }
        }
        list.add(wordsIndex.toString());
    }
    for (String s : list) {
        System.out.println(s);
    }
}