Java 如何为支持向量机训练数据?
我想使用java ml来训练我的数据,以便对一些文档进行分类,我在做什么: 我有两个类别,每个类别有11000个文档。我总共有92199个功能,用于Java 如何为支持向量机训练数据?,java,classification,svm,libsvm,Java,Classification,Svm,Libsvm,我想使用java ml来训练我的数据,以便对一些文档进行分类,我在做什么: 我有两个类别,每个类别有11000个文档。我总共有92199个功能,用于信息增益-卡方检验-互信息-基尼,我使用其中20000个2列 所以我有22000个文档和20000个特性来训练数据,我发现每个文档和特性的交叉点,所以我有: 每个文档和功能的交集 不同:要素中存在但文档中不存在的数据 因此,我在一个文档2序列中发送交叉点,其tf_idf,与th_idf=0不同 我是如何做到这一点的: public void buil
信息增益-卡方检验-互信息-基尼
,我使用其中20000个2列
所以我有22000个文档和20000个特性来训练数据,我发现每个文档和特性的交叉点,所以我有:
每个文档和功能的交集
不同:要素中存在但文档中不存在的数据
因此,我在一个文档2序列中发送交叉点,其tf_idf
,与tf_idf=0
不同
我是如何做到这一点的:
/**
 * Builds the java-ml training dataset: for every document listed in the
 * metadata table, creates one SparseInstance whose values are the tf-idf
 * weights of the selected feature words that occur in the document (feature
 * words absent from the document get an explicit weight of 0.0), then
 * exports the dataset to "save.data".
 *
 * NOTE(review): assumes the fields dataset, neighbors, dbWeight and
 * features are initialised elsewhere in this class — confirm before use.
 */
public void buildDataset() {
    DBDocMeta dbDocMeta; // the table that contains documents
    dataset = new DefaultDataset();
    // Each neighbor contains a Document list for one category.
    // Neighbor<Integer>: document {index, tf_idf}; Neighbor<String>: {word, tf_idf}
    neighbors.add(new Neighbor<Integer>("cat1"));
    neighbors.add(new Neighbor<Integer>("cat2"));
    try {
        dbDocMeta = new DBDocMeta();
        Map<Long, String> docInfo = dbDocMeta.getDocInfo();
        documents:
        for (Long id : docInfo.keySet()) {
            String cat = docInfo.get(id);
            System.out.println("***********************************************");
            System.out.println("Available processors (cores): " + Runtime.getRuntime().availableProcessors());
            long freeMemory = Runtime.getRuntime().freeMemory();
            System.out.println("Free memory (bytes): " + freeMemory);
            if (freeMemory <= 500000000) {
                // Not enough headroom to keep building instances: save what we
                // have so far and stop. exportDataset may throw IOException,
                // which is handled by the catch clause below.
                System.out.println("memory problem occurred !!!");
                net.sf.javaml.tools.data.FileHandler.exportDataset(dataset, new File("dataset.data"));
                break; // directly inside the labelled loop, plain break suffices
            }
            long maxMemory = Runtime.getRuntime().maxMemory();
            System.out.println("Maximum memory (bytes): " + (maxMemory == Long.MAX_VALUE ? "no limit" : maxMemory));
            System.out.println("Total memory available to JVM (bytes): " + Runtime.getRuntime().totalMemory());
            System.out.println("category : " + cat);
            System.out.println("***********************************************");
            Document<String> doc1 = dbWeight.getNeighbors(id);
            Instance instance = new SparseInstance();
            instance.setClassValue(cat);
            if (doc1.getAttributes().isEmpty()) {
                continue; // document has no words at all: nothing to encode
            }
            for (Neighbor<Integer> neighbor : neighbors) {
                // Only the neighbor matching this document's category builds the instance.
                if (!neighbor.getCategory().equalsIgnoreCase(cat)) {
                    continue;
                }
                Set<String> intersectionWords = intersection(features, doc1.getAttributes().keySet());
                if (intersectionWords.isEmpty()) {
                    // Document shares no feature words: skip the whole document.
                    continue documents;
                }
                // Feature words present in the document keep their tf-idf weight ...
                HashSet<String> missing = new HashSet<String>(features);
                for (String word : intersectionWords) {
                    instance.put(dbWeight.getIndex(word), doc1.getAttributes().get(word));
                    missing.remove(word);
                }
                // ... feature words absent from the document get an explicit 0.0.
                for (String word : missing) {
                    instance.put(dbWeight.getIndex(word), 0.0);
                }
                dataset.add(instance);
                break; // at most one neighbor matches a document's category
            }
        }
    } catch (InterruptedException e) {
        // BUG FIX: restore the interrupt flag so callers can still observe
        // the interruption instead of having it silently swallowed.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    try {
        net.sf.javaml.tools.data.FileHandler.exportDataset(dataset, new File("save.data"));
        System.out.println("dataset has exported successfully");
    } catch (Exception e) {
        System.out.println("failed to export dataset");
        e.printStackTrace();
    }
}
/**
 * Returns a new set containing exactly the elements common to both argument
 * sets. Neither input set is modified. Copies the smaller set and retains
 * only members of the larger one, so the work is proportional to the size
 * of the smaller set.
 */
private static <A> Set<A> intersection(final Set<A> xs, final Set<A> ys) {
    final Set<A> smaller = (xs.size() <= ys.size()) ? xs : ys;
    final Set<A> larger = (smaller == xs) ? ys : xs;
    final HashSet<A> common = new HashSet<A>(smaller);
    common.retainAll(larger);
    return common;
}
public void buildDataset(){
DBDocMeta DBDocMeta;//包含文档的表
dataset=新的DefaultDataset();
add(新邻居(“cat1”);//每个邻居包含一个文档列表
add(新邻居(“cat2”);//邻居整数:文档{index,tf_idf}邻居字符串:{word,tf_idf}
试一试{
dbDocMeta=新的dbDocMeta();
Map docInfo=dbDocMeta.getDocInfo();
整数计数=1;
身份证件:
for(长id:docInfo.keySet()){
计数++;
字符串cat=docInfo.get(id);
System.out.println(“**********************************************************************”);
System.out.println("可用处理器(核心): " + Runtime.getRuntime().availableProcessors());
Long freeMemory = Runtime.getRuntime().freeMemory();
System.out.println("可用内存(字节): " + freeMemory);
如果 (freeMemory <= 500000000) { ...（此处代码在抓取时被截断）... }

我的尝试：
publicstaticvoidmain(字符串…arg){
BagoWords=prepareBOW(数据集);//提供数据集
编制意向书意向书清单(negData,“-1”);
编制实体实体清单(posData,“+1”);
}
公共列表prepareBOW(列表数据集){
BagoWords=新的ArrayList();
//迭代每一组数据/句子。
用于(字符串s:数据集){
字符串[]字=s.split(“”);
添加(“*&^(0”);
//在列表中添加句子/数据的每个单词。
for (int i = 0; i < words.length; i++) { ...（此处代码在抓取时被截断）... }
为什么不使用Instance.put
?它的研究目的..但是你能告诉我我应该把Instance.put放在哪里而不是arrayList吗?在PrepareSenticationsEntencesList
上你可以使用Instance.put(key,value
),它key
是索引,value
是权重`
// Entry point: builds the bag-of-words vocabulary from the full dataset,
// then prints every sentence in libsvm format ("<label> <index>:1 ..."),
// labelling negative sentences "-1" and positive ones "+1".
// NOTE(review): bagOfWords, dataSet, negData and posData are declared
// elsewhere in this file — prepareBOW must run first, because
// prepareSentimentalSentencesList resolves word indexes against bagOfWords.
public static void main(String... arg){
bagOfWords = prepareBOW(dataSet); // Provide dataset
prepareSentimentalSentencesList(negData, "-1 ");
prepareSentimentalSentencesList(posData, "+1 ");
}
/**
 * Builds the bag-of-words vocabulary for the given sentences.
 * Each word is normalised (commas, stray spaces and dots removed,
 * lower-cased) before being collected; duplicates are then removed while
 * preserving first-seen order, and the empty token is dropped.
 *
 * @param dataSet sentences, one string each, words separated by spaces
 * @return the de-duplicated vocabulary (also stored in the bagOfWords field)
 */
public List<String> prepareBOW(List<String> dataSet) {
    bagOfWords = new ArrayList<String>();
    // Iterate each sentence of the dataset.
    for (String s : dataSet) {
        String[] words = s.split(" ");
        // Sentinel token marking a sentence boundary.
        // NOTE(review): after de-duplication only one copy of this sentinel
        // survives in the whole vocabulary — confirm that is intended.
        bagOfWords.add("*&^(0");
        // Normalise and collect every word of the sentence.
        for (int i = 0; i < words.length; i++) {
            words[i] = words[i].replaceAll(",", "");
            words[i] = words[i].replaceAll(" ", "");
            words[i] = words[i].replaceAll("\\.", "");
            words[i] = words[i].toLowerCase();
            bagOfWords.add(words[i]);
        }
    }
    // Remove duplicates while keeping first-seen order.
    bagOfWords = new ArrayList<String>(new LinkedHashSet<String>(bagOfWords));
    // BUG FIX: remove the empty token AFTER de-duplication. The original
    // called remove("") before de-duplicating, which deleted only the first
    // of possibly many "" entries (split on consecutive spaces produces
    // them), letting one empty string survive into the vocabulary.
    bagOfWords.remove("");
    return bagOfWords;
}
/**
 * Prints each sentence of dataSet as a libsvm-style line:
 * "<label> <index>:1 <index>:1 ...". Words are normalised the same way as in
 * prepareBOW (commas, stray spaces and dots removed, lower-cased) and looked
 * up via getIndex; words not found in the vocabulary (index -1) are skipped.
 *
 * @param dataSet sentences to convert
 * @param label   class label prefix, e.g. "-1 " or "+1 " (callers pass a
 *                trailing space — preserved as-is)
 */
public void prepareSentimentalSentencesList(List<String> dataSet, String label) {
    List<String> list = new ArrayList<String>();
    for (String data : dataSet) {
        // PERF FIX: StringBuilder instead of repeated String concatenation
        // inside the loop (the original rebuilt the string per word).
        StringBuilder wordsIndex = new StringBuilder(label);
        for (String word : data.split(" ")) {
            word = word.replaceAll(",", "");
            word = word.replaceAll(" ", "");
            word = word.replaceAll("\\.", "");
            word = word.toLowerCase();
            int index = getIndex(word);
            if (index != -1) {
                wordsIndex.append(index).append(":1 ");
            }
        }
        list.add(wordsIndex.toString());
    }
    for (String s : list) {
        System.out.println(s);
    }
}