Java 如何在training集合中定义多个标签以用于Deeplearning4j?
我是 ML 新手,我已经开始使用Deeplearning4j库。我在源代码中迷失了方向。如何读取具有多个标签(而不仅仅是1个)的训练集?例如,我想教lstm将文本分为4类。我如何才能读取training数据集? 谢谢 编辑: 这就是我的迭代器代码现在的样子。我已经得到了POJO类的空缺,其中只包含技能的ID和空缺文本列表。在每个训练/测试集的每个文件中,有两行:一行带有ID(逗号是分隔符)和文本。所有的技能集包含4项技能,所以net的输出等于5项。我已经训练了word2vec模型,所以我的迭代器也使用了它 我使用原始代码作为示例 我的迭代器:Java 如何在training集合中定义多个标签以用于Deeplearning4j?,java,machine-learning,deep-learning,deeplearning4j,Java,Machine Learning,Deep Learning,Deeplearning4j,我是 ML 新手,我已经开始使用Deeplearning4j库。我在源代码中迷失了方向。如何读取具有多个标签(而不仅仅是1个)的训练集?例如,我想教lstm将文本分为4类。我如何才能读取training数据集? 谢谢 编辑: 这就是我的迭代器代码现在的样子。我已经得到了POJO类的空缺,其中只包含技能的ID和空缺文本列表。在每个训练/测试集的每个文件中,有两行:一行带有ID(逗号是分隔符)和文本。所有的技能集包含4项技能,所以net的输出等于5项。我已经训练了word2vec模型,所以我的迭代器也使用了它。
package SkillsMiner;
import SkillsMiner.Entities.VacancyLightEntity;
import SkillsMiner.Utils.Reader;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.deeplearning4j.datasets.iterator.DataSetIterator;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.DataSetPreProcessor;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.INDArrayIndex;
import org.nd4j.linalg.indexing.NDArrayIndex;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.NoSuchElementException;
/** This is a DataSetIterator that is specialized for the IMDB review dataset used in the Word2VecSentimentRNN example
* It takes either the train or test set data from this data set, plus a WordVectors object (typically the Google News
* 300 pretrained vectors from https://code.google.com/p/word2vec/) and generates training data sets.<br>
* Inputs/features: variable-length time series, where each word (with unknown words removed) is represented by
* its Word2Vec vector representation.<br>
* Labels/target: a single class (negative or positive), predicted at the final time step (word) of each review
*
* @author Alex Black
*/
public class SentimentExampleIterator implements DataSetIterator {
private final WordVectors wordVectors;
private final int batchSize;
private final int vectorSize;
private final int truncateLength;
private int cursor = 0;
private final File[] filePathes;
private final TokenizerFactory tokenizerFactory;
private int labelsCount = 4;
/**
* @param dataDirectory the directory of the IMDB review data set
* @param wordVectors WordVectors object
* @param batchSize Size of each minibatch for training
* @param truncateLength If reviews exceed
* @param train If true: return the training data. If false: return the testing data.
*/
public SentimentExampleIterator(String dataDirectory, WordVectors wordVectors, int batchSize, int truncateLength, boolean train) throws IOException {
this.batchSize = batchSize;
this.vectorSize = wordVectors.lookupTable().layerSize();
File p = new File(FilenameUtils.concat(dataDirectory, "learning/" + (train ? "train" : "test")) + "/");
filePathes = p.listFiles();
this.wordVectors = wordVectors;
this.truncateLength = truncateLength;
tokenizerFactory = new DefaultTokenizerFactory();
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
}
@Override
public DataSet next(int num) {
if (cursor >= filePathes.length) throw new NoSuchElementException();
try{
return nextDataSet(num);
}catch(IOException e){
throw new RuntimeException(e);
}
}
private DataSet nextDataSet(int num) throws IOException {
List<VacancyLightEntity> vacancies = new ArrayList<>(num);
boolean[] positive = new boolean[num];
for( int i=0; i<num && cursor<totalExamples(); i++ ){
String path = filePathes[cursor].getAbsolutePath();
vacancies.add(Reader.readVacancyFromFile(path));
cursor++;
}
//Second: tokenize vacancies and filter out unknown words
List<List<String>> allTokens = new ArrayList<>(vacancies.size());
int maxLength = 0;
for(VacancyLightEntity v : vacancies){
List<String> tokens = tokenizerFactory.create(v.getText()).getTokens();
List<String> tokensFiltered = new ArrayList<>();
for(String t : tokens ){
if(wordVectors.hasWord(t)) tokensFiltered.add(t);
}
allTokens.add(tokensFiltered);
maxLength = Math.max(maxLength,tokensFiltered.size());
}
//If longest review exceeds 'truncateLength': only take the first 'truncateLength' words
if(maxLength > truncateLength) maxLength = truncateLength;
//Create data for training
//Here: we have vacancies.size() examples of varying lengths
INDArray features = Nd4j.create(vacancies.size(), vectorSize, maxLength);
INDArray labels = Nd4j.create(vacancies.size(), labelsCount, maxLength); //Two labels: positive or negative
//Because we are dealing with vacancies of different lengths and only one output at the final time step: use padding arrays
//Mask arrays contain 1 if data is present at that time step for that example, or 0 if data is just padding
INDArray featuresMask = Nd4j.zeros(vacancies.size(), maxLength);
INDArray labelsMask = Nd4j.zeros(vacancies.size(), maxLength);
int[] temp = new int[2];
for( int i=0; i<vacancies.size(); i++ ){
List<String> tokens = allTokens.get(i);
temp[0] = i;
//Get word vectors for each word in review, and put them in the training data
for( int j=0; j<tokens.size() && j<maxLength; j++ ){
String token = tokens.get(j);
INDArray vector = wordVectors.getWordVectorMatrix(token);
features.put(new INDArrayIndex[]{NDArrayIndex.point(i), NDArrayIndex.all(), NDArrayIndex.point(j)}, vector);
temp[1] = j;
featuresMask.putScalar(temp, 1.0); //Word is present (not padding) for this example + time step -> 1.0 in features mask
}
int idx = (positive[i] ? 0 : 1);
int lastIdx = Math.min(tokens.size(),maxLength);
labels.putScalar(new int[]{i,idx,lastIdx-1},1.0); //Set label: [0,1] for negative, [1,0] for positive
labelsMask.putScalar(new int[]{i,lastIdx-1},1.0); //Specify that an output exists at the final time step for this example
}
return new DataSet(features,labels,featuresMask,labelsMask);
}
@Override
public int totalExamples() {
return filePathes.length;
}
@Override
public int inputColumns() {
return vectorSize;
}
@Override
public int totalOutcomes() {
return 2;
}
@Override
public void reset() {
cursor = 0;
}
@Override
public int batch() {
return batchSize;
}
@Override
public int cursor() {
return cursor;
}
@Override
public int numExamples() {
return totalExamples();
}
@Override
public void setPreProcessor(DataSetPreProcessor preProcessor) {
throw new UnsupportedOperationException();
}
@Override
public List<String> getLabels() {
return Arrays.asList("positive","negative");
}
@Override
public boolean hasNext() {
return cursor < numExamples();
}
@Override
public DataSet next() {
return next(batchSize);
}
@Override
public void remove() {
}
}
packageskillsminer;
导入SkillsMiner.Entities.vacancylightity;
导入SkillsMiner.Utils.Reader;
导入org.apache.commons.io.FileUtils;
导入org.apache.commons.io.FilenameUtils;
导入org.deeplearning4j.datasets.iterator.DataSetIterator;
导入org.deeplearning4j.models.embeddings.wordvectors.wordvectors;
导入org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
导入org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
导入org.deeplearning4j.text.tokenization.tokenizerfactory.tokenizerfactory;
导入org.nd4j.linalg.api.ndarray.INDArray;
导入org.nd4j.linalg.dataset.dataset;
导入org.nd4j.linalg.dataset.api.DataSetPreProcessor;
导入org.nd4j.linalg.factory.nd4j;
导入org.nd4j.linalg.index.INDArrayIndex;
导入org.nd4j.linalg.index.NDArrayIndex;
导入java.io.File;
导入java.io.IOException;
导入java.util.ArrayList;
导入java.util.array;
导入java.util.List;
导入java.util.NoSuchElementException;
/**这是一个DataSetitor,专门用于Word2VecPernn示例中使用的IMDB review数据集
*它从这个数据集中获取火车或测试集数据,再加上一个WordVectors对象(通常是Google新闻)
*300个预训练向量来自https://code.google.com/p/word2vec/)并生成训练数据集。
*输入/特征:可变长度时间序列,其中每个单词(删除未知单词)由
*它的Word2Vec向量表示法。
*标签/目标:单个类别(负面或正面),在每次评审的最后时间步(word)预测
*
*@作者亚历克斯·布莱克
*/
公共类ExampleIterator实现DataSetIterator{
私有最终字向量;
私有最终整数批量大小;
私有最终整数向量大小;
私人最终整数截断长度;
私有int游标=0;
私有最终文件[]文件路径;
私人最终TokenizerFactory TokenizerFactory;
私有int标签计数=4;
/**
*@param dataDirectory IMDB审查数据集的目录
*@param wordVectors wordVectors对象
*@param batchSize用于培训的每个小批量的大小
*@param truncateLength如果评论超过
*@param train If true:返回训练数据;If false:返回测试数据。
*/
公共情感ExampleIterator(字符串数据目录、字向量、字向量、int-batchSize、int-truncateLength、布尔序列)抛出IOException{
this.batchSize=batchSize;
this.vectorSize=wordVectors.lookupTable().layerSize();
文件p=新文件(FilenameUtils.concat(数据目录,“学习/”+(训练?):“测试”)+“/”;
filepaths=p.listFiles();
this.wordVectors=wordVectors;
this.truncateLength=truncateLength;
tokenizerFactory=新的DefaultTokenizerFactory();
setTokenPreProcessor(新的CommonPreprocessor());
}
@凌驾
公共数据集下一个(int num){
如果(cursor>=filepaths.length)抛出新的NoSuchElementException();
试一试{
返回下一个数据集(num);
}捕获(IOE异常){
抛出新的运行时异常(e);
}
}
私有数据集nextDataSet(int num)引发IOException{
列表空缺=新阵列列表(num);
布尔[]正=新布尔[num];
for(int i=0;i显示您迄今为止的尝试。共享您的代码。这不是谷歌,抱歉。@MangeshHotage yeap,抱歉,我认为我的代码没有帮助。我无法回答您的问题,但如果您查看教程,它会指向JavaDoc API,我认为您可以将同一图像放在多个目录中。因此,请复制“pets.jpg”进入dogs/pets.jpg和cats/pets.jpg。如果我有机会尝试,我会让你知道它是如何工作的。