Java:如何在我的 KNN MapReduce 程序中避免大量溢写(spill)?

我编写的程序产生了难以置信的溢写量(高达几 GB,而我的输入和输出数据总共仅约 20MB)。我所做的只是把测试文件存入分布式缓存,并在每行训练数据传入 map() 函数时读取它。我不能在这里设置 Combiner,因为在我的实现中,每次 map() 调用会生成 N(= 测试数据条数)条记录,而这些记录没有任何一条共享同一个键(我使用测试数据的行索引作为键),所以 Combiner 没有意义。

下面是我写的 KnnDriver,供参考。
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for the K-Nearest-Neighbor MapReduce job.
 *
 * <p>Expected arguments: N (neighbor count), test.csv (shipped via the
 * distributed cache), train data directory (job input), output directory.
 */
public class KnnDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new KnnDriver(), args);
        System.exit(res);
    }

    /**
     * Configures and submits the KNN job.
     *
     * @param args {@code [N, test.csv, trainDir, outputDir]}
     * @return 0 on success, 2 on bad usage, -1 on job failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Validate BEFORE dereferencing args: the original read args[0] and
        // args[1] first, so a short argument list crashed with
        // ArrayIndexOutOfBoundsException instead of printing the usage message.
        if (args.length != 4) {
            System.err.println("Usage: KnnDriver <N> <test.csv> <train dir> <output dir>");
            return 2; // report failure through Tool.run's contract, not System.exit
        }

        Configuration conf = getConf();
        conf.set("N", args[0]); // neighbor count, read back by the mapper/reducer

        Job job = Job.getInstance(conf, "K-Nearest-Neighbor mapreduce");
        job.setJarByClass(KnnDriver.class);
        job.addCacheFile(new URI(args[1])); // test set replicated to every mapper

        job.setMapperClass(KnnMapper.class);
        job.setReducerClass(KnnReducer.class);

        // Map output: (test-row index, distance/class pair);
        // final output: (test-row index, predicted class as Text).
        job.setOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DistClassPair.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[2]));
        Path outputPath = new Path(args[3]);
        // Delete any stale output directory so reruns don't fail with
        // FileAlreadyExistsException.
        FileSystem.get(conf).delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);

        return job.waitForCompletion(true) ? 0 : -1;
    }
}
非常感谢。—— 回答:为什么不丢弃彼此距离太远的结果呢?您正在构建一个完整的 N×M 距离矩阵,这正是产生如此大量溢写(spill)的原因。
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Map-output value pairing a distance with the class label of a training point.
 * Ordered by distance so the reducer can pick the N nearest neighbors.
 */
public class DistClassPair implements WritableComparable<DistClassPair> {

    private double dist; // distance from the test point to a training point
    private String cls;  // class label of that training point

    /**
     * No-arg constructor. REQUIRED: Hadoop instantiates Writables reflectively
     * during deserialization; without this the job fails at runtime.
     */
    public DistClassPair() {
    }

    public DistClassPair(Double dist, String cls) {
        this.dist = dist;
        this.cls = cls;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        dist = in.readDouble();
        // readUTF pairs with writeUTF below. The original writeBytes/readLine
        // combination did not round-trip: writeBytes emits no newline, so
        // readLine consumed the next record's bytes and corrupted the stream.
        cls = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeDouble(dist);
        out.writeUTF(cls); // length-prefixed; round-trips with readUTF
    }

    @Override
    public int compareTo(DistClassPair o) {
        return Double.compare(dist, o.dist);
    }

    // equals/hashCode kept consistent with compareTo's ordering field plus cls,
    // so instances behave correctly in hash-based collections.
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof DistClassPair)) {
            return false;
        }
        DistClassPair other = (DistClassPair) obj;
        return Double.compare(dist, other.dist) == 0
                && (cls == null ? other.cls == null : cls.equals(other.cls));
    }

    @Override
    public int hashCode() {
        return 31 * Double.hashCode(dist) + (cls == null ? 0 : cls.hashCode());
    }

    public String getCls() {
        return cls;
    }

    public double getDist() {
        return dist;
    }
}
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for the K-Nearest-Neighbor MapReduce job.
 *
 * <p>Arguments: N (neighbor count), test.csv (distributed-cache file),
 * training-data directory (job input), output directory.
 */
public class KnnDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new KnnDriver(), args);
        System.exit(res);
    }

    /**
     * Sets up and runs the KNN job.
     *
     * @param args {@code [N, test.csv, trainDir, outputDir]}
     * @return 0 on success, 2 on bad usage, -1 on job failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Check argument count before touching args[0]/args[1]; in the
        // original code the length check came after those accesses, so bad
        // invocations crashed instead of printing the usage line.
        if (args.length != 4) {
            System.err.println("Usage: KnnDriver <N> <test.csv> <train dir> <output dir>");
            return 2; // propagate the status via Tool.run instead of System.exit
        }

        Configuration conf = getConf();
        conf.set("N", args[0]); // neighbor count consumed by KnnMapper/KnnReducer

        Job job = Job.getInstance(conf, "K-Nearest-Neighbor mapreduce");
        job.setJarByClass(KnnDriver.class);
        job.addCacheFile(new URI(args[1])); // test file pushed to every task node

        job.setMapperClass(KnnMapper.class);
        job.setReducerClass(KnnReducer.class);

        // Map emits (test-row index, DistClassPair); reduce emits (index, Text).
        job.setOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DistClassPair.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[2]));
        Path outputPath = new Path(args[3]);
        // Clear a pre-existing output directory so reruns do not abort with
        // FileAlreadyExistsException.
        FileSystem.get(conf).delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);

        return job.waitForCompletion(true) ? 0 : -1;
    }
}