Java: How can I avoid spill in my KNN MapReduce program?

Tags: java, hadoop, mapreduce, knn, bigdata

The program I wrote below produces an incredible amount of spill (the spilled records add up to several GB, while my input and output data are only about 20 MB).

All I do is store the test file in the distributed cache and look it up each time a line of training data is passed to the map() function. I cannot set a Combiner here, because none of the N records (N = the number of test records) produced by each map() call share a key (I use the test record's index as the key), so combining would be meaningless.
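
KnnMapper itself is not shown here, so the following is only a minimal sketch of the behavior just described; the CSV layout (test.csv holds bare feature vectors, train.csv has the class label in its last column) and the Euclidean distance are assumptions:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical reconstruction of the mapper described above. Assumes test.csv
// holds comma-separated numeric features and train.csv carries the class
// label in its last column; the distance is Euclidean.
public class KnnMapper extends Mapper<LongWritable, Text, LongWritable, DistClassPair> {
    private final List<double[]> testPoints = new ArrayList<>();

    @Override
    protected void setup(Context context) throws IOException {
        // The cached file is localized into the task's working directory,
        // so it can be opened by its plain file name.
        Path cached = new Path(context.getCacheFiles()[0].getPath());
        try (BufferedReader reader = new BufferedReader(new FileReader(cached.getName()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split(",");
                double[] point = new double[parts.length];
                for (int i = 0; i < parts.length; i++) {
                    point[i] = Double.parseDouble(parts[i]);
                }
                testPoints.add(point);
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] parts = value.toString().split(",");
        String cls = parts[parts.length - 1]; // last column: class label
        // One output record per test point -- this is the N-records-per-call
        // behavior described in the question, and the source of the spill.
        for (int i = 0; i < testPoints.size(); i++) {
            double[] test = testPoints.get(i);
            double sum = 0;
            for (int d = 0; d < test.length; d++) {
                double diff = Double.parseDouble(parts[d]) - test[d];
                sum += diff * diff;
            }
            context.write(new LongWritable(i), new DistClassPair(Math.sqrt(sum), cls));
        }
    }
}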

Below is the KnnDriver I wrote, in case you want to look at it.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class KnnDriver extends Configured implements Tool {
    /*
     *  args = N, test.csv, train.csv, outputpath
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new KnnDriver(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        // Validate the argument count before any element of args is used.
        if (args.length != 4) {
            System.err.println("Usage: KnnDriver <N> <test.csv> <train.csv> <output dir>");
            return 2;
        }

        Configuration conf = getConf();
        conf.set("N", args[0]);

        Job job = Job.getInstance(conf, "K-Nearest-Neighbor mapreduce");
        job.setJarByClass(KnnDriver.class);

        // Ship the test set to every map task via the distributed cache.
        job.addCacheFile(new URI(args[1]));

        job.setMapperClass(KnnMapper.class);
        job.setReducerClass(KnnReducer.class);

        // Map output: (test index, DistClassPair); final output: (test index, predicted class).
        job.setOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DistClassPair.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // args[2] is the training input; args[3] must be a directory, not a file.
        FileInputFormat.setInputPaths(job, new Path(args[2]));

        Path outputPath = new Path(args[3]);
        // Delete any previous output so the job does not abort on an existing directory.
        FileSystem.get(conf).delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);

        return(job.waitForCompletion(true) ? 0 : -1);
    }

}
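
Assuming the job is packaged as knn.jar (the jar name and paths here are illustrative), it would be launched along these lines:

hadoop jar knn.jar KnnDriver 5 /data/knn/test.csv /data/knn/train.csv /data/knn/output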

Thanks a lot.

Why not discard candidates that are too far away? You are building the full N×M distance matrix, and that is why you see such a huge amount of spill.
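
A sketch of how that suggestion could be implemented (hypothetical, built on the same assumptions as the KnnMapper sketch above): buffer at most K candidates per test index in a bounded max-heap and emit them only in cleanup(). Map output then shrinks from N×M records to at most N×K:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Pruned variant of the hypothetical mapper: keep only the K nearest
// candidates per test index in a max-heap and emit them in cleanup().
public class PrunedKnnMapper extends Mapper<LongWritable, Text, LongWritable, DistClassPair> {
    private int k;
    private final List<double[]> testPoints = new ArrayList<>();
    // One bounded heap per test index; the head is the farthest kept candidate.
    private final Map<Long, PriorityQueue<DistClassPair>> nearest = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException {
        k = context.getConfiguration().getInt("N", 5);
        Path cached = new Path(context.getCacheFiles()[0].getPath());
        try (BufferedReader reader = new BufferedReader(new FileReader(cached.getName()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split(",");
                double[] point = new double[parts.length];
                for (int i = 0; i < parts.length; i++) {
                    point[i] = Double.parseDouble(parts[i]);
                }
                testPoints.add(point);
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) {
        String[] parts = value.toString().split(",");
        String cls = parts[parts.length - 1];
        for (int i = 0; i < testPoints.size(); i++) {
            double[] test = testPoints.get(i);
            double sum = 0;
            for (int d = 0; d < test.length; d++) {
                double diff = Double.parseDouble(parts[d]) - test[d];
                sum += diff * diff;
            }
            PriorityQueue<DistClassPair> heap = nearest.computeIfAbsent(
                    (long) i, idx -> new PriorityQueue<DistClassPair>(Comparator.reverseOrder()));
            heap.add(new DistClassPair(Math.sqrt(sum), cls));
            if (heap.size() > k) {
                heap.poll(); // drop the farthest of the k+1 candidates
            }
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // At most N*K records leave each mapper; the reducer still merges
        // the per-mapper candidate lists into a global top K.
        for (Map.Entry<Long, PriorityQueue<DistClassPair>> entry : nearest.entrySet()) {
            for (DistClassPair pair : entry.getValue()) {
                context.write(new LongWritable(entry.getKey()), pair);
            }
        }
    }
}

Raising mapreduce.task.io.sort.mb would also buy some headroom, but pruning removes the N×M intermediate volume itself rather than just enlarging the sort buffer.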

For reference, here is the DistClassPair writable used as the map output value:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class DistClassPair implements WritableComparable<DistClassPair> {
    private double dist;
    private String cls;

    // Hadoop instantiates Writables reflectively, so a public no-argument
    // constructor is required before readFields() can be called.
    public DistClassPair() {
    }

    public DistClassPair(double dist, String cls) {
        this.dist = dist;
        this.cls = cls;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        dist = in.readDouble();
        // readUTF() pairs with writeUTF() in write(); the original
        // readLine()/writeBytes() combination does not round-trip, because
        // writeBytes() emits neither a length prefix nor a terminator.
        cls = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeDouble(dist);
        out.writeUTF(cls);
    }

    @Override
    public int compareTo(DistClassPair o) {
        return Double.compare(dist, o.dist);
    }

    public String getCls() {
        return cls;
    }

    // Accessor for the distance, useful when copying pairs out of
    // Hadoop's reused value objects.
    public double getDist() {
        return dist;
    }
}
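
KnnReducer is not shown in the question either; under the same assumptions, a matching reducer might sort the candidates per test index and take a majority vote among the K nearest:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer: for each test index, sort all candidate
// (distance, class) pairs, keep the K nearest, and emit the majority class.
public class KnnReducer extends Reducer<LongWritable, DistClassPair, LongWritable, Text> {
    private int k;

    @Override
    protected void setup(Context context) {
        k = context.getConfiguration().getInt("N", 5);
    }

    @Override
    protected void reduce(LongWritable key, Iterable<DistClassPair> values, Context context)
            throws IOException, InterruptedException {
        // Copy each value: Hadoop reuses the object it hands to the iterator.
        List<DistClassPair> pairs = new ArrayList<>();
        for (DistClassPair value : values) {
            pairs.add(new DistClassPair(value.getDist(), value.getCls()));
        }
        Collections.sort(pairs); // ascending by distance via compareTo()

        // Majority vote among the K nearest neighbors.
        Map<String, Integer> votes = new HashMap<>();
        for (int i = 0; i < Math.min(k, pairs.size()); i++) {
            votes.merge(pairs.get(i).getCls(), 1, Integer::sum);
        }
        String best = null;
        for (Map.Entry<String, Integer> entry : votes.entrySet()) {
            if (best == null || entry.getValue() > votes.get(best)) {
                best = entry.getKey();
            }
        }
        context.write(key, new Text(best));
    }
}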