MapReduce: sharing data between the master and the reduce tasks


I need to perform an aggregation using the results of all the reduce tasks. Basically, each reduce task finds a sum and a count for a value. I need to add up all the sums and counts to find the final average.

I tried using conf.setInt in the reducer, but when I try to access the value from the main function, it fails.

class Main {

public static class MyReducer 
extends Reducer<Text, Text,Text,IntWritable> {

    public void reduce(Text key, Iterable<Text> values, 
            Context context
            ) throws IOException, InterruptedException {
        int i = 0;
        int fd = 0, fc = 0;
        fd = context.getConfiguration().getInt("fd", -1);
        fc = context.getConfiguration().getInt("fc", -1);
        //when I check the values of fd and fc here, they are fine. fd and fc are shared across all reduce tasks and the updated value is seen by every reduce task. Only the main function doesn't seem to have access to them.
    }
}

public static void main(String[] args) throws Exception{
    Configuration conf = new Configuration();
    conf.setInt("fc", 5);

    Job job = new Job(conf, "Flight Data");
    job.setJarByClass(FlightData.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setSortComparatorClass(KeyComparator.class);


    job.setNumReduceTasks(10);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);


    int flightCount = job.getConfiguration().getInt("fc", -1);
    int flightDelay = job.getConfiguration().getInt("fd", -1);
    //here when I access fc, fd, I get back 5 & 5
    System.out.println("Final " + flightCount +" " + flightDelay+ " " + flightDelay/flightCount);
}
}
Using the new org.apache.hadoop.mapreduce API, override the run() of both the mapper and the reducer. Inside these methods you can emit the accumulated sum/count from each mapper or reducer.

Also, you need to limit the reducer count to 1 so that you get a global sum of all the sums generated by the multiple mappers.

See the following code for more details:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AggregationExample extends Configured implements Tool {

    /**
     * Mapper that accumulates a running sum and emits it once per mapper in run().
     */
    public static class MapJob extends Mapper<LongWritable, Text, Text, Text> {

        private Text outputKey = new Text();
        private Text outputValue = new Text();
        private double sum;

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            try {
                // say that you need to sum up the value part
                sum += Double.valueOf(value.toString());
            } catch (NumberFormatException e) {
                // ignore values that cannot be parsed as numbers
            }
        }

        @Override
        public void run(Context context) throws IOException, InterruptedException {

            setup(context);
            while (context.nextKeyValue()) {
                map(context.getCurrentKey(), context.getCurrentValue(), context);
            }

            // emit out the sum per mapper
            outputKey.set(String.valueOf(sum));
            context.write(outputKey, outputValue);// Notice that the outputValue is empty
            cleanup(context);

        }
    }

    /**
     * Reducer that adds up the per-mapper sums and emits the global sum in run().
     */
    public static class ReduceJob extends Reducer<Text, Text, Text, Text> {

        private Text outputKey = new Text();
        private Text outputValue = new Text();
        private double sum;

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
                InterruptedException {


            // summation of values from each mapper
            sum += Double.valueOf(key.toString());

        }

        @Override
        public void run(Context context) throws IOException, InterruptedException {

            setup(context);
            while (context.nextKey()) {
                reduce(context.getCurrentKey(), context.getValues(), context);
            }

            // emit out the global sums
            outputKey.set(String.valueOf(sum));
            context.write(outputKey, outputValue);
            cleanup(context);
        }
    }

    @Override
    public int run(String[] args) throws Exception {

        try {
            Configuration conf = getConf();

            // output key and value separator is empty as in final output only
            // key is emitted and value is empty
            conf.set("mapred.textoutputformat.separator", "");

            // Configuring mapred to have just one reducer as we need to find
            // single sum values from all the inputs
            conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
            conf.setInt("mapred.reduce.tasks", 1);

            Job job = new Job(conf);

            job.setJarByClass(AggregationExample.class);
            job.setJobName("Aggregation Example");

            job.setMapperClass(MapJob.class);
            job.setReducerClass(ReduceJob.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            FileInputFormat.setInputPaths(job, args[0]);
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            boolean success = job.waitForCompletion(true);
            return success ? 0 : 1;
        } catch (Exception e) {
            e.printStackTrace();
            return 1;
        }

    }

    public static void main(String[] args) throws Exception {

        if (args.length < 2) {
            System.out
                    .println("Usage: AggregationExample <comma sparated list of input directories> <output dir>");
            System.exit(-1);
        }

        int result = ToolRunner.run(new AggregationExample(), args);
        System.exit(result);
    }

}
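Assuming the class is packaged into a jar (the jar name below is only an example), the job can be launched with the standard hadoop jar command; note that the output directory must not already exist:

hadoop jar aggregation-example.jar AggregationExample <comma separated list of input directories> <output dir>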
//enum for counters used by reducers
public static enum FlightCounters {
    FLIGHT_COUNT,
    FLIGHT_DELAY;
}
public static class MyReducer 
extends Reducer<Text, Text,Text,IntWritable> {

    public void reduce(Text key, Iterable<Text> values, 
            Context context
            ) throws IOException, InterruptedException {


        float delay1, delay2;
        // origin and dest are assumed to be parsed from the input values (parsing omitted)
        delay1 = Float.parseFloat(origin[5]);
        delay2 = Float.parseFloat(dest[5]);
        // increment the global counters; Hadoop aggregates them across all tasks
        context.getCounter(FlightCounters.FLIGHT_COUNT).increment(1);
        context.getCounter(FlightCounters.FLIGHT_DELAY)
        .increment((long) (delay1 + delay2));

    }
}
public static void main(String[] args) throws Exception{
    float flightCount, flightDelay;
    // ... job setup as in the original main() ...
    job.waitForCompletion(true);
    //get the final results updated in counter by all map and reduce tasks
    flightCount = job.getCounters()
            .findCounter(FlightCounters.FLIGHT_COUNT).getValue();
    flightDelay = job.getCounters()
            .findCounter(FlightCounters.FLIGHT_DELAY).getValue();
}
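With the counter totals back in the driver, computing the final average is a single division; because flightCount and flightDelay are floats, this also avoids the integer-division issue in the original main(). For example:

    System.out.println("Final average delay: " + flightDelay / flightCount);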