Hadoop selecting distinct query in java map reduce


My goal is to remove duplicate values. My input looks like this:

10001|76884|1995-06-24|1996-06-23
10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24
and the expected output is:

10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24

This is the code I wrote:
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class charterSelDistRec {

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private String tableKey, tableValue;

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String splitarray[] = line.split("\\|", 2);
            tableKey = splitarray[0].trim();
            tableValue = splitarray[1].trim();

            context.write(new Text(tableKey), new Text(tableValue));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterator<Text> values, Context context)
                throws IOException, InterruptedException {
            String ColumnDelim = "";
            String tableOutValue = ColumnDelim + values;
            context.write(new Text(key), new Text(tableOutValue));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "charterSelDistRec");
        job.getConfiguration().set("mapreduce.job.queuename", "root.Dev");
        job.getConfiguration().set("mapreduce.output.textoutputformat.separator", "|");
        job.setJobName("work_charter_stb.ext_chtr_vod_fyi_mapped");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJarByClass(charterSelDistRec.class);
        job.waitForCompletion(true);
    }
}

But the output file still has dups. Please tell me where I am going wrong.

Your first line has two records and the second line has one record. After reading a line in the map you split on |, but from what I can see your rows (entities) are separated by spaces. First verify whether the actual data really looks like that. The conventional format is to keep each row (entity) on its own line; map reduce then filters down to the unique keys after the map phase. Once your input is in that format, what reaches the reducer is only the unique keys.

If your input is any different (such as two or more records on the same line), you need to consider a different input format, or handle the logic differently, for example in the mapper itself (see the sketch below). Understanding how map reduce works and the input formats it accepts will help you more. Happy learning!
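
For illustration, here is a minimal sketch of that "handle it in the mapper" route, assuming (hypothetically) that several pipe-delimited records can share one physical line separated by whitespace; it splits them apart in the mapper and emits each whole record as a key, so the shuffle itself removes the duplicates:

    // Sketch only; uses the same org.apache.hadoop imports as the code above.
    // Assumes records are pipe-delimited and separated by whitespace when
    // several of them share one physical line (hypothetical input layout).
    public static class SplitMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        public void map(LongWritable offset, Text value, Context context)
                throws IOException, InterruptedException {
            for (String record : value.toString().trim().split("\\s+")) {
                if (!record.isEmpty()) {
                    // The whole record is the key; exact duplicates land in
                    // the same reduce group and can be emitted once there.
                    context.write(new Text(record), NullWritable.get());
                }
            }
        }
    }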

It doesn't have to be that complicated. All you have to do is:

  • In the mapper, emit each line as the key, with any arbitrary value

  • In the reducer, just emit the key and ignore the values

  • Sharing the code:

    Here is the input:

    10001|76884|1995-06-24|1996-06-23
    10001|76884|1995-06-24|1996-06-23
    10001|75286|1993-06-24|1994-06-24

    And here is the code:


    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class StackRemoveDup {
    
        public  static class MyMapper extends Mapper<LongWritable,Text, Text, NullWritable> {
    
            @Override
            public void map(LongWritable ignore, Text value, Context context)
                throws java.io.IOException, InterruptedException {
                context.write(value,NullWritable.get());
            }  
        }
    
        public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    
          @Override
          public void reduce(Text key, Iterable<NullWritable> values, Context context)
              throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
          }
        }       
    
      public static void main(String[] args) 
                      throws IOException, ClassNotFoundException, InterruptedException {
    
        Job job = new Job();
        job.setJarByClass(StackRemoveDup.class);
        job.setJobName("StackRemoveDup");
    
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
    
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
    
        job.waitForCompletion(true);
      }
    }
    
    Here is the output:

    10001|75286|1993-06-24|1994-06-24
    10001|76884|1995-06-24|1996-06-23
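
    One optional refinement (my suggestion, not part of the original answer): since MyReducer consumes and produces the same (Text, NullWritable) pairs, it can double as a combiner, so duplicates are already dropped on the map side and less data is shuffled:

    // Optional: reuse the reducer as a combiner to pre-deduplicate map output.
    job.setCombinerClass(MyReducer.class);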
    It doesn't have to be that complicated, and you can keep your own class. The real catch in your version is that your reduce method takes Iterator<Text> rather than Iterable<Text>, so it never overrides Reducer.reduce and Hadoop runs the default identity reducer, which passes the duplicates straight through. All you have to do is:

  • In the mapper, emit each line as the key (any value will do)

  • In the reducer, emit only one record per key, which with your input gives:

    10001|75286|1993-06-24|1994-06-24
    10001|76884|1995-06-24|1996-06-23
    import java.io.IOException;
    import java.util.*;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.mapreduce.Mapper.Context;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    
    public class charterSelDistRec {
    
        public  static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
    
            @Override
            public void map(LongWritable ignore, Text value, Context context)
                throws IOException, InterruptedException {
                context.write(value, value);
            }  
        }
    
        public static class MyReducer extends Reducer<Text, Text, Text, NullWritable> {    
          @Override
          public void reduce(Text key, Iterable<Text> values, Context context)
              throws IOException, InterruptedException {
              for (Text value : values){
                  context.write(value, NullWritable.get());
                  break;
              }
          }
        }       
    
      /* This is your main. Changed the outputValueClass, set the map output
         value class, and registered MyMapper/MyReducer. */
      public static void main(String[] args) throws Exception {
          Configuration conf = new Configuration();
          Job job = new Job(conf,"charterSelDistRec");
          job.getConfiguration().set("mapreduce.job.queuename", "root.Dev");
          job.getConfiguration().set("mapreduce.output.textoutputformat.separator","|");
          job.setJobName("work_charter_stb.ext_chtr_vod_fyi_mapped");
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(NullWritable.class);
          // The mapper emits Text values while the job's final output value class
          // is NullWritable, so the intermediate value class must be set explicitly.
          job.setMapOutputValueClass(Text.class);

          job.setMapperClass(MyMapper.class);

          job.setReducerClass(MyReducer.class);
    
          job.setInputFormatClass(TextInputFormat.class);
          job.setOutputFormatClass(TextOutputFormat.class);
    
    
          FileInputFormat.addInputPath(job, new Path(args[0]));
          FileOutputFormat.setOutputPath(job, new Path(args[1]));
          job.setJarByClass(charterSelDistRec.class); 
          job.waitForCompletion(true);
       }
    }
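
    A small side note (mine, not from the original answers): the Job constructor used in these drivers has been deprecated since Hadoop 2; the equivalent modern idiom is the factory method:

    // Preferred on Hadoop 2+; behaves the same as new Job(conf, "charterSelDistRec").
    Job job = Job.getInstance(conf, "charterSelDistRec");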