Hadoop selecting distinct query in java map reduce


My goal is to remove duplicate values. My input looks like this:

10001|76884|1995-06-24|1996-06-23
10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24
and the expected output is:

10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24

This is the code I wrote:
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class charterSelDistRec {

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private String tableKey, tableValue;

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String splitarray[] = line.split("\\|", 2);
            tableKey = splitarray[0].trim();
            tableValue = splitarray[1].trim();

            context.write(new Text(tableKey), new Text(tableValue));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterator<Text> values, Context context)
                throws IOException, InterruptedException {
            String ColumnDelim = "";
            String tableOutValue = ColumnDelim + values;
            context.write(new Text(key), new Text(tableOutValue));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "charterSelDistRec");
        job.getConfiguration().set("mapreduce.job.queuename", "root.Dev");
        job.getConfiguration().set("mapreduce.output.textoutputformat.separator", "|");
        job.setJobName("work_charter_stb.ext_chtr_vod_fyi_mapped");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJarByClass(charterSelDistRec.class);
        job.waitForCompletion(true);
    }
}

But the output file still has dups. Please tell me where I am going wrong.

Your first line has two records and the second line has one record. After reading a line in the map you split on |, but from what I can see your rows (entities) are separated by spaces. First verify whether the actual data really looks like that. The conventional format is to keep each row (entity) on its own line; map reduce then filters down to the unique keys after the map phase. Once your input is in that format, what reaches the reducer is only the unique keys.

If your input is any different (such as two or more records on the same line), you need to consider a different input format, or handle the logic differently, for example in the mapper itself (see the sketch below). Understanding how map reduce works and the input formats it accepts will help you more. Happy learning!
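
For illustration, here is a minimal sketch of that "handle it in the mapper" route, assuming (hypothetically) that several pipe-delimited records can share one physical line separated by whitespace; it splits them apart in the mapper and emits each whole record as a key, so the shuffle itself removes the duplicates:

    // Sketch only; uses the same org.apache.hadoop imports as the code above.
    // Assumes records are pipe-delimited and separated by whitespace when
    // several of them share one physical line (hypothetical input layout).
    public static class SplitMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        public void map(LongWritable offset, Text value, Context context)
                throws IOException, InterruptedException {
            for (String record : value.toString().trim().split("\\s+")) {
                if (!record.isEmpty()) {
                    // The whole record is the key; exact duplicates land in
                    // the same reduce group and can be emitted once there.
                    context.write(new Text(record), NullWritable.get());
                }
            }
        }
    }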

It doesn't have to be that complicated. All you have to do is:

  • In the mapper, emit each line as the key, with any arbitrary value

  • In the reducer, just emit the key and ignore the values

  • Sharing the code:

    Here is the input:

    10001|76884|1995-06-24|1996-06-23
    10001|76884|1995-06-24|1996-06-23
    10001|75286|1993-06-24|1994-06-24

    And here is the code:


    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class StackRemoveDup {
    
        public  static class MyMapper extends Mapper<LongWritable,Text, Text, NullWritable> {
    
            @Override
            public void map(LongWritable ignore, Text value, Context context)
                throws java.io.IOException, InterruptedException {
                context.write(value,NullWritable.get());
            }  
        }
    
        public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    
          @Override
          public void reduce(Text key, Iterable<NullWritable> values, Context context)
              throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
          }
        }       
    
      public static void main(String[] args) 
                      throws IOException, ClassNotFoundException, InterruptedException {
    
        Job job = new Job();
        job.setJarByClass(StackRemoveDup.class);
        job.setJobName("StackRemoveDup");
    
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
    
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
    
        job.waitForCompletion(true);
      }
    }
    
    Here is the output:

    10001|75286|1993-06-24|1994-06-24
    10001|76884|1995-06-24|1996-06-23
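
    One optional refinement (my suggestion, not part of the original answer): since MyReducer consumes and produces the same (Text, NullWritable) pairs, it can double as a combiner, so duplicates are already dropped on the map side and less data is shuffled:

    // Optional: reuse the reducer as a combiner to pre-deduplicate map output.
    job.setCombinerClass(MyReducer.class);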
    It doesn't have to be that complicated, and you can keep your own class. The real catch in your version is that your reduce method takes Iterator<Text> rather than Iterable<Text>, so it never overrides Reducer.reduce and Hadoop runs the default identity reducer, which passes the duplicates straight through. All you have to do is:

  • In the mapper, emit each line as the key (any value will do)

  • In the reducer, emit only one record per key, which with your input gives:

    10001|75286|1993-06-24|1994-06-24
    10001|76884|1995-06-24|1996-06-23
    import java.io.IOException;
    import java.util.*;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.mapreduce.Mapper.Context;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    
    public class charterSelDistRec {
    
        public  static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
    
            @Override
            public void map(LongWritable ignore, Text value, Context context)
                throws IOException, InterruptedException {
                context.write(value, value);
            }  
        }
    
        public static class MyReducer extends Reducer<Text, Text, Text, NullWritable> {    
          @Override
          public void reduce(Text key, Iterable<Text> values, Context context)
              throws IOException, InterruptedException {
              for (Text value : values){
                  context.write(value, NullWritable.get());
                  break;
              }
          }
        }       
    
      /* This is your main. Changed the outputValueClass, set the map output
         value class, and registered MyMapper/MyReducer. */
      public static void main(String[] args) throws Exception {
          Configuration conf = new Configuration();
          Job job = new Job(conf,"charterSelDistRec");
          job.getConfiguration().set("mapreduce.job.queuename", "root.Dev");
          job.getConfiguration().set("mapreduce.output.textoutputformat.separator","|");
          job.setJobName("work_charter_stb.ext_chtr_vod_fyi_mapped");
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(NullWritable.class);
          // The mapper emits Text values while the job's final output value class
          // is NullWritable, so the intermediate value class must be set explicitly.
          job.setMapOutputValueClass(Text.class);

          job.setMapperClass(MyMapper.class);

          job.setReducerClass(MyReducer.class);
    
          job.setInputFormatClass(TextInputFormat.class);
          job.setOutputFormatClass(TextOutputFormat.class);
    
    
          FileInputFormat.addInputPath(job, new Path(args[0]));
          FileOutputFormat.setOutputPath(job, new Path(args[1]));
          job.setJarByClass(charterSelDistRec.class); 
          job.waitForCompletion(true);
       }
    }
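
    A small side note (mine, not from the original answers): the Job constructor used in these drivers has been deprecated since Hadoop 2; the equivalent modern idiom is the factory method:

    // Preferred on Hadoop 2+; behaves the same as new Job(conf, "charterSelDistRec").
    Job job = Job.getInstance(conf, "charterSelDistRec");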