Hadoop 在java map reduce中选择不同的查询
我的目标是删除重复（dup）的记录。输入数据如下：
10001|76884|1995-06-24|1996-06-23
10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24
我编写了如下代码
10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class charterSelDistRec {
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private String tableKey, tableValue;

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String splitarray[] = line.split("\\|", 2);
            tableKey = splitarray[0].trim();
            tableValue = splitarray[1].trim();
            context.write(new Text(tableKey), new Text(tableValue));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterator<Text> values, Context context)
                throws IOException, InterruptedException {
            String ColumnDelim = "";
            String tableOutValue = ColumnDelim + values;
            context.write(new Text(key), new Text(tableOutValue));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "charterSelDistRec");
        job.getConfiguration().set("mapreduce.job.queuename", "root.Dev");
        job.getConfiguration().set("mapreduce.output.textoutputformat.separator", "|");
        job.setJobName("work_charter_stb.ext_chtr_vod_fyi_mapped");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJarByClass(charterSelDistRec.class);
        job.waitForCompletion(true);
    }
}
但输出文件仍然有DUP。请务必告诉我哪里错了。第一行有两条记录,第二行有一条记录。在地图中读取完后,您将基于|进行拆分,但我可以看到,您的行(实体)是由空格分隔的。只需验证实际数据是否如此。传统的格式是,将每一行(实体)放在一行中,map reduce在映射阶段后过滤唯一的键。一旦您的输入是这种格式,您在reducer中得到的就是唯一的键
如果你的输入有任何不同(如同一行中的2条以上记录),则需要考虑不同的输入格式,或者不同地处理逻辑。了解map reduce的工作原理及其所采用的格式将对您有更多帮助。快乐学习
不必那么复杂。你所要做的就是：
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class charterSelDistRec {
public static class Map extends Mapper <LongWritable, Text, Text, Text> {
private String tableKey,tableValue;
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String splitarray[] = line.split("\\|",2);
tableKey = splitarray[0].trim();
tableValue = splitarray[1].trim();
context.write(new Text(tableKey), new Text(tableValue));
}
}
public static class Reduce extends Reducer <Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values, Context context)
throws IOException, InterruptedException {
String ColumnDelim="";
String tableOutValue=ColumnDelim+values;
context.write(new Text(key), new Text(tableOutValue));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf,"charterSelDistRec");
job.getConfiguration().set("mapreduce.job.queuename", "root.Dev");
job.getConfiguration().set("mapreduce.output.textoutputformat.separator","|");
job.setJobName("work_charter_stb.ext_chtr_vod_fyi_mapped");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJarByClass(charterSelDistRec.class);
job.waitForCompletion(true);
}
}
10001|76884|1995-06-24|1996-06-23
10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24
代码如下:
10001|76884|1995-06-24|1996-06-23
10001|76884|1995-06-24|1996-06-23
10001|75286|1993-06-24|1994-06-24
public class StackRemoveDup {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        public void map(LongWritable ignore, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = new Job();
        job.setJarByClass(StackRemoveDup.class);
        job.setJobName("StackRemoveDup");
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.waitForCompletion(true);
    }
}
以下是输出:
/**
 * Removes duplicate lines by using each whole input line as the map output key:
 * MapReduce's shuffle groups identical lines, and the reducer emits each key once.
 */
public class StackRemoveDup {

    /** Forwards the entire line as the key; NullWritable carries no payload. */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        public void map(LongWritable ignore, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    /** Each distinct line arrives exactly once as a key; write it straight out. */
    public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    /** Wires up the job: default input/output formats, Text keys, no values. */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = new Job();
        job.setJobName("StackRemoveDup");
        job.setJarByClass(StackRemoveDup.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
10001|75286|1993-06-24|1994-06-24
10001|76884|1995-06-24|1996-06-23
它不必如此复杂。你所要做的就是:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
 * Deduplicates input lines: the mapper keys on the whole line so identical
 * records group together, and the reducer emits one representative per group.
 */
public class charterSelDistRec {

    /** Uses the entire record as the key so duplicates collapse in the shuffle. */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable ignore, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, value);
        }
    }

    /** Writes each distinct record once; the break skips the duplicate copies. */
    public static class MyReducer extends Reducer<Text, Text, Text, NullWritable> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(value, NullWritable.get());
                break;
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "charterSelDistRec");
        job.getConfiguration().set("mapreduce.job.queuename", "root.Dev");
        job.getConfiguration().set("mapreduce.output.textoutputformat.separator", "|");
        job.setJobName("work_charter_stb.ext_chtr_vod_fyi_mapped");
        // Map output (Text, Text) differs from the job output (Text, NullWritable),
        // so the map-side classes must be declared explicitly or the shuffle
        // fails with a type mismatch at runtime.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // BUG FIX: originally Map.class / Reduce.class — `Map` resolves to
        // java.util.Map (wrong type) and no class named `Reduce` exists here;
        // the mapper/reducer defined above are MyMapper and MyReducer.
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJarByClass(charterSelDistRec.class);
        job.waitForCompletion(true);
    }
}