使用hadoop map reduce比较两个文本文件

使用 Hadoop MapReduce 比较两个文本文件(标签:hadoop、mapreduce、hadoop2)。问题:我想逐行比较两个文本文件,看看它们是否相等,如何使用 Hadoop MapReduce 编程实现?(提问者的 map 代码片段见下文。)

我想逐行比较两个文本文件,看看它们是否相等。如何使用hadoop map reduce编程实现这一点

// Per-task line counter. NOTE: a static counter is reset per mapper JVM and
// each input split gets its own task, so this is NOT a reliable global line
// number across a whole file — the byte-offset key is the dependable handle.
static int i=0;

/**
 * Emits each input line keyed by the line text, valued by a (per-task) line number.
 *
 * Fixes vs. the original snippet: Hadoop's old-API mapper receives the line as a
 * {@code Text}, not a {@code String}, and the {@code OutputCollector} key type must
 * be a {@code Writable} — {@code String} is not — so the line is wrapped in a
 * {@code Text} before collecting.
 *
 * @param key      byte offset of the line within the input split
 * @param value    the line contents
 * @param output   collector receiving (line, lineNumber) pairs
 * @param reporter progress reporter (unused here)
 * @throws IOException if the collector fails
 */
public void map(LongWritable key, Text value, OutputCollector<Text,IntWritable> output, Reporter reporter) throws IOException {
      String line = value.toString();
      i++; // used as a line number (valid only within this mapper task)
      output.collect(new Text(line), new IntWritable(i));
 }
static int i=0;
公共void映射(LongWritable键、字符串值、OutputCollector输出、Reporter报告器)引发IOException{
字符串行=value.toString();
i++;//用作行号
collect(行,新的intwriteable(i));
}

我尝试用行号映射每一行。但如何减少行号并与另一个文件进行比较?

比较两个文本文件相当于在 MapReduce 编程中连接(join)两个文件。要连接两个文本文件,必须使用两个输出相同键的映射器。在您的情况下,可以把行的字节偏移量用作键,把行内容用作值。MultipleInputs 方法用于为多个文本文件配置多个映射器。

请在下面找到使用JAVA在map reduce编程中比较两个文本文件的详细程序

程序的参数是文件1、文件2和输出文件

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Compares two text files line-by-line by joining them on the line's byte
 * offset: each file is read by its own identity mapper, and the reducer sees
 * both lines that start at the same offset grouped under one key.
 *
 * Usage: CompareTwoFiles &lt;file1&gt; &lt;file2&gt; &lt;output dir&gt;
 *
 * Caveat: the byte-offset join is only exact when matching lines start at the
 * same offset in both files; once the files diverge in length, later offsets
 * no longer pair corresponding lines.
 */
public class CompareTwoFiles {

    /** Identity mapper for the first file: emits (byte offset, line) unchanged. */
    public static class Map extends
            Mapper<LongWritable, Text, LongWritable, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    /**
     * Identity mapper for the second file. Behaviorally identical to {@link Map};
     * kept as a separate class so MultipleInputs can bind one mapper per input path.
     */
    public static class Map2 extends
            Mapper<LongWritable, Text, LongWritable, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    /**
     * Receives all lines sharing a byte offset and reports "same" or the
     * differing pair.
     *
     * Robustness fix: the original indexed a fixed String[2], which threw
     * ArrayIndexOutOfBoundsException when an offset carried more than two
     * values and printed a confusing "... vs null" when one file was shorter.
     */
    public static class Reduce extends
            Reducer<LongWritable, Text, LongWritable, Text> {

        @Override
        public void reduce(LongWritable key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            List<String> lines = new ArrayList<>();
            for (Text text : values) {
                lines.add(text.toString());
            }
            if (lines.size() == 2) {
                if (lines.get(0).equals(lines.get(1))) {
                    context.write(key, new Text("same"));
                } else {
                    context.write(key,
                            new Text(lines.get(0) + "     vs    " + lines.get(1)));
                }
            } else if (lines.size() == 1) {
                // Offset present in only one file (files differ in length).
                context.write(key,
                        new Text(lines.get(0) + "     vs    <missing>"));
            } else {
                // Defensive: should not happen with exactly two input files.
                context.write(key,
                        new Text("unexpected value count: " + lines.size()));
            }
        }
    }

    public static void main(String[] args) throws Exception {

        if (args.length < 3) {
            System.err.println("Usage: CompareTwoFiles <file1> <file2> <output dir>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // fs.default.name is deprecated; fs.defaultFS is the current key.
        conf.set("fs.defaultFS", "hdfs://localhost:8020");
        // new Job(conf) is deprecated; use the static factory instead.
        Job job = Job.getInstance(conf,
                "Compare Two Files and Identify the Difference");
        job.setJarByClass(CompareTwoFiles.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        MultipleInputs.addInputPath(job, new Path(args[0]),
                TextInputFormat.class, Map.class);
        MultipleInputs.addInputPath(job, new Path(args[1]),
                TextInputFormat.class, Map2.class);
        // Propagate the job's success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
(以下为上述程序的中文注释版,已修复机器翻译导致的代码乱码:)

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompareTwoFiles {

    // 第一个文件的映射器:原样输出 (字节偏移量, 行内容)
    public static class Map extends
            Mapper<LongWritable, Text, LongWritable, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    // 第二个文件的映射器:与 Map 相同,供 MultipleInputs 分别绑定
    public static class Map2 extends
            Mapper<LongWritable, Text, LongWritable, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    // 归约器:对同一偏移量下的两行进行比较
    public static class Reduce extends
            Reducer<LongWritable, Text, LongWritable, Text> {

        @Override
        public void reduce(LongWritable key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            String[] lines = new String[2];
            int i = 0;
            for (Text text : values) {
                lines[i] = text.toString();
                i++;
            }
            if (lines[0].equals(lines[1])) {
                context.write(key, new Text("same"));
            } else {
                context.write(key,
                        new Text(lines[0] + "     vs    " + lines[1]));
            }

        }

    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://localhost:8020");
        Job job = new Job(conf);
        job.setJarByClass(CompareTwoFiles.class);
        job.setJobName("Compare Two Files and Identify the Difference");
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        MultipleInputs.addInputPath(job, new Path(args[0]),
                TextInputFormat.class, Map.class);
        MultipleInputs.addInputPath(job, new Path(args[1]),
                TextInputFormat.class, Map2.class);
        job.waitForCompletion(true);

    }

}

我试图仿照单词计数(word count)示例编写 MapReduce 代码,但不知道如何比较来自两个不同文件的两行。通过在 Google 中搜索,我了解到需要使用两个输出相同键(公共键)的 mapper 类,但不知道具体如何实现。以下链接对我很有帮助。