使用mapreduce在hadoop中进行文件比较 A.txt B.txt 输出文件

使用mapreduce在hadoop中进行文件比较 A.txt B.txt 输出文件,hadoop,mapreduce,reduce,Hadoop,Mapreduce,Reduce,1) col1和col2是本文档中的主键,如果任何键发生更改,我们将显示两条记录,如 in A.txt contain 1st Records:- A 120 140 160 180 in B.txt contain 1st Records:- A 110 140 160 180 在本例中,col2已更改,因此我必须显示两条记录 2) 如果两个文件上的记录没有变化(我的意思是看起来一样),我们只能显示一条

1) col1和col2是本文档中的主键,如果任何键发生更改,我们将显示两条记录,如

in A.txt contain 1st Records:- A          120      140     160     180
in B.txt contain 1st Records:- A          110      140     160     180 
在本例中,col2已更改,因此我必须显示两条记录

2) 如果两个文件上的记录没有变化(我的意思是看起来一样),我们只能显示一条记录

3) 在这两个文件中显示所有其他记录

最终输出应该如下所示

输出文件
使用 PIG:加载两个文件,用 UNION 合并记录,再用 DISTINCT 去重

A = LOAD 'A.txt' USING PigStorage('\t');
B = LOAD 'B.txt' USING PigStorage('\t');
C = UNION A,B;
D = DISTINCT C;
DUMP D;

使用 PIG:加载两个文件,用 UNION 合并记录,再用 DISTINCT 去重

A = LOAD 'A.txt' USING PigStorage('\t');
B = LOAD 'B.txt' USING PigStorage('\t');
C = UNION A,B;
D = DISTINCT C;
DUMP D;

以下是
mapreduce
解决方案:
将两个或多个文件放在一个目录中(输入-
arg1
),它会把目录下所有文件合并成一个满足上述全部要求的输出文件。对于同一个键(col1+col2),col3 到末列不一致(不匹配)的记录会分别保留输出。更多细节请参阅代码注释

/**
 * Merges the records of all files in the input directory (arg 0) into one
 * output (arg 1). Records are keyed by (col1, col2); identical records are
 * emitted once, while records whose tail columns (col3..last) differ are
 * emitted once per distinct version.
 */
public class FileCompare extends Configured implements Tool {

    /**
     * Emits (col1,col2) as the key and the comma-joined remaining columns
     * (col3..last) as the value, so all versions of a record meet at one
     * reducer key.
     */
    public static class FileComapreMapper extends Mapper<Object, Text, Text, Text> {
        // Line counter local to this mapper instance.
        // NOTE(review): this skips the first line of every input split, not
        // just a per-file header — confirm each file maps to a single split.
        int lineno = 0;
        // Reused output objects to avoid a per-record allocation.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            try {
                lineno++;
                System.out.println(lineno + " -> " + value);
                // Skip the header row; comment this line OUT to keep the
                // header in the output (the original comment said the reverse).
                if (lineno == 1) return;

                // Input records are assumed whitespace separated.
                String[] fields = value.toString().split("\\s+");
                if (fields.length < 2) {
                    // Not enough columns to form the (col1,col2) key.
                    System.err.println("Invalid data at line: " + lineno
                            + " (expected at least 2 columns)");
                    return;
                }
                String col1col2 = fields[0] + "," + fields[1]; // composite key

                // Join col3..last with commas. Building via StringBuilder with
                // a leading-delimiter guard also handles the 2-column case,
                // which the original substring(0, len-1) dropped by throwing.
                StringBuilder tail = new StringBuilder();
                for (int i = 2; i < fields.length; i++) {
                    if (tail.length() > 0) tail.append(',');
                    tail.append(fields[i]);
                }

                outKey.set(col1col2);
                outValue.set(tail.toString());
                context.write(outKey, outValue); // key/value pairs to reducer
            } catch (Exception e) {
                // Best-effort: log the bad record and keep processing.
                System.err.println("Invalid data at line: " + lineno
                        + " Error: " + e.getMessage());
            }
        }
    }

    /**
     * De-duplicates the tail values per (col1,col2) key and writes
     * tab-delimited records: one output line per distinct version.
     */
    public static class FileComapreReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Copy each value to a String: Hadoop REUSES the same Text
            // instance across the values iterator, so adding the Text objects
            // themselves (as the original did) stores one mutating object and
            // corrupts the HashSet.
            Set<String> uniqueCol3tolast = new HashSet<String>();
            for (Text record : values)
                uniqueCol3tolast.add(record.toString());

            // Emit tab-delimited records; the comma-joined key/value are
            // converted back to tabs here. Key conversion hoisted out of the loop.
            String tabKey = key.toString().replaceAll(",", "\t");
            for (String col3tolast : uniqueCol3tolast)
                context.write(new Text(tabKey),
                        new Text(col3tolast.replaceAll(",", "\t")));
        }
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new FileCompare(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the job. args[0] = input directory, args[1] = output
     * path (deleted first if it already exists, so reruns don't fail).
     */
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: <in> <out>");
            return 2; // let main() report the exit code instead of exiting here
        }
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, "merge-two-files");
        job.setJarByClass(FileCompare.class);
        job.setMapperClass(FileComapreMapper.class);
        job.setReducerClass(FileComapreReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Remove a pre-existing output path so the job can be re-run.
        Path dstFilePath = new Path(args[1]);
        try {
            FileSystem fs = dstFilePath.getFileSystem(conf);
            if (fs.exists(dstFilePath))
                fs.delete(dstFilePath, true);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
公共类文件比较扩展配置的实现工具{
公共静态类FileComaMapper扩展映射器{
int lineno=0;
公共void映射(对象键、文本值、上下文上下文)引发IOException、InterruptedException{
试一试{
lineno++;
系统输出打印项次(行号+“->”+值);
//跳过标题-取消对此行的注释以将标题包含在输出中
如果(lineno==1)返回;
String[]fields=value.toString().split(\\s+);//假设输入记录用空格分隔
字符串col1_col2=字段[0]+,“+字段[1];//键
字符串col3tolast=“”;
for(int i=2;i

这里是
mapreduce
解决方案:
将两个或多个文件放在一个目录中(输入-
arg1
),它会把目录下所有文件合并成一个满足上述全部要求的输出文件。对于同一个键(col1+col2),col3 到末列不一致(不匹配)的记录会分别保留输出。更多细节请参阅代码注释

/**
 * Merges the records of all files in the input directory (arg 0) into one
 * output (arg 1). Records are keyed by (col1, col2); identical records are
 * emitted once, while records whose tail columns (col3..last) differ are
 * emitted once per distinct version.
 */
public class FileCompare extends Configured implements Tool {

    /**
     * Emits (col1,col2) as the key and the comma-joined remaining columns
     * (col3..last) as the value, so all versions of a record meet at one
     * reducer key.
     */
    public static class FileComapreMapper extends Mapper<Object, Text, Text, Text> {
        // Line counter local to this mapper instance.
        // NOTE(review): this skips the first line of every input split, not
        // just a per-file header — confirm each file maps to a single split.
        int lineno = 0;
        // Reused output objects to avoid a per-record allocation.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            try {
                lineno++;
                System.out.println(lineno + " -> " + value);
                // Skip the header row; comment this line OUT to keep the
                // header in the output (the original comment said the reverse).
                if (lineno == 1) return;

                // Input records are assumed whitespace separated.
                String[] fields = value.toString().split("\\s+");
                if (fields.length < 2) {
                    // Not enough columns to form the (col1,col2) key.
                    System.err.println("Invalid data at line: " + lineno
                            + " (expected at least 2 columns)");
                    return;
                }
                String col1col2 = fields[0] + "," + fields[1]; // composite key

                // Join col3..last with commas. Building via StringBuilder with
                // a leading-delimiter guard also handles the 2-column case,
                // which the original substring(0, len-1) dropped by throwing.
                StringBuilder tail = new StringBuilder();
                for (int i = 2; i < fields.length; i++) {
                    if (tail.length() > 0) tail.append(',');
                    tail.append(fields[i]);
                }

                outKey.set(col1col2);
                outValue.set(tail.toString());
                context.write(outKey, outValue); // key/value pairs to reducer
            } catch (Exception e) {
                // Best-effort: log the bad record and keep processing.
                System.err.println("Invalid data at line: " + lineno
                        + " Error: " + e.getMessage());
            }
        }
    }

    /**
     * De-duplicates the tail values per (col1,col2) key and writes
     * tab-delimited records: one output line per distinct version.
     */
    public static class FileComapreReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Copy each value to a String: Hadoop REUSES the same Text
            // instance across the values iterator, so adding the Text objects
            // themselves (as the original did) stores one mutating object and
            // corrupts the HashSet.
            Set<String> uniqueCol3tolast = new HashSet<String>();
            for (Text record : values)
                uniqueCol3tolast.add(record.toString());

            // Emit tab-delimited records; the comma-joined key/value are
            // converted back to tabs here. Key conversion hoisted out of the loop.
            String tabKey = key.toString().replaceAll(",", "\t");
            for (String col3tolast : uniqueCol3tolast)
                context.write(new Text(tabKey),
                        new Text(col3tolast.replaceAll(",", "\t")));
        }
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new FileCompare(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the job. args[0] = input directory, args[1] = output
     * path (deleted first if it already exists, so reruns don't fail).
     */
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: <in> <out>");
            return 2; // let main() report the exit code instead of exiting here
        }
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, "merge-two-files");
        job.setJarByClass(FileCompare.class);
        job.setMapperClass(FileComapreMapper.class);
        job.setReducerClass(FileComapreReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Remove a pre-existing output path so the job can be re-run.
        Path dstFilePath = new Path(args[1]);
        try {
            FileSystem fs = dstFilePath.getFileSystem(conf);
            if (fs.exists(dstFilePath))
                fs.delete(dstFilePath, true);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
公共类文件比较扩展配置的实现工具{
公共静态类FileComaMapper扩展映射器{
int lineno=0;
公共void映射(对象键、文本值、上下文上下文)引发IOException、InterruptedException{
试一试{
lineno++;
系统输出打印项次(行号+“->”+值);
//跳过标题-取消对此行的注释以将标题包含在输出中
如果(lineno==1)返回;
String[]fields=value.toString().split(\\s+);//假设输入记录用空格分隔
字符串col1_col2=字段[0]+,“+字段[1];//键
字符串col3tolast=“”;
for(int i=2;iA = LOAD 'A.txt' USING PigStorage('\t');
B = LOAD 'B.txt' USING PigStorage('\t');
C = UNION A,B;
D = DISTINCT C;
DUMP D;
/**
 * Merges the records of all files in the input directory (arg 0) into one
 * output (arg 1). Records are keyed by (col1, col2); identical records are
 * emitted once, while records whose tail columns (col3..last) differ are
 * emitted once per distinct version.
 */
public class FileCompare extends Configured implements Tool {

    /**
     * Emits (col1,col2) as the key and the comma-joined remaining columns
     * (col3..last) as the value, so all versions of a record meet at one
     * reducer key.
     */
    public static class FileComapreMapper extends Mapper<Object, Text, Text, Text> {
        // Line counter local to this mapper instance.
        // NOTE(review): this skips the first line of every input split, not
        // just a per-file header — confirm each file maps to a single split.
        int lineno = 0;
        // Reused output objects to avoid a per-record allocation.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            try {
                lineno++;
                System.out.println(lineno + " -> " + value);
                // Skip the header row; comment this line OUT to keep the
                // header in the output (the original comment said the reverse).
                if (lineno == 1) return;

                // Input records are assumed whitespace separated.
                String[] fields = value.toString().split("\\s+");
                if (fields.length < 2) {
                    // Not enough columns to form the (col1,col2) key.
                    System.err.println("Invalid data at line: " + lineno
                            + " (expected at least 2 columns)");
                    return;
                }
                String col1col2 = fields[0] + "," + fields[1]; // composite key

                // Join col3..last with commas. Building via StringBuilder with
                // a leading-delimiter guard also handles the 2-column case,
                // which the original substring(0, len-1) dropped by throwing.
                StringBuilder tail = new StringBuilder();
                for (int i = 2; i < fields.length; i++) {
                    if (tail.length() > 0) tail.append(',');
                    tail.append(fields[i]);
                }

                outKey.set(col1col2);
                outValue.set(tail.toString());
                context.write(outKey, outValue); // key/value pairs to reducer
            } catch (Exception e) {
                // Best-effort: log the bad record and keep processing.
                System.err.println("Invalid data at line: " + lineno
                        + " Error: " + e.getMessage());
            }
        }
    }

    /**
     * De-duplicates the tail values per (col1,col2) key and writes
     * tab-delimited records: one output line per distinct version.
     */
    public static class FileComapreReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Copy each value to a String: Hadoop REUSES the same Text
            // instance across the values iterator, so adding the Text objects
            // themselves (as the original did) stores one mutating object and
            // corrupts the HashSet.
            Set<String> uniqueCol3tolast = new HashSet<String>();
            for (Text record : values)
                uniqueCol3tolast.add(record.toString());

            // Emit tab-delimited records; the comma-joined key/value are
            // converted back to tabs here. Key conversion hoisted out of the loop.
            String tabKey = key.toString().replaceAll(",", "\t");
            for (String col3tolast : uniqueCol3tolast)
                context.write(new Text(tabKey),
                        new Text(col3tolast.replaceAll(",", "\t")));
        }
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new FileCompare(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the job. args[0] = input directory, args[1] = output
     * path (deleted first if it already exists, so reruns don't fail).
     */
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: <in> <out>");
            return 2; // let main() report the exit code instead of exiting here
        }
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, "merge-two-files");
        job.setJarByClass(FileCompare.class);
        job.setMapperClass(FileComapreMapper.class);
        job.setReducerClass(FileComapreReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Remove a pre-existing output path so the job can be re-run.
        Path dstFilePath = new Path(args[1]);
        try {
            FileSystem fs = dstFilePath.getFileSystem(conf);
            if (fs.exists(dstFilePath))
                fs.delete(dstFilePath, true);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        return job.waitForCompletion(true) ? 0 : 1;
    }
}