Java 如何找到HDFS中文件之间的对称差异？_Java_Hadoop_Hdfs

Java 如何找到HDFS中文件之间的对称差异？

java hadoop

Java 如何找到HDFS中文件之间的对称差异？,java,hadoop,hdfs,Java,Hadoop,Hdfs,我有两个hdfs文件：/my/path/in/hdfs/part-r-*（大约1000个部分，每个约10000行）和/my/other/path/in/hdfs/part-r-*（大小相同）。第一个文件包含以下格式的数据： id1 111 id6 212 id3 984 等等。第二个是： 999 id8 15 id4 93 id1 我想在第一个文件中找到第二个文件中未出现的所有id，反之亦然。有什么简单的方法可以做到吗？我必须承认，我怀疑这种计算是否适合MapReduce的严格范例，仅仅是

我有两个hdfs文件：/my/path/in/hdfs/part-r-*（大约1000个部分，每个约10000行）和/my/other/path/in/hdfs/part-r-*（大小相同）。第一个文件包含以下格式的数据：

id1 111
id6 212
id3 984

等等。第二个是：

999 id8
15 id4
93 id1

我想在第一个文件中找到第二个文件中未出现的所有id，反之亦然。有什么简单的方法可以做到吗？

我必须承认，我怀疑这种计算是否适合MapReduce的严格范例，仅仅是基于该过程的复杂性和计算量（尽管您说您案例中输入的两个文件大小相同），所以我认为这将是一个很好的案例，可以在保持简单的同时找到捷径

首先，为了减少额外的IO开销，您可以把这两个文件放在同一个目录中（为了简单起见，这里我们假设该目录为 input），从而避免配置多个输入路径的麻烦。在那之后，只需一个MapReduce作业，事情就变得容易多了

在映射阶段，您需要做的就是将两个文件中的ID设置为键，并将它们出现的“文件名”设置为值（这是一种稳妥的求对称差的方法，同时也兼顾了更一般的情况，即同一个ID可能在同一个文件中出现多次）。这些“文件名”实际上不需要是真实的文件名，您只需放置 "A" 和 "B" 字符串，以指示此特定行中的特定ID分别是在第一个还是第二个文件中找到的

在Reduce阶段，您可以将引用单个键/ID的所有值放入

HashSet

集合中，该集合只保留放入其中的唯一值。这意味着对于每次reduce调用（即每个ID），都会创建一个 HashSet；即使放入多个 "A" 和 "B" 字符串实例，每种也只会存储一份。因此：

仅在第一个文件中看到的ID将有一个
HashSet
集合，其中只有
"A"

仅在第二个文件中看到的ID将有一个
HashSet
集合，其中只有
B

在两个文件中都可以看到的ID将有一个
HashSet
集合，其中包含
"A"
和
B
（即两个文件的交集，这部分您不需要）

这样，您就可以简单地检查每个ID的
HashSet
，并只写前面两个选项中的那些，如上所述
这种类型的作业可以如下所示（这里的Reduce函数实际上不需要在输出的键值对中带有值，因此我只放置了一个空的 Text
字符串，以使事情更简单）： import org.apache.hadoop.conf.Configuration；导入org.apache.hadoop.fs.Path；导入org.apache.hadoop.io.Text；导入org.apache.hadoop.io.LongWritable；导入org.apache.hadoop.mapreduce.Job；导入org.apache.hadoop.mapreduce.Mapper；导入org.apache.hadoop.mapreduce.Reducer；导入org.apache.hadoop.mapreduce.lib.input.FileInputFormat；导入org.apache.hadoop.mapreduce.lib.output.FileOutputFormat；导入org.apache.hadoop.util.GenericOptionsParser；导入org.apache.hadoop.fs.FileSystem；导入org.apache.hadoop.fs.FileStatus；导入org.apache.hadoop.fs.FSDataOutputStream；导入org.apache.hadoop.mapreduce.lib.input.FileSplit；导入java.io.*；导入java.io.IOException；导入java.util.*；导入java.nio.charset.StandardCharset；公共类SymDiff { /*输入： *输出： */ 公共静态类映射扩展映射器 { 公共void映射（LongWritable键、文本值、上下文上下文）引发IOException、InterruptedException { String[]line=value.toString（）.split（“”；//将每行拆分为两列 //如果第一列由整数组成，则将第二列的ID作为键 //并将“B”设置为表示在第二个文件中找到了特定ID的值 //否则，将第一列中的ID作为键 //并将“A”设置为表示在第一个文件中找到了特定ID的值 if（第[0]行）。匹配（“\\d+”）/（以一种不引发异常的方式查看第一个字符串是否为int）编写（新文本（第[1]行），新文本（“B”）；其他的编写（新文本（第[0]行），新文本（“A”）； } } /*输入： *输出： */ 公共静态类Reduce扩展Reducer { 公共void reduce（文本键、Iterable值、上下文上下文）引发IOException、InterruptedException { HashSet list_of_files=新HashSet（）； //将每个ID的“A”和“B”的所有实例存储在具有唯一值的哈希集中用于（文本值：值）列出所有文件。添加（value.toString（））； //仅写入其值在集合中仅包含“A”或“B”（而不是两者）的ID if（文件列表.contains（“A”）和&！文件列表.contains（“B”）|（！文件列表.contains（“A”）和文件列表.contains（“B”）） context.write（键，新文本（“”）； } } 公共静态void main（字符串[]args）引发异常 { //在HDFS中设置输入和输出目录的路径路径输入\ u dir=新路径（“输入”）；路径输出_dir=新路径（“输出”）； //如果输出目录已存在，请将其删除 Configuration conf=新配置（）； FileSystem fs=FileSystem.get（conf）；如果（fs.存在（输出目录）） fs.delete（output_dir，true）； //配置MapReduce作业 Job Job=Job.getInstance（conf，“对称差”）； job.setJarByClass（SymDiff.class）； job.setMapperClass（映射。 import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; 
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import java.io.*; import java.io.IOException; import java.util.*; import java.nio.charset.StandardCharsets; public class SymDiff { /* input: <byte_offset, line_of_dataset> * output: <ID, file> */ public static class Map extends Mapper<LongWritable, Text, Text, Text> { public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] line = value.toString().split(" "); // split each line to two columns // if the first column consists of integers, put the ID from the 2nd column as the key // and set "B" as the value to imply that the particular ID was found on the second file // else, put the ID from the first column as the key // and set "A" as the value to imply that the particular ID was found on the first file if(line[0].matches("\\d+")) // (lazy way to see if the first string is an int without throwing an exception) context.write(new Text(line[1]), new Text("B")); else context.write(new Text(line[0]), new Text("A")); } } /* input: <ID, file> * output: <ID, ""> */ public static class Reduce extends Reducer<Text, Text, Text, Text> { public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { HashSet<String> list_of_files = new HashSet<String>(); // store all the instances of "A" and "B" for each ID in a HashSet with unique values for(Text value : values) list_of_files.add(value.toString()); // only write the IDs which they values only contain "A" or "B" (and not both) on their set if(list_of_files.contains("A") && !list_of_files.contains("B") || (!list_of_files.contains("A") && list_of_files.contains("B"))) 
context.write(key, new Text("")); } } public static void main(String[] args) throws Exception { // set the paths of the input and output directories in the HDFS Path input_dir = new Path("input"); Path output_dir = new Path("output"); // in case the output directory already exists, delete it Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); if(fs.exists(output_dir)) fs.delete(output_dir, true); // configure the MapReduce job Job job = Job.getInstance(conf, "Symmetric Difference"); job.setJarByClass(SymDiff.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, input_dir); FileOutputFormat.setOutputPath(job, output_dir); job.waitForCompletion(true); } }