Java Hadoop MapReduce: sort reduce output using the key
Below is a map reduce program that counts the words of several text files. My aim is to have the result in descending order regarding the number of appearances. Unfortunately, the program sorts the output lexicographically by the key; I want the natural order of the integer values instead.

So I added a custom comparator with job.setSortComparatorClass(IntComparator.class). But this does not work as expected. I get the following exception:
java.lang.Exception: java.nio.BufferUnderflowException
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:404)
Caused by: java.nio.BufferUnderflowException
at java.nio.Buffer.nextGetIndex(Buffer.java:498)
at java.nio.HeapByteBuffer.getInt(HeapByteBuffer.java:355)
at WordCount$IntComparator.compare(WordCount.java:128)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.compare(MapTask.java:987)
at org.apache.hadoop.util.QuickSort.sortInternal(QuickSort.java:100)
at org.apache.hadoop.util.QuickSort.sort(QuickSort.java:64)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1277)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1174)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:609)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:675)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:266)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
at java.util.concurrent.FutureTask.run(FutureTask.java:166)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:722)
Any help would be appreciated! :)

I have listed the whole program below, because there might be a reason for the exception that I am obviously not aware of. As you can see, I am using the new MapReduce API (org.apache.hadoop.mapreduce.*).
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Counts the words in several text files.
 */
public class WordCount {

    /**
     * Maps lines of text to (word, amount) pairs.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Text word = new Text();
        private IntWritable amount = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String textLine = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(textLine);
            while (tokenizer.hasMoreElements()) {
                word.set((String) tokenizer.nextElement());
                context.write(word, amount);
            }
        }
    }

    /**
     * Reduces (word, amount) pairs to (amount, word) list.
     */
    public static class Reduce extends
            Reducer<Text, IntWritable, IntWritable, Text> {

        private IntWritable amount = new IntWritable();
        private int sum;

        @Override
        protected void reduce(Text key, Iterable<IntWritable> valueList,
                Context context) throws IOException, InterruptedException {
            sum = 0;
            for (IntWritable value : valueList) {
                sum += value.get();
            }
            amount.set(sum);
            context.write(amount, key);
        }
    }

    public static class IntComparator extends WritableComparator {

        public IntComparator() {
            super(IntWritable.class);
        }

        private Integer int1;
        private Integer int2;

        @Override
        public int compare(byte[] raw1, int offset1, int length1, byte[] raw2,
                int offset2, int length2) {
            int1 = ByteBuffer.wrap(raw1, offset1, length1).getInt();
            int2 = ByteBuffer.wrap(raw2, offset2, length2).getInt();
            return int2.compareTo(int1);
        }
    }

    /**
     * Job configuration.
     *
     * @param args
     * @throws IOException
     * @throws ClassNotFoundException
     * @throws InterruptedException
     */
    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        Configuration configuration = new Configuration();
        configuration.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
        Job job = new Job(configuration);
        job.setJobName("WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setSortComparatorClass(IntComparator.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileSystem.get(configuration).delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
    }
}
The comparator step takes place between the mapper and the reducer, which won't work for you, as you swap the key and value around in the reducer itself. The default WritableComparator would normally handle your numerical ordering if the key were an IntWritable, except that here it gets a Text key, thus resulting in lexicographic ordering.

As to exactly why the output at the end is not sorted by the IntWritable key you write out, I am not sure. Perhaps it has something to do with the way TextOutputFormat works? You might have to dig deeper into the TextOutputFormat source code for clues on that, but in short, I'm afraid setting the sort comparator won't help you here.
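That is also what the stack trace is telling you: your IntComparator is applied to the map output keys, which are Text values. A Text serializes as a variable-length length prefix followed by the UTF-8 bytes, so a short word may occupy fewer than four bytes and ByteBuffer.getInt() underflows. For reference, if the map output key really were an IntWritable, a raw comparator could read the two serialized ints directly. Here is a minimal sketch of a descending variant (the class name is mine, not from your code):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Sketch: descending raw comparator for IntWritable keys. Only valid when
 * the serialized map output key really is an IntWritable.
 */
public class DescendingIntComparator extends WritableComparator {

    public DescendingIntComparator() {
        super(IntWritable.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // IntWritable serializes as a single big-endian 4-byte int, so
        // reading an int at the offset is safe here; a Text key is not
        // laid out like this, which is what triggers the
        // BufferUnderflowException in your version.
        int left = readInt(b1, s1);
        int right = readInt(b2, s2);
        return -Integer.compare(left, right); // negate for descending order
    }
}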
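If you do want the final output ordered by count, one well-known pattern is to chain a second job over the first job's output, so that the count really is the map output key and the shuffle sort works for you. Below is a minimal sketch under these assumptions: the first job's output is the default TextOutputFormat, i.e. lines of the form count<TAB>word as written by your Reduce class, and the class names SortByCount, SwapMapper and SwapReducer are illustrative, not from your code:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/** Sketch: second job that sorts (count, word) lines by count, descending. */
public class SortByCount {

    /** Parses "count TAB word" lines and emits (count, word) pairs. */
    public static class SwapMapper
            extends Mapper<LongWritable, Text, IntWritable, Text> {

        private final IntWritable count = new IntWritable();
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            count.set(Integer.parseInt(parts[0]));
            word.set(parts[1]);
            context.write(count, word); // the count becomes the sort key
        }
    }

    /** Writes each (count, word) pair back out, now in sorted key order. */
    public static class SwapReducer
            extends Reducer<IntWritable, Text, IntWritable, Text> {

        @Override
        protected void reduce(IntWritable key, Iterable<Text> words,
                Context context) throws IOException, InterruptedException {
            for (Text word : words) {
                context.write(key, word);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration());
        job.setJobName("SortByCount");
        job.setJarByClass(SortByCount.class);
        job.setMapperClass(SwapMapper.class);
        job.setReducerClass(SwapReducer.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        job.setSortComparatorClass(DescendingIntComparator.class);
        job.setNumReduceTasks(1); // single reducer => globally sorted output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}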