Java Hadoop MapReduce: sorting the reduce output by key


Below is a MapReduce program that counts the words in several text files. My goal is to get the results in descending order of their number of occurrences.

Unfortunately, the program sorts the output lexicographically by key. I want the natural ordering of the integer values instead.

So I added a custom comparator with job.setSortComparatorClass(IntComparator.class). But this does not work as expected; I get the following exception:

java.lang.Exception: java.nio.BufferUnderflowException
    at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:404)
Caused by: java.nio.BufferUnderflowException
    at java.nio.Buffer.nextGetIndex(Buffer.java:498)
    at java.nio.HeapByteBuffer.getInt(HeapByteBuffer.java:355)
    at WordCount$IntComparator.compare(WordCount.java:128)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.compare(MapTask.java:987)
    at org.apache.hadoop.util.QuickSort.sortInternal(QuickSort.java:100)
    at org.apache.hadoop.util.QuickSort.sort(QuickSort.java:64)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1277)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1174)
    at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:609)
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:675)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
    at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:266)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
    at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
    at java.util.concurrent.FutureTask.run(FutureTask.java:166)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:722)
Any help would be appreciated! :)

I have listed the whole program below, since there may be a cause for the exception that I am obviously not aware of. As you can see, I am using the new MapReduce API (org.apache.hadoop.mapreduce.*).

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Counts the words in several text files.
 */
public class WordCount {
  /**
   * Maps lines of text to (word, amount) pairs.
   */
  public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Text word = new Text();
    private IntWritable amount = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String textLine = value.toString();

      StringTokenizer tokenizer = new StringTokenizer(textLine);
      while (tokenizer.hasMoreElements()) {
        word.set((String) tokenizer.nextElement());

        context.write(word, amount);
      }
    }

  }

  /**
   * Reduces (word, amount) pairs to (amount, word) list.
   */
  public static class Reduce extends
      Reducer<Text, IntWritable, IntWritable, Text> {

    private IntWritable amount = new IntWritable();
    private int sum;

    @Override
    protected void reduce(Text key, Iterable<IntWritable> valueList,
        Context context) throws IOException, InterruptedException {
      sum = 0;

      for (IntWritable value : valueList) {
        sum += value.get();
      }

      amount.set(sum);
      context.write(amount, key);
    }
  }

  public static class IntComparator extends WritableComparator {
    public IntComparator() {
      super(IntWritable.class);
    }

    private Integer int1;
    private Integer int2;

    @Override
    public int compare(byte[] raw1, int offset1, int length1, byte[] raw2,
        int offset2, int length2) {
      int1 = ByteBuffer.wrap(raw1, offset1, length1).getInt();
      int2 = ByteBuffer.wrap(raw2, offset2, length2).getInt();

      return int2.compareTo(int1);
    }

  }

  /**
   * Job configuration.
   * 
   * @param args
   * @throws IOException
   * @throws ClassNotFoundException
   * @throws InterruptedException
   */
  public static void main(String[] args) throws IOException,
      ClassNotFoundException, InterruptedException {
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);

    Configuration configuration = new Configuration();
    configuration.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    Job job = new Job(configuration);
    job.setJobName("WordCount");
    job.setJarByClass(WordCount.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setSortComparatorClass(IntComparator.class);

    FileInputFormat.setInputPaths(job, inputPath);

    FileSystem.get(configuration).delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
  }
}

The comparator step happens between the Mapper and the Reducer, so it can't do what you want here: you only swap the keys and values around in the Reducer itself, after the sort has already run.

The default WritableComparator would normally handle the numerical ordering for you if the key were an IntWritable, but it is getting a Text key, which results in lexicographic ordering. That is also why your compare method throws: the raw bytes it receives are a serialized Text (a variable-length length prefix followed by UTF-8 bytes), so for short words there are fewer than the four bytes getInt expects, hence the BufferUnderflowException.
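
For reference, a raw comparator that sorts IntWritable keys in descending order could look like the sketch below. It only becomes applicable once the map output key actually is an IntWritable; it slots into the same file, since WritableComparator and IntWritable are already imported:

public static class DescendingIntComparator extends WritableComparator {

  public DescendingIntComparator() {
    super(IntWritable.class);
  }

  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    // An IntWritable serializes to exactly four big-endian bytes, so
    // WritableComparator.readInt is safe here -- unlike on a serialized
    // Text key, which starts with a variable-length length prefix.
    int left = readInt(b1, s1);
    int right = readInt(b2, s2);
    return Integer.compare(right, left); // arguments reversed: descending
  }
}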


As for exactly why the final output is not sorted by the IntWritable key you write out, I am not sure. Perhaps it has something to do with how TextOutputFormat works? You may have to dig into the TextOutputFormat source for clues on that, but in short, I'm afraid setting the sort comparator won't help you here.
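
If you do need the final listing ordered by count, the usual pattern is a second, sort-only job: have the first job write its (word, count) pairs as a SequenceFile, swap them to (count, word) with InverseMapper, and sort those IntWritable keys with a descending comparator. A minimal sketch, assuming the DescendingIntComparator above, a first job switched to SequenceFileOutputFormat with unswapped (Text, IntWritable) reduce output, and a hypothetical sortedPath output directory:

// First job changes:
//   job.setOutputFormatClass(SequenceFileOutputFormat.class);
//   job.setOutputKeyClass(Text.class);
//   job.setOutputValueClass(IntWritable.class);
// and Reduce writes context.write(key, amount) instead of swapping.

Job sortJob = new Job(configuration);
sortJob.setJobName("WordCountSort");
sortJob.setJarByClass(WordCount.class);

// Read the (Text, IntWritable) pairs the first job produced.
sortJob.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(sortJob, outputPath);

// InverseMapper swaps each pair, emitting (count, word).
sortJob.setMapperClass(InverseMapper.class);
sortJob.setOutputKeyClass(IntWritable.class);
sortJob.setOutputValueClass(Text.class);

// Now the map output key really is an IntWritable, so the raw
// comparator sees four-byte integers and can sort them descending.
sortJob.setSortComparatorClass(DescendingIntComparator.class);

// A single (identity) reducer yields one globally ordered output file.
sortJob.setNumReduceTasks(1);
FileOutputFormat.setOutputPath(sortJob, sortedPath);

sortJob.waitForCompletion(true);

(InverseMapper lives in org.apache.hadoop.mapreduce.lib.map; SequenceFileInputFormat and SequenceFileOutputFormat live in org.apache.hadoop.mapreduce.lib.input and .lib.output respectively.)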
