Java MapReduce to produce the N most frequent words
I have a MapReduce program that prints the number of times each word appears in a document. I want to change this code so it produces the N most frequent words, where N is an input parameter. The code is as follows:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    public static void main(String[] args) throws Exception {
        Configuration c = new Configuration();
        String[] files = new GenericOptionsParser(c, args).getRemainingArgs();
        Path input = new Path(files[0]);
        Path output = new Path(files[1]);
        Job j = new Job(c, "wordcount");
        j.setJarByClass(WordCount.class);
        j.setMapperClass(MapForWordCount.class);
        j.setReducerClass(ReduceForWordCount.class);
        j.setOutputKeyClass(Text.class);
        j.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(j, input);
        FileOutputFormat.setOutputPath(j, output);
        System.exit(j.waitForCompletion(true) ? 0 : 1);
    }

    // Tokenizes each line and emits (WORD, 1) for every word longer than three characters.
    public static class MapForWordCount extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context con)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split("[^\\p{L}0-9]+");
            for (String word : words) {
                if (word.length() <= 3) {
                    continue;
                }
                Text outputKey = new Text(word.toUpperCase().trim());
                IntWritable outputValue = new IntWritable(1);
                con.write(outputKey, outputValue);
            }
        }
    }

    // Sums the 1s emitted for each word and writes (word, total).
    public static class ReduceForWordCount extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text word, Iterable<IntWritable> values, Context con)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            con.write(word, new IntWritable(sum));
        }
    }
}
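One way to get the top N without adding a second job (a sketch I am adding here, not part of the original post): keep the mapper as-is, but have the reducer buffer every (word, total) pair in memory and emit only the N largest in cleanup(). This assumes the distinct words fit in memory and that the driver calls j.setNumReduceTasks(1) so the top N is global; N is passed through the Configuration under the made-up key "topn.n" (e.g. c.setInt("topn.n", Integer.parseInt(files[2])) before the job is created). The class below would be a drop-in replacement for ReduceForWordCount inside WordCount and additionally needs java.util.ArrayList, java.util.List, java.util.Map and java.util.TreeMap imports.

// Sketch: buffers all (word, total) pairs, emits only the N most frequent.
// Assumes a single reducer and that "topn.n" was set in the driver.
public static class TopNReduceForWordCount
        extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Totals sorted ascending by count; several words may share one count.
    private final TreeMap<Integer, List<String>> totals = new TreeMap<>();
    private int n;

    @Override
    protected void setup(Context con) {
        // "topn.n" is a made-up configuration key; set it in the driver.
        n = con.getConfiguration().getInt("topn.n", 10);
    }

    @Override
    public void reduce(Text word, Iterable<IntWritable> values, Context con) {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        totals.computeIfAbsent(sum, k -> new ArrayList<>()).add(word.toString());
    }

    @Override
    protected void cleanup(Context con) throws IOException, InterruptedException {
        int emitted = 0;
        // Walk the map from the highest count downwards; stop after N words.
        for (Map.Entry<Integer, List<String>> e : totals.descendingMap().entrySet()) {
            for (String w : e.getValue()) {
                if (emitted++ >= n) return;
                con.write(new Text(w), new IntWritable(e.getKey()));
            }
        }
    }
}

Because the TreeMap keeps totals sorted by count, the descending walk in cleanup() touches at most N words before returning.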
Please reformat your code. The indentation is unbearable, and you use too many blank lines.

Wouldn't it be better to alter the Hadoop behaviour to do this analysis for you, rather than post-processing the data in Java?

Agreed @tomayotatogy. A cursory search here and elsewhere suggests using separate Hadoop mappers and reducers, e.g. why not use chain-mapreduce?
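For completeness, the chained-MapReduce idea from the last comment might look like the sketch below: run the existing WordCount job first, then a second job that reads its word<TAB>count output, swaps each pair to (count, word), sorts counts in descending order with a custom comparator, and stops emitting after N records. All class names, the "topn.n" key, and the argument layout are illustrative, and the sketch assumes the first job used the default TextOutputFormat.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Illustrative second job: reads word-count output, emits the N most frequent words.
public class TopNJob {

    // Parses each "word<TAB>count" line of the word-count output and swaps it
    // to (count, word) so the shuffle sorts by frequency.
    public static class SwapMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        @Override
        public void map(LongWritable key, Text value, Context con)
                throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            con.write(new IntWritable(Integer.parseInt(parts[1])), new Text(parts[0]));
        }
    }

    // Reverses the natural IntWritable order so the largest counts arrive first.
    public static class DescendingIntComparator extends WritableComparator {
        protected DescendingIntComparator() {
            super(IntWritable.class, true);
        }
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return ((IntWritable) b).compareTo((IntWritable) a);
        }
    }

    // Writes (word, count) pairs until N records have been emitted, then stops.
    public static class TopNReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
        private int n;
        private int emitted = 0;

        @Override
        protected void setup(Context con) {
            n = con.getConfiguration().getInt("topn.n", 10); // made-up key
        }

        @Override
        public void reduce(IntWritable count, Iterable<Text> words, Context con)
                throws IOException, InterruptedException {
            for (Text word : words) {
                if (emitted++ >= n) return;
                con.write(new Text(word), count);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // args: <wordcount-output-dir> <topn-output-dir> <N>
        Configuration c = new Configuration();
        c.setInt("topn.n", Integer.parseInt(args[2]));
        Job j = Job.getInstance(c, "topn");
        j.setJarByClass(TopNJob.class);
        j.setMapperClass(SwapMapper.class);
        j.setReducerClass(TopNReducer.class);
        j.setSortComparatorClass(DescendingIntComparator.class);
        j.setMapOutputKeyClass(IntWritable.class);
        j.setMapOutputValueClass(Text.class);
        j.setOutputKeyClass(Text.class);
        j.setOutputValueClass(IntWritable.class);
        j.setNumReduceTasks(1); // single reducer => one globally sorted stream
        FileInputFormat.addInputPath(j, new Path(args[0]));
        FileOutputFormat.setOutputPath(j, new Path(args[1]));
        System.exit(j.waitForCompletion(true) ? 0 : 1);
    }
}

Usage would then be two hadoop jar invocations, e.g. WordCount in counts followed by TopNJob counts topn 10 (paths and N shown here only as an example).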