Java Hadoop-MapReduce
我一直在尝试解决一个简单的 Map/Reduce 问题:从若干输入文件中统计单词,并把每个单词的出现频率和单词长度一起记录下来。Mapper 每从文件中读到一个单词就发出一条记录(计数为 1),随后相同的单词会被归并在一起,得到各自的最终计数。在最终输出中,我希望看到按单词长度统计的结果,即每种长度下最常用的单词是什么。这是我们(我和我的团队)目前所能做到的。(标签:java, hadoop, mapreduce)这是 WordCountMapper 类:
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class WordCountMapper extends MapReduceBase implements
Mapper<LongWritable, Text, Text, CompositeGroupKey> {
private final IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value,
OutputCollector<Text, CompositeGroupKey> output, Reporter reporter)
throws IOException {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line.toLowerCase());
while(itr.hasMoreTokens()) {
word.set(itr.nextToken());
CompositeGroupKey gky = new CompositeGroupKey(1, word.getLength());
output.collect(word, gky);
}
}
}
import java.io.IOException;
导入java.util.ArrayList;
导入java.util.StringTokenizer;
导入org.apache.hadoop.io.IntWritable;
导入org.apache.hadoop.io.LongWritable;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapred.MapReduceBase;
导入org.apache.hadoop.mapred.Mapper;
导入org.apache.hadoop.mapred.OutputCollector;
导入org.apache.hadoop.mapred.Reporter;
公共类WordCountMapper扩展了MapReduceBase实现
制图员{
私有最终可写文件=新的可写文件(1);
私有文本字=新文本();
公共无效映射(可长写键、文本值、,
OutputCollector输出,报告器(报告器)
抛出IOException{
字符串行=value.toString();
StringTokenizer itr=新的StringTokenizer(line.toLowerCase());
而(itr.hasMoreTokens()){
set(itr.nextToken());
CompositeGroupKey gky=新的CompositeGroupKey(1,word.getLength());
输出.收集(word,gky);
}
}
}
这是 WordCountReducer 类:
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import com.sun.xml.internal.bind.CycleRecoverable.Context;
/**
 * Reducer (also registered as the combiner by the driver) that adds up the
 * counts carried by all {@link CompositeGroupKey} values of one word and emits
 * a single {@code (word, CompositeGroupKey(total, word length))} record.
 */
public class WordCountReducer extends MapReduceBase
    implements Reducer<Text, CompositeGroupKey, Text, CompositeGroupKey> {

  /**
   * Sums the counts of every value grouped under {@code key}.
   *
   * @param key      the word
   * @param values   partial counts for that word
   * @param output   collector receiving the aggregated record
   * @param reporter progress reporter (not used here)
   * @throws IOException if the collector fails to accept the record
   */
  @Override
  public void reduce(Text key, Iterator<CompositeGroupKey> values,
      OutputCollector<Text, CompositeGroupKey> output, Reporter reporter)
      throws IOException {
    // The length depends only on the key, so compute it once, outside the
    // loop. Text.getLength() would return the UTF-8 byte count; the decoded
    // String's length() is the length in chars.
    int length = key.toString().length();

    int sum = 0;
    while (values.hasNext()) {
      sum += values.next().getCount();
    }

    output.collect(key, new CompositeGroupKey(sum, length));
  }
}
import java.io.IOException;
导入java.util.ArrayList;
导入java.util.Iterator;
导入org.apache.hadoop.io.IntWritable;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapred.MapReduceBase;
导入org.apache.hadoop.mapred.OutputCollector;
导入org.apache.hadoop.mapred.Reducer;
导入org.apache.hadoop.mapred.Reporter;
导入com.sun.xml.internal.bind.cyclererecoverable.Context;
公共类WordCountReducer扩展了MapReduceBase
机具减速器{
@凌驾
public void reduce(文本键、迭代器值、,
OutputCollector输出,报告器(报告器)
抛出IOException{
整数和=0;
整数长度=0;
while(values.hasNext()){
CompositeGroupKey值=(CompositeGroupKey)值。下一步();
sum+=(整数)值。getCount();//进程值
length=(整数)key.getLength();
}
CompositeGroupKey cgk=新的CompositeGroupKey(总和,长度);
输出.收集(键,cgk);
}
}
这是 WordCount 类:
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;
/**
 * Driver for the word-count job: wires {@link WordCountMapper} and
 * {@link WordCountReducer} (also used as combiner) into a {@link JobConf},
 * reads from the {@code input} directory and writes to {@code output16}.
 */
public class WordCount {

  /**
   * Configures and runs the job.
   *
   * <p>If the job throws, the stack trace is printed and the JVM exits with
   * status 1 so that calling scripts can detect the failure.
   *
   * @param args command-line arguments (currently unused)
   */
  public static void main(String[] args) {
    JobConf conf = new JobConf(WordCount.class);

    // Key/value types of the map output and of the final output.
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(CompositeGroupKey.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(CompositeGroupKey.class);

    // Input and output directories.
    FileInputFormat.addInputPath(conf, new Path("input"));
    FileOutputFormat.setOutputPath(conf, new Path("output16"));

    // The reducer's input and output types are identical, so it can also
    // serve as the combiner.
    conf.setMapperClass(WordCountMapper.class);
    conf.setCombinerClass(WordCountReducer.class);
    conf.setReducerClass(WordCountReducer.class);

    // JobClient.runJob is static, so no JobClient instance is needed.
    try {
      JobClient.runJob(conf);
    } catch (Exception e) {
      e.printStackTrace();
      // Do not report success to the caller when the job failed.
      System.exit(1);
    }
  }
}
And this is the CompositeGroupKey class:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
public class CompositeGroupKey implements WritableComparable<CompositeGroupKey> {
int count;
int length;
public CompositeGroupKey(int c, int l) {
this.count = c;
this.length = l;
}
public void write(DataOutput out) throws IOException {
WritableUtils.writeVInt(out, count);
WritableUtils.writeVInt(out, length);
}
public void readFields(DataInput in) throws IOException {
this.count = WritableUtils.readVInt(in);
this.length = WritableUtils.readVInt(in);
}
public int compareTo(CompositeGroupKey pop) {
return 0;
}
public int getCount() {
return this.count;
}
public int getLength() {
return this.length;
}
}
import java.util.ArrayList;
导入org.apache.hadoop.conf.Configuration;
导入org.apache.hadoop.fs.Path;
导入org.apache.hadoop.io.IntWritable;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapred.FileInputFormat;
导入org.apache.hadoop.mapred.FileOutputFormat;
导入org.apache.hadoop.mapred.JobClient;
导入org.apache.hadoop.mapred.JobConf;
导入org.apache.hadoop.mapred.JobStatus;
导入org.apache.hadoop.mapred.jobcontrol.Job;
导入org.apache.hadoop.util.GenericOptionsParser;
导入org.apache.hadoop.util.StringUtils;
公共类字数{
公共静态void main(字符串[]args){
JobClient=newjobclient();
JobConf conf=newjobconf(WordCount.class);
//指定输出类型
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(CompositeGroupKey.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(CompositeGroupKey.class);
//指定输入和输出目录
addInputPath(conf,新路径(“输入”));
setOutputPath(conf,新路径(“output16”);
//指定映射器
conf.setMapperClass(WordCountMapper.class);
//指定减速器
conf.setReducerClass(WordCountReducer.class);
conf.setCombinerClass(WordCountReducer.class);
client.setConf(conf);
试一试{
runJob(conf);
}捕获(例外e){
e、 printStackTrace();
}
}
}
这是 CompositeGroupKey 类:
导入java.io.DataInput;
导入java.io.DataOutput;
导入java.io.IOException;
导入org.apache.hadoop.io.IntWritable;
导入org.apache.hadoop.io.WritableComparable;
导入org.apache.hadoop.io.WritableUtils;
公共类CompositeGroupKey实现了WritableComparable{
整数计数;
整数长度;
公共复合组密钥(int c,int l){
这个.count=c;
这个长度=l;
}
public void write(DataOutput out)引发IOException{
WritableUtils.writeVInt(out,count);
WritableUtils.writeVInt(输出,长度);
}
public void readFields(DataInput in)引发IOException{
this.count=WritableUtils.readVInt(in);
this.length=WritableUtils.readVInt(in);
}
public int compareTo(复合组键pop){
返回0;
}
public int getCount(){
返回这个.count;
}
公共整数getLength(){
返回这个.length;
}
}
现在我得到了这个错误:
java.lang.RuntimeException: java.lang.NoSuchMethodException: CompositeGroupKey.<init>()
位于org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:80)
位于org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:62)
位于org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
位于org.apache.hadoop.mapred.Task$ValuesIterator.readNextValue(Task.java:738)
位于org.apache.hadoop.mapred.Task$ValuesIterator.next(Task.java:678)
位于org.apache.hadoop.mapred.Task$CombineValuesIterator.next(Task.java:757)
在WordCountReducer.reduce(WordCountReducer.java:24)
java.lang.RuntimeException: java.lang.NoSuchMethodException: CompositeGroupKey.<init>()
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:80)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:62)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
at org.apache.hadoop.mapred.Task$ValuesIterator.readNextValue(Task.java:738)
at org.apache.hadoop.mapred.Task$ValuesIterator.next(Task.java:678)
at org.apache.hadoop.mapred.Task$CombineValuesIterator.next(Task.java:757)
at WordCountReducer.reduce(WordCountReducer.java:24)
at WordCountReducer.reduce(WordCountReducer.java:1)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.combineAndSpill(MapTask.java:904)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:785)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:698)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:228)
at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2209)
Caused by: java.lang.NoSuchMethodException: CompositeGroupKey.<init>()
at java.lang.Class.getConstructor0(Unknown Source)
at java.lang.Class.getDeclaredConstructor(Unknown Source)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:74)
// Fix for the NoSuchMethodException in the stack trace: ReflectionUtils.newInstance
// looks up a no-argument constructor (Class.getDeclaredConstructor) when the
// value is deserialized, so CompositeGroupKey must declare one in addition to
// its (int, int) constructor. Place this inside the CompositeGroupKey class.
public CompositeGroupKey() {
}
java.lang.RuntimeException: java.lang.NoSuchMethodException: CompositeGroupKey.<init>()