Hadoop ChainMapper,ChainReducer
我对 Hadoop 还比较陌生,正在尝试弄清楚如何通过编程使用 ChainMapper 和 ChainReducer 将作业(多个 Mapper、Reducer)链接起来。我找到了一些零散的示例,但没有一个是完整且可以运行的。(标签:hadoop、mapreduce、chaining)我当前的测试代码是:
public class ChainJobs extends Configured implements Tool {
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class Map2 extends MapReduceBase implements Mapper<Text, IntWritable, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
@Override
public void map(Text key, IntWritable value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken().concat("Justatest"));
output.collect(word, one);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
@Override
public int run(String[] args) {
Configuration conf = getConf();
JobConf job = new JobConf(conf);
job.setJobName("TestforChainJobs");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
JobConf map1Conf = new JobConf(false);
ChainMapper.addMapper(job, Map.class, LongWritable.class, Text.class, Text.class, IntWritable.class, true, map1Conf);
JobConf map2Conf = new JobConf(false);
ChainMapper.addMapper(job, Map2.class, Text.class, IntWritable.class, Text.class, IntWritable.class, true, map2Conf);
JobConf reduceConf = new JobConf(false);
ChainReducer.setReducer(job, Reduce.class, Text.class, IntWritable.class, Text.class, IntWritable.class, true, reduceConf);
JobClient.runJob(job);
return 0;
}
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new ChainJobs(), args);
System.exit(res);
}
非常感谢任何提示或非常简单的工作示例
我已经基于 ChainMapper 编写了一个单词计数作业。代码使用新 API 编写,并且运行良好 :) 由于我没有做任何清理来去除标点符号,您可能会在输出中看到一些特殊或多余的字符。我只专注于让 ChainMapper 正常工作。
谢谢 :) 您的驱动程序代码有问题,我大致能看出来:Configuration 对象没有被正确声明。请查看我在下面的答案中提供的代码,特别是驱动程序部分,以了解到底是哪里出了问题。 —— 谢谢 :) 我一直在为 ChainMapper 该用 Job 还是 JobConf 而苦苦挣扎,您的回答为我指明了方向。我添加了 Hadoop 2.7.1 的 JAR 并导入了正确的类。非常感谢。 —— 您是手动导入的吗?还是使用了像 Gradle 这样的构建工具?如果还没有用过,可以试试 Gradle,看看神奇的效果——一切都自动化了! —— 是的,我知道 Gradle 和 Maven 的魔力 :)
MapAttempt TASK_TYPE="MAP" TASKID="task_201210162337_0009_m_000000" TASK_ATTEMPT_ID="attempt_201210162337_0009_m_000000_0" TASK_STATUS="FAILED" FINISH_TIME="1350397216365" HOSTNAME="localhost\.localdomain" ERROR="java\.lang\.RuntimeException: Error in configuring object
at org\.apache\.hadoop\.util\.ReflectionUtils\.setJobConf(ReflectionUtils\.java:106)
at org\.apache\.hadoop\.util\.ReflectionUtils\.setConf(ReflectionUtils\.java:72)
at org\.apache\.hadoop\.util\.ReflectionUtils\.newInstance(ReflectionUtils\.java:130)
at org\.apache\.hadoop\.mapred\.MapTask\.runOldMapper(MapTask\.java:389)
at org\.apache\.hadoop\.mapred\.MapTask\.run(MapTask\.java:327)
at org\.apache\.hadoop\.mapred\.Child$4\.run(Child\.java:268)
at java\.security\.AccessController\.doPrivileged(Native Method)
at javax\.security\.auth\.Subject\.doAs(Subject\.java:396)
import java.io.IOException;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//implementing CHAIN MAPREDUCE without using custom format
//SPLIT MAPPER
/**
 * First mapper of the chain: splits each input line on single spaces and
 * emits {@code (token, 1)} for every non-empty token.
 */
class SplitMapper extends Mapper<Object,Text,Text,IntWritable>
{
    private final IntWritable dummyValue=new IntWritable(1);

    /**
     * @param key     input key supplied by the input format (not used)
     * @param value   one line of input text
     * @param context sink for the emitted {@code (token, 1)} pairs
     */
    @Override
    public void map(Object key,Text value,Context context)throws IOException,InterruptedException{
        for(String token:value.toString().split(" "))
        {
            // Blank lines and runs of consecutive spaces make split(" ")
            // return empty strings; those are not words and must not be counted.
            if(token.isEmpty())
            {
                continue;
            }
            context.write(new Text(token), dummyValue);
        }
    }
}
//UPPER CASE MAPPER
/**
 * Second mapper of the chain: rewrites each incoming key in upper case and
 * forwards the value unchanged.
 */
class UpperCaseMapper extends Mapper<Text,IntWritable,Text,IntWritable>
{
    /**
     * @param key     the word emitted by the previous mapper
     * @param value   the count attached to that word
     * @param context sink for the emitted {@code (UPPER-CASED key, value)} pair
     */
    @Override
    public void map(Text key,IntWritable value,Context context)throws IOException,InterruptedException{
        // Locale.ROOT keeps the casing rules identical on every node; the
        // no-argument toUpperCase() depends on each JVM's default locale.
        String upper=key.toString().toUpperCase(Locale.ROOT);
        context.write(new Text(upper), value);
    }
}
//ChainMapReducer
/**
 * Sums the counts received for a key and emits {@code (key, total)}.
 */
class ChainMapReducer extends Reducer<Text,IntWritable,Text,IntWritable>
{
    /**
     * @param key     the word being aggregated
     * @param values  all counts collected for {@code key}
     * @param context sink for the emitted {@code (key, total)} pair
     */
    @Override
    public void reduce(Text key,Iterable<IntWritable>values,Context context)throws IOException,InterruptedException{
        // The accumulator must be local: the same instance handles many keys,
        // so an instance field would carry the previous keys' totals into
        // every later key and produce a running sum instead of per-key counts.
        int sum=0;
        for(IntWritable value:values)
        {
            sum+=value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
public class FirstClass extends Configured implements Tool{
static Configuration cf;
public int run (String args[])throws IOException,InterruptedException,ClassNotFoundException{
cf=new Configuration();
//bypassing the GenericOptionsParser part and directly running into job declaration part
Job j=Job.getInstance(cf);
/**************CHAIN MAPPER AREA STARTS********************************/
Configuration splitMapConfig=new Configuration(false);
//below we add the 1st mapper class under ChainMapper Class
ChainMapper.addMapper(j, SplitMapper.class, Object.class, Text.class, Text.class, IntWritable.class, splitMapConfig);
//configuration for second mapper
Configuration upperCaseConfig=new Configuration(false);
//below we add the 2nd mapper that is the lower case mapper to the Chain Mapper class
ChainMapper.addMapper(j, UpperCaseMapper.class, Text.class, IntWritable.class, Text.class, IntWritable.class, upperCaseConfig);
/**************CHAIN MAPPER AREA FINISHES********************************/
//now proceeding with the normal delivery
j.setJarByClass(FirstClass.class);
j.setCombinerClass(ChainMapReducer.class);
j.setOutputKeyClass(Text.class);
j.setOutputValueClass(IntWritable.class);
Path p=new Path(args[1]);
//set the input and output URI
FileInputFormat.addInputPath(j, new Path(args[0]));
FileOutputFormat.setOutputPath(j, p);
p.getFileSystem(cf).delete(p, true);
return j.waitForCompletion(true)?0:1;
}
public static void main(String args[])throws Exception{
int res=ToolRunner.run(cf, new FirstClass(), args);
System.exit(res);
}
}
A 619
ACCORDING 636
ACCOUNT 638
ACROSS? 655
ADDRESSES 657
AFTER 674
AGGREGATING, 687
AGO, 704
ALL 721
ALMOST 755
ALTERING 768
AMOUNT 785
AN 819
ANATOMY 820
AND 1198
ANXIETY 1215
ANY 1232
APACHE 1300
APPENDING 1313
APPLICATIONS 1330
APPLICATIONS. 1347
APPLICATIONS.� 1364
APPLIES 1381
ARCHITECTURE, 1387
ARCHIVES 1388
ARE 1405
AS 1422
BASED 1439