Java 为什么作业链接在mapreduce中不起作用?
我创建了两个作业,并希望链接它们,以便在前一个作业完成后执行一个作业。所以我写了下面的代码。但正如我所观察到的,job1正确地完成了,而job2似乎从未执行过Java 为什么作业链接在mapreduce中不起作用?,java,hadoop,mapreduce,Java,Hadoop,Mapreduce,我创建了两个作业,并希望链接它们,以便在前一个作业完成后执行一个作业。所以我写了下面的代码。但正如我所观察到的,job1正确地完成了,而job2似乎从未执行过 public class Simpletask extends Configured implements Tool { public static enum FileCounters { COUNT; } public static class TokenizerMapper extends Mapper<Object,
public class Simpletask extends Configured implements Tool {
public static enum FileCounters {
COUNT;
}
public static class TokenizerMapper extends Mapper<Object, Text, IntWritable, Text>{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
String line = itr.nextToken();
String part[] = line.split(",");
int id = Integer.valueOf(part[0]);
int x1 = Integer.valueOf(part[1]);
int y1 = Integer.valueOf(part[2]);
int z1 = Integer.valueOf(part[3]);
int x2 = Integer.valueOf(part[4]);
int y2 = Integer.valueOf(part[5]);
int z2 = Integer.valueOf(part[6]);
int h_v = Hilbert(x1,y1,z1);
int parti = h_v/10;
IntWritable partition = new IntWritable(parti);
Text neuron = new Text();
neuron.set(line);
context.write(partition,neuron);
}
}
public int Hilbert(int x,int y,int z){
return (int) (Math.random()*20);
}
}
public static class IntSumReducer extends Reducer<IntWritable,Text,IntWritable,Text> {
private Text result = new Text();
private MultipleOutputs<IntWritable, Text> mos;
public void setup(Context context) {
mos = new MultipleOutputs<IntWritable, Text>(context);
}
<K, V> String generateFileName(K k) {
return "p"+k.toString();
}
public void reduce(IntWritable key,Iterable<Text> values, Context context) throws IOException, InterruptedException {
String accu = "";
for (Text val : values) {
String[] entry=val.toString().split(",");
String MBR = entry[1];
accu+=entry[0]+",MBR"+MBR+" ";
}
result.set(accu);
context.getCounter(FileCounters.COUNT).increment(1);
mos.write(key, result, generateFileName(key));
}
}
public static class RTreeMapper extends Mapper<Object, Text, IntWritable, Text>{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
System.out.println("WOWOWOWOW RUNNING");// NOTHING SHOWS UP!
}
}
public static class RTreeReducer extends Reducer<IntWritable,Text,IntWritable,Text> {
private MultipleOutputs<IntWritable, Text> mos;
Text t = new Text();
public void setup(Context context) {
mos = new MultipleOutputs<IntWritable, Text>(context);
}
public void reduce(IntWritable key,Iterable<Text> values, Context context) throws IOException, InterruptedException {
t.set("dsfs");
mos.write(key, t, "WOWOWOWOWOW"+key.get());
//ALSO, NOTHING IS WRITTEN TO THE FILE!!!!!
}
}
public static class RTreeInputFormat extends TextInputFormat{
protected boolean isSplitable(FileSystem fs, Path file) {
return false;
}
}
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Enter valid number of arguments <Inputdirectory> <Outputlocation>");
System.exit(0);
}
ToolRunner.run(new Configuration(), new Simpletask(), args);
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Job1");
job.setJarByClass(Simpletask.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
boolean complete = job.waitForCompletion(true);
//================RTree Loop============
int capacity = 3;
Configuration rconf = new Configuration();
Job rtreejob = Job.getInstance(rconf, "rtree");
if(complete){
int count = (int) job.getCounters().findCounter(FileCounters.COUNT).getValue();
System.out.println("File count: "+count);
String path = null;
for(int i=0;i<count;i++){
path = "/Worker/p"+i+"-m-00000";
System.out.println("Add input path: "+path);
FileInputFormat.addInputPath(rtreejob, new Path(path));
}
System.out.println("Input path done.");
FileOutputFormat.setOutputPath(rtreejob, new Path("/RTree"));
rtreejob.setJarByClass(Simpletask.class);
rtreejob.setMapperClass(RTreeMapper.class);
rtreejob.setCombinerClass(RTreeReducer.class);
rtreejob.setReducerClass(RTreeReducer.class);
rtreejob.setOutputKeyClass(IntWritable.class);
rtreejob.setOutputValueClass(Text.class);
rtreejob.setInputFormatClass(RTreeInputFormat.class);
complete = rtreejob.waitForCompletion(true);
}
return 0;
}
}
公共类Simpletask扩展配置的工具{
公共静态枚举文件计数器{
计数
}
公共静态类TokenizerMapper扩展映射器{
公共void映射(对象键、文本值、上下文上下文)引发IOException、InterruptedException{
StringTokenizer itr=新的StringTokenizer(value.toString());
而(itr.hasMoreTokens()){
字符串行=itr.nextToken();
字符串部分[]=行分割(“,”);
int id=Integer.valueOf(部分[0]);
int x1=整数值(第[1]部分);
int y1=整数值(第[2]部分);
int z1=整数.valueOf(第[3]部分);
int x2=整数.valueOf(第[4]部分);
int y2=整型值(第[5]部分);
int z2=整数.valueOf(第[6]部分);
int h_v=希尔伯特(x1,y1,z1);
int parti=h_v/10;
IntWritable分区=新的IntWritable(parti);
Text=newtext();
神经元。设置(行);
写(分区,神经元);
}
}
公共整数希尔伯特(整数x,整数y,整数z){
返回值(int)(Math.random()*20);
}
}
公共静态类IntSumReducer扩展了Reducer{
私有文本结果=新文本();
私人多路输出mos;
公共无效设置(上下文){
mos=新的多输出(上下文);
}
字符串generateFileName(K){
返回“p”+k.toString();
}
public void reduce(IntWritable键、Iterable值、上下文)抛出IOException、InterruptedException{
字符串accu=“”;
用于(文本值:值){
String[]entry=val.toString().split(“,”);
字符串MBR=条目[1];
accu+=条目[0]+”,MBR“+MBR+”;
}
结果集(accu);
getCounter(FileCounters.COUNT).increment(1);
mos.write(键、结果、generateFileName(键));
}
}
公共静态类RTreeMapper扩展了Mapper{
公共void映射(对象键、文本值、上下文上下文)引发IOException、InterruptedException{
System.out.println(“wowowo正在运行”);//什么都没有显示!
}
}
公共静态类RTreeReducer扩展了Reducer{
私人多路输出mos;
Text t=新文本();
公共无效设置(上下文){
mos=新的多输出(上下文);
}
public void reduce(IntWritable键、Iterable值、上下文)抛出IOException、InterruptedException{
t、 集合(“DSF”);
mos.write(key,t,“wowowowo”+key.get());
//而且,文件中没有写入任何内容!!!!!
}
}
公共静态类RTreeInputFormat扩展了TextInputFormat{
受保护的布尔isSplitable(文件系统fs,路径文件){
返回false;
}
}
公共静态void main(字符串[]args)引发异常{
如果(参数长度!=2){
System.err.println(“输入有效的参数数”);
系统出口(0);
}
运行(新配置(),新Simpletask(),args);
}
@凌驾
公共int运行(字符串[]args)引发异常{
Configuration conf=新配置();
Job=Job.getInstance(conf,“Job1”);
job.setJarByClass(Simpletask.class);
setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
addInputPath(作业,新路径(args[0]);
setOutputPath(作业,新路径(args[1]);
setOutputFormatClass(作业,TextOutputFormat.class);
布尔值complete=job.waitForCompletion(true);
//======================RTree循环============
int容量=3;
配置rconf=新配置();
Job rtreejob=Job.getInstance(rconf,“rtree”);
如果(完成){
int count=(int)job.getCounters().findCounter(FileCounters.count).getValue();
System.out.println(“文件计数:“+count”);
字符串路径=null;
对于(int i=0;i对于mapreduce作业,输出目录不应存在。它将首先检查输出目录。如果存在,作业将失败。在您的情况下,您为两个作业指定了相同的输出目录。我修改了代码。我将args[1]更改为args[2]现在第三个参数将是第二个作业的输出目录。所以也传递第三个参数
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Job1");
job.setJarByClass(Simpletask.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//AND THEN I WAIT THIS JOB TO COMPLETE.
boolean complete = job.waitForCompletion(true);
//I START A NEW JOB, BUT WHY IS IT NOT RUNNING?
Configuration conf = new Configuration();
Job job2 = Job.getInstance(conf, "Job2");
job2.setJarByClass(Simpletask.class);
job2.setMapperClass(TokenizerMapper.class);
job2.setCombinerClass(IntSumReducer.class);
job2.setReducerClass(IntSumReducer.class);
job2.setOutputKeyClass(IntWritable.class);
job2.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job2, new Path(args[0]));
FileOutputFormat.setOutputPath(job2, new Path(args[2]));
错误的几个可能原因:
conf
声明两次(没有编译错误?)
job2的输出路径已存在,因为它是从job1(+1到)创建的
我认为您还应该为这两个作业使用job.setMapOutputKeyClass(Text.class);
和job.setMapOutputValueClass(IntWritable.class);
在发布代码片段之后,您是否还有执行job2的命令?我的意思是,您是否实际运行了job2.waitForCompletion(true);
,或者类似的操作
总的来说:检查日志中的错误消息,这些消息应该可以清楚地解释出错的原因。这是在main类中编写的。我查找了一些方法,说我应该使用JobClient,但我还想自定义inputformat以读取多个文件,而JobClient似乎不支持这一点?FileOutputFormat.setOutputPath(作业2,新路径(args[1]))将出现,它可能会失败。您可以查看此示例以了解作业链接的基本结构。这只是作业链接模式的一个示例,但实际上在路径问题上没有编译器错误。关键是我编写了类似的驱动程序函数,而第二个作业没有运行。这只是一个示例。a实际上,我在第二个作业中使用了不同的选项和路径,关键是