Java 何时在Hadoop Map Reduce中使用NLineInputFormat?
我有一个基于文本的输入文件，大小约为25GB。在该文件中，一条记录由4行组成。每条记录的处理方式相同，但记录内的四行分别以不同的方式处理。
我是Hadoop新手,所以我想要一个指导,在这种情况下是使用
NLineInputFormat
,还是使用默认的TextInputFormat
?提前谢谢 假设您有以下格式的文本文件:
2015-8-02
error2014 blahblahblahblah
2015-8-02
blahblahbalh error2014
你可以使用
NLineInputFormat
功能,您可以精确地指定映射器应该有多少行
在您的情况下,可以使用为每个映射器输入4行
编辑:
以下是使用NLineInputFormat的示例:
映射器类:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Identity mapper: forwards every input record unchanged.
 *
 * <p>Each call receives one line of input (the key is supplied by the
 * configured input format — presumably the line's byte offset; confirm
 * against the input format in use) and writes the same pair back out.
 */
public class MapperNLine extends Mapper<LongWritable, Text, LongWritable, Text> {

    /**
     * Emits the incoming key/value pair as-is.
     *
     * @param key     record key provided by the input format
     * @param value   the text of the current input line
     * @param context task context used to emit output
     */
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // No transformation — the job exists to demonstrate input splitting.
        context.write(key, value);
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for a map-only job that uses {@link NLineInputFormat} so each
 * mapper receives exactly 4 input lines (one logical record).
 */
public class Driver extends Configured implements Tool {

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on job failure, -1 on bad usage
     * @throws Exception if job submission or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            // Usage errors belong on stderr, not stdout.
            System.err
                    .printf("Two parameters are required for DriverNLineInputFormat- <input dir> <output dir>\n");
            return -1;
        }

        // Job.getInstance replaces the deprecated new Job(Configuration) constructor.
        Job job = Job.getInstance(getConf());
        job.setJobName("NLineInputFormat example");
        job.setJarByClass(Driver.class);

        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.addInputPath(job, new Path(args[0]));
        // Typed helper instead of hand-writing the
        // "mapreduce.input.lineinputformat.linespermap" configuration key.
        NLineInputFormat.setNumLinesPerSplit(job, 4);

        // LazyOutputFormat avoids creating empty part files.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MapperNLine.class);
        job.setNumReduceTasks(0); // map-only job

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new Driver(), args);
        System.exit(exitCode);
    }
}
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MapperNLine extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, value);
    }
}
驱动程序类（Driver）:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Pass-through mapper used to demonstrate NLineInputFormat: every record
 * handed to it is written out untouched.
 */
public class MapperNLine extends Mapper<LongWritable, Text, LongWritable, Text> {

    /**
     * Writes the given key/value pair straight to the output.
     *
     * @param key     record key from the input format
     * @param value   one line of input text
     * @param context context for emitting output pairs
     */
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, value); // identity mapping, nothing to compute
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Job driver demonstrating {@link NLineInputFormat}: each map task is fed
 * exactly 4 lines of input, matching the 4-line record layout of the data.
 */
public class Driver extends Configured implements Tool {

    /**
     * Builds and runs the map-only job.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on job failure, -1 on bad usage
     * @throws Exception if submission or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            // Report usage problems on stderr so they are not mixed into normal output.
            System.err
                    .printf("Two parameters are required for DriverNLineInputFormat- <input dir> <output dir>\n");
            return -1;
        }

        // Use the factory method; the new Job(Configuration) constructor is deprecated.
        Job job = Job.getInstance(getConf());
        job.setJobName("NLineInputFormat example");
        job.setJarByClass(Driver.class);

        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.addInputPath(job, new Path(args[0]));
        // Prefer the typed setter over the raw
        // "mapreduce.input.lineinputformat.linespermap" string key.
        NLineInputFormat.setNumLinesPerSplit(job, 4);

        // LazyOutputFormat suppresses empty output part files.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MapperNLine.class);
        job.setNumReduceTasks(0); // no reduce phase needed

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new Driver(), args);
        System.exit(exitCode);
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Driver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.out
                    .printf("Two parameters are required for DriverNLineInputFormat- <input dir> <output dir>\n");
            return -1;
        }
        Job job = new Job(getConf());
        job.setJobName("NLineInputFormat example");
        job.setJarByClass(Driver.class);
        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.addInputPath(job, new Path(args[0]));
        job.getConfiguration().setInt("mapreduce.input.lineinputformat.linespermap", 4);
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MapperNLine.class);
        job.setNumReduceTasks(0);
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new Driver(), args);
        System.exit(exitCode);
    }
}