EMR流作业使用映射器和reducer的Java代码
我目前使用ruby编写的mapper和reducer代码运行流作业。我想把它们转换成java。我不知道如何使用java使用EMR hadoop运行流作业。亚马逊EMR网站cloudburst中给出的示例太复杂了。以下是我当前如何运行作业的详细信息 启动作业的代码:EMR流作业使用映射器和reducer的Java代码,java,ruby,hadoop,mapreduce,emr,Java,Ruby,Hadoop,Mapreduce,Emr,我目前使用ruby编写的mapper和reducer代码运行流作业。我想把它们转换成java。我不知道如何使用java使用EMR hadoop运行流作业。亚马逊EMR网站cloudburst中给出的示例太复杂了。以下是我当前如何运行作业的详细信息 启动作业的代码: elastic-mapreduce --create --alive --plain-output --master-instance-type m1.small --slave-instance-type m1.x
elastic-mapreduce --create --alive --plain-output --master-instance-type m1.small
--slave-instance-type m1.xlarge --num-instances 2 --name "Job Name" --bootstrap-action
s3://bucket-path/bootstrap.sh
添加步骤的代码:
elastic-mapreduce -j <job_id> --stream --step-name "my_step_name"
--jobconf mapred.task.timeout=0 --mapper s3://bucket-path/mapper.rb
--reducer s3://bucket-path/reducerRules.rb --cache s3://bucket-path/cache/cache.txt
--input s3://bucket-path/input --output s3://bucket-path/output
如果您使用的是 Java,就不需要使用 Hadoop Streaming(流式处理)。您可以直接针对 MapReduce API 构建一个 Jar。 请查看 Hadoop 源代码中的 examples 文件夹,那里有一些很好的示例,包括大名鼎鼎的 wordcount。 我不太清楚您为什么要使用 Java,但直接针对该 API 编码会相当痛苦。您也许可以尝试以下项目之一: Java 项目:
- Cascading
- Crunch
- Hive(类似 SQL)
- Pig
- Scoobi(Scala)
FWIW,我认为 Pig 可能是我会选择的方案,并且它在 EMR 上得到了开箱即用的支持。 由于现在我对 Hadoop 和 MapReduce 有了更好的理解,以下就是我当初想要的答案: 要启动集群,命令与问题中的大致相同,但我们可以添加配置参数:
ruby elastic-mapreduce --create --alive --plain-output --master-instance-type m1.xlarge --slave-instance-type m1.xlarge --num-instances 11 --name "Java Pipeline" --bootstrap-action s3://elasticmapreduce/bootstrap-actions/install-ganglia --bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop --args "--mapred-config-file, s3://com.versata.emr/conf/mapred-site-tuned.xml"
要添加作业步骤,请执行以下操作:
步骤1:
ruby elastic-mapreduce --jobflow <jobflow_id> --jar s3://somepath/job-one.jar --arg s3://somepath/input-one --arg s3://somepath/output-one --args -m,mapred.min.split.size=52880 -m,mapred.task.timeout=0
步骤2:
ruby elastic-mapreduce --jobflow <jobflow_id> --jar s3://somepath/job-two.jar --arg s3://somepath/output-one --arg s3://somepath/output-two --args -m,mapred.min.split.size=52880 -m,mapred.task.timeout=0
现在,对于Java代码,将有一个主类,其中包含以下每个类的一个实现:
- org.apache.hadoop.mapreduce.Mapper
- org.apache.hadoop.mapreduce.Reducer
/**
 * Example Hadoop MapReduce job packaged as a {@link Tool} so it can be launched as a
 * custom-jar step (for instance on Amazon EMR).
 *
 * <p>Expected arguments: {@code <input paths> <output dir> <cache file URI>}. The cache file is
 * registered with the distributed cache in {@link #run(String[])} and read back in
 * {@link MapJob#setup}.
 */
public class SomeJob extends Configured implements Tool {

    private static final String JOB_NAME = "My Job";

    /**
     * Mapper: receives one text record per call and emits a {@code Text} key/value pair.
     */
    public static class MapJob extends Mapper<LongWritable, Text, Text, Text> {

        // Reused across map() calls to avoid allocating a new Text per record.
        private final Text outputKey = new Text();
        private final Text outputValue = new Text();

        /**
         * Locates the first file registered in the distributed cache before any record is mapped.
         *
         * @throws IOException if no cache file is available to this task
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Get the cached file. Guard against a missing entry so the failure is an explicit
            // IOException instead of a NullPointerException/ArrayIndexOutOfBoundsException.
            Path[] cachedFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (cachedFiles == null || cachedFiles.length == 0) {
                throw new IOException("No distributed cache file is available to the mapper");
            }
            File fileObject = new File(cachedFiles[0].toString());
            // Do whatever required with file data
        }

        /**
         * Emits one key/value pair for the given input record.
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            outputKey.set("Some key calculated or derived");
            outputValue.set("Some Value calculated or derived");
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Reducer: receives all values grouped under one key and emits a {@code Text} key/value pair.
     */
    public static class ReduceJob extends Reducer<Text, Text, Text, Text> {

        // Reused across reduce() calls to avoid allocating a new Text per group.
        private final Text outputKey = new Text();
        private final Text outputValue = new Text();

        /**
         * Emits one key/value pair for the given key group.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            outputKey.set("Some key calculated or derived");
            outputValue.set("Some Value calculated or derived");
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Configures and submits the job, then blocks until it finishes.
     *
     * @param args {@code args[0]} input paths, {@code args[1]} output directory,
     *             {@code args[2]} URI of the file to place in the distributed cache
     * @return 0 when the job succeeds, 1 when it fails or any exception is raised
     */
    @Override
    public int run(String[] args) throws Exception {
        try {
            Configuration conf = getConf();
            // The cache file must be registered on the configuration before the Job copies it.
            DistributedCache.addCacheFile(new URI(args[2]), conf);

            Job job = new Job(conf);
            job.setJarByClass(SomeJob.class);
            job.setJobName(JOB_NAME);

            job.setMapperClass(MapJob.class);
            job.setReducerClass(ReduceJob.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            FileInputFormat.setInputPaths(job, args[0]);
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            boolean success = job.waitForCompletion(true);
            return success ? 0 : 1;
        } catch (Exception e) {
            // Top-level boundary of the tool: report the failure and signal it via the exit code.
            e.printStackTrace();
            return 1;
        }
    }

    /**
     * Command-line entry point; exits with -1 on bad usage, otherwise with the result of
     * {@link #run(String[])}.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out
                    .println("Usage: SomeJob <comma sparated list of input directories> <output dir> <cache file>");
            System.exit(-1);
        }
        int result = ToolRunner.run(new SomeJob(), args);
        System.exit(result);
    }
}
public类SomeJob扩展配置的实现工具{
私有静态最终字符串JOB\u NAME=“我的作业”;
/**
*我是Mapper。
*/
公共静态类映射作业扩展映射器{
私有文本输出键=新文本();
私有文本输出值=新文本();
@凌驾
受保护的无效设置(上下文上下文)引发IOException、InterruptedException{
//获取缓存文件
路径文件=DistributedCache.getLocalCacheFiles(context.getConfiguration())[0];
File fileObject=新文件(File.toString());
//对文件数据执行任何需要的操作
}
@凌驾
公共void映射(LongWritable键、文本值、上下文上下文)引发IOException、InterruptedException{
outputKey.set(“计算或派生的某个键”);
outputVey.set(“计算或导出的某个值”);
write(outputKey,outputValue);
}
}
/**
*这是减速机。
*/
公共静态类ReduceJob扩展Reducer{
私有文本输出键=新文本();
私有文本输出值=新文本();
@凌驾
受保护的void reduce(文本键、Iterable值、上下文上下文)引发IOException,
中断异常{
outputKey.set(“计算或派生的某个键”);
outputVey.set(“计算或导出的某个值”);
write(outputKey,outputValue);
}
}
@凌驾
公共int运行(字符串[]args)引发异常{
试一试{
配置conf=getConf();
addCacheFile(新URI(args[2]),conf);
作业=新作业(配置);
setJarByClass(TaxonomyOverviewReportingStepOne.class);
job.setJobName(作业名称);
setMapperClass(MapJob.class);
job.setReduceClass(ReduceJob.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
setInputFormatClass(TextInputFormat.class);
setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
setInputPath(作业,参数[0]);
setOutputPath(作业,新路径(args[1]);
布尔值success=job.waitForCompletion(true);
返回成功?0:1;
}捕获(例外e){
e、 printStackTrace();
返回1;
}
}
公共静态void main(字符串[]args)引发异常{
如果(参数长度<3){
系统输出
.println(“用法:SomeJob”);
系统退出(-1);
}
int result=ToolRunner.run(新的TaxonomyOverviewReportingStepOne(),args);
系统退出(结果);
}
}
即使您使用的是 Java,也可以使用流式处理(streaming)。创建集群之后,您会得到 jobflow_id,然后向集群添加两个流式处理步骤,如下所示:ruby elastic-mapreduce --jobflow <jobflow_id> --jar s3://somepath/job-one.jar --arg s3://somepath/input-one --arg s3://somepath/output-one --args -m,mapred.min.split.size=52880 -m,mapred.task.timeout=0
和 ruby elastic-mapreduce --jobflow <jobflow_id> --jar s3://somepath/job-one.jar --arg s3://somepath/input-one --arg s3://somepath/output-one --args -m,mapred.min.split.size=52880 -m,mapred.task.timeout=0
/**
 * Example Hadoop MapReduce job packaged as a {@link Tool} so it can be launched as a
 * custom-jar step (for instance on Amazon EMR).
 *
 * <p>Expected arguments: {@code <input paths> <output dir> <cache file URI>}. The cache file is
 * registered with the distributed cache in {@link #run(String[])} and read back in
 * {@link MapJob#setup}.
 */
public class SomeJob extends Configured implements Tool {

    private static final String JOB_NAME = "My Job";

    /**
     * Mapper: receives one text record per call and emits a {@code Text} key/value pair.
     */
    public static class MapJob extends Mapper<LongWritable, Text, Text, Text> {

        // Reused across map() calls to avoid allocating a new Text per record.
        private final Text outputKey = new Text();
        private final Text outputValue = new Text();

        /**
         * Locates the first file registered in the distributed cache before any record is mapped.
         *
         * @throws IOException if no cache file is available to this task
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Get the cached file. Guard against a missing entry so the failure is an explicit
            // IOException instead of a NullPointerException/ArrayIndexOutOfBoundsException.
            Path[] cachedFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (cachedFiles == null || cachedFiles.length == 0) {
                throw new IOException("No distributed cache file is available to the mapper");
            }
            File fileObject = new File(cachedFiles[0].toString());
            // Do whatever required with file data
        }

        /**
         * Emits one key/value pair for the given input record.
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            outputKey.set("Some key calculated or derived");
            outputValue.set("Some Value calculated or derived");
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Reducer: receives all values grouped under one key and emits a {@code Text} key/value pair.
     */
    public static class ReduceJob extends Reducer<Text, Text, Text, Text> {

        // Reused across reduce() calls to avoid allocating a new Text per group.
        private final Text outputKey = new Text();
        private final Text outputValue = new Text();

        /**
         * Emits one key/value pair for the given key group.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            outputKey.set("Some key calculated or derived");
            outputValue.set("Some Value calculated or derived");
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Configures and submits the job, then blocks until it finishes.
     *
     * @param args {@code args[0]} input paths, {@code args[1]} output directory,
     *             {@code args[2]} URI of the file to place in the distributed cache
     * @return 0 when the job succeeds, 1 when it fails or any exception is raised
     */
    @Override
    public int run(String[] args) throws Exception {
        try {
            Configuration conf = getConf();
            // The cache file must be registered on the configuration before the Job copies it.
            DistributedCache.addCacheFile(new URI(args[2]), conf);

            Job job = new Job(conf);
            job.setJarByClass(SomeJob.class);
            job.setJobName(JOB_NAME);

            job.setMapperClass(MapJob.class);
            job.setReducerClass(ReduceJob.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            FileInputFormat.setInputPaths(job, args[0]);
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            boolean success = job.waitForCompletion(true);
            return success ? 0 : 1;
        } catch (Exception e) {
            // Top-level boundary of the tool: report the failure and signal it via the exit code.
            e.printStackTrace();
            return 1;
        }
    }

    /**
     * Command-line entry point; exits with -1 on bad usage, otherwise with the result of
     * {@link #run(String[])}.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out
                    .println("Usage: SomeJob <comma sparated list of input directories> <output dir> <cache file>");
            System.exit(-1);
        }
        int result = ToolRunner.run(new SomeJob(), args);
        System.exit(result);
    }
}