Maven 在CDH4上运行简单的MR作业
我正在尝试在 CDH4 上运行一个简单的 MR 作业,却遇到了一个非常奇怪的错误,我不知道原因是什么。基本上,我的程序读取一个文件,使用一个恒等(identity)mapper,然后 reducer 只输出一个键和一个字符串作为值。我不明白为什么我的作业无法运行;我在 CDH3 中从未遇到过这样的问题。欢迎任何建议。标签:maven, hadoop, mapreduce, cloudera-cdh。错误:
14/03/26 20:35:45 INFO mapred.JobClient: Task Id : attempt_201403171159_0109_m_000002_2, Status : FAILED
java.lang.NumberFormatException: For input string: "256MB"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:492)
at java.lang.Integer.parseInt(Integer.java:527)
at org.apache.hadoop.conf.Configuration.getInt(Configuration.java:1060)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.init(MapTask.java:809)
at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:376)
at org.apache.hadoop.mapred.MapTask.access$100(MapTask.java:85)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:584)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:656)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
at org.apache.hadoop
14/03/26 20:35:45信息映射。作业客户端:任务Id:尝试\u 201403171159\u 0109\u m\u000002\u 2,状态:失败
java.lang.NumberFormatException:对于输入字符串:“256MB”
位于java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
位于java.lang.Integer.parseInt(Integer.java:492)
在java.lang.Integer.parseInt(Integer.java:527)处
位于org.apache.hadoop.conf.Configuration.getInt(Configuration.java:1060)
位于org.apache.hadoop.mapred.MapTask$mapoutbuffer.init(MapTask.java:809)
位于org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:376)
位于org.apache.hadoop.mapred.MapTask.access$100(MapTask.java:85)
位于org.apache.hadoop.mapred.MapTask$NewOutputCollector。(MapTask.java:584)
位于org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:656)
位于org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
位于org.apache.hadoop.mapred.Child$4.run(Child.java:268)
位于java.security.AccessController.doPrivileged(本机方法)
位于javax.security.auth.Subject.doAs(Subject.java:415)
位于org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
在org.apache.hadoop上
Maven依赖项:
<!-- MapReduce runtime; the "mr1" qualifier in the version selects the MRv1 build for CDH 4.4.0. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>2.0.0-mr1-cdh4.4.0</version>
</dependency>
<!-- Common Hadoop classes; note this version string has no "mr1" qualifier, unlike the other two entries. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.0.0-cdh4.4.0</version>
</dependency>
<!-- Hadoop tools artifact, pinned to the same MRv1 / CDH 4.4.0 version as hadoop-core. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-tools</artifactId>
<version>2.0.0-mr1-cdh4.4.0</version>
</dependency>
org.apache.hadoop
hadoop-core
2.0.0-mr1-cdh4.4.0
org.apache.hadoop
hadoop-common
2.0.0-cdh4.4.0
org.apache.hadoop
hadoop-tools
2.0.0-mr1-cdh4.4.0
Maven 仓库(repositories):
<!-- Extra Maven repositories, both hosted on repository.cloudera.com; presumably needed because the
     "-cdh4.4.0" artifacts are not published to Maven Central (TODO confirm). -->
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
<!-- Second Cloudera-hosted repository, labelled "Hadoop Releases". -->
<repository>
<id>maven-hadoop</id>
<name>Hadoop Releases</name>
<url>https://repository.cloudera.com/content/repositories/releases/</url>
</repository>
</repositories>
cloudera
https://repository.cloudera.com/artifactory/cloudera-repos/
maven-hadoop
Hadoop Releases
https://repository.cloudera.com/content/repositories/releases/
MR代码:
package com.some.packagename;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyMRJob extends Configured implements Tool {
private static String inputPath = "someHDFSInputPath";
private static String outputPath = "someHDFSOutputPath";
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapred.job.tracker", "jtserver:8021");
conf.set("fs.defaultFS", "hdfs://nnserver:8020");
ToolRunner.run(conf, new MyMRJob(), args);
}
public final int run(final String[] args) throws Exception {
// Initialize
Job job = new Job(super.getConf(),MyMRJob.class.getSimpleName());
// General Configs
job.setJarByClass(MyMRJob.class);
// Inputs
TextInputFormat.setInputPaths(job, inputPath);
job.setInputFormatClass(TextInputFormat.class);
// Mapper
job.setMapperClass(TheMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// Reducer
job.setReducerClass(TheReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Output
TextOutputFormat.setOutputPath(job, new Path(outputPath));
job.setOutputFormatClass(TextOutputFormat .class);
// Run the job
boolean b = job.waitForCompletion(true);
if (!b)
throw new IOException("Error with the job - it has failed!");
return 1;
}
private static class TheMapper extends Mapper<Text, Text, Text, Text> {
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
context.write(key, value);
}
}
public static class TheReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
context.write(key, new Text("some value"));
}
}
}
package com.some.packagename;
导入java.io.IOException;
导入org.apache.hadoop.conf.Configuration;
导入org.apache.hadoop.conf.Configured;
导入org.apache.hadoop.fs.Path;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapreduce.Job;
导入org.apache.hadoop.mapreduce.Mapper;
导入org.apache.hadoop.mapreduce.Reducer;
导入org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
导入org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
导入org.apache.hadoop.util.Tool;
导入org.apache.hadoop.util.ToolRunner;
公共类MyMRJob扩展配置的实现工具{
私有静态字符串inputPath=“someHDFSInputPath”;
私有静态字符串outputPath=“someHDFSOutputPath”;
公共静态void main(字符串[]args)引发异常{
Configuration conf=新配置();
conf.set(“mapred.job.tracker”,“jtserver:8021”);
conf.set(“fs.defaultFS”hdfs://nnserver:8020");
run(conf,new MyMRJob(),args);
}
公共最终整型运行(最终字符串[]args)引发异常{
//初始化
Job Job=新作业(super.getConf(),MyMRJob.class.getSimpleName());
//通用配置
job.setJarByClass(MyMRJob.class);
//投入
设置输入路径(作业,输入路径);
setInputFormatClass(TextInputFormat.class);
//制图员
job.setMapperClass(TheMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
//减速器
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//输出
setOutputPath(作业,新路径(outputPath));
setOutputFormatClass(TextOutputFormat.class);
//执行任务
布尔b=作业。等待完成(true);
如果(!b)
抛出新IOException(“作业出错-已失败!”);
返回1;
}
私有静态类TheMapper扩展映射器{
受保护的void映射(文本键、文本值、上下文上下文)引发IOException、InterruptedException{
编写(键、值);
}
}
公共静态类减速器扩展减速器{
公共void reduce(文本键、Iterable值、上下文上下文)引发IOException、InterruptedException{
写(键,新文本(“某些值”);
}
}
}
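请检查您的
mapred-site.xml
其中很可能有某个属性的值写成了类似 "256MB" 的形式。从堆栈可以看出,该值是在 MapTask$MapOutputBuffer.init 中通过 Configuration.getInt 读取的,因此必须是不带单位的纯整数(例如 256)。请特别检查以下属性:
mapred.child.java.opts 和 io.sort.mb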
是的,io.sort.mb 被设置成了 256MB。我在代码的配置中添加了 conf.set("io.sort.mb", "256") 来覆盖它,错误就消失了。我想我应该在配置文件中修改它,作为永久的修复?——是的,请直接在文件中修改,之后应该就不会再出现这个错误了。