Java MiniMRYarnCluster,在本地运行MR
我正在尝试使用MiniMRYarnCluster在本地运行MR jobs。我使用的是旧的mapreduce(不是YARN)和mapreduce API v2。相关类可以在以下依赖中找到:
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>2.0.0-cdh4.1.1</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
这里有一个例外:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/yarn/service/CompositeService
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClassCond(ClassLoader.java:631)
at java.lang.ClassLoader.defineClass(ClassLoader.java:615)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:141)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:283)
at java.net.URLClassLoader.access$000(URLClassLoader.java:58)
at java.net.URLClassLoader$1.run(URLClassLoader.java:197)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:306)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:247)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.yarn.service.CompositeService
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:306)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:247)
... 12 more
Could not find the main class: org.apache.hadoop.mapreduce.v2.app.MRAppMaster. Program will exit.
我已经使用org.apache.hadoop.mapreduce.v2.TestMRJobs作为我自己测试的基础。有人遇到过这个问题吗
这是我的代码,它是在CI服务器或开发人员计算机上本地测试MR作业的抽象基类:
public abstract class AbstractClusterMapReduceTest {
private static final Log LOG = LogFactory.getLog(AbstractClusterMapReduceTest.class);
public static final String DEFAULT_LOG_CATALOG = "local-mr-logs";
private static final int DEFAULT_NAMENODE_PORT = 50123;
private static final int ONE_DATANODE = 1;
private static final int DEFAULT_REDUCE_NUM_TASKS = 1;
private static final String SLASH = "/";
private static final String DEFAULT_MR_INPUT_DATA_FILE = "mr-input-data-file";
private MiniMRYarnCluster mrCluster;
private MiniDFSCluster dfsCluster;
/** Shitty code from base Cloudera example*/
private static Path TEST_ROOT_DIR = new Path("target",
AbstractClusterMapReduceTest.class.getName() + "-tmpDir").makeQualified(getLocalFileSystem());
static Path APP_JAR = new Path(TEST_ROOT_DIR, "MRAppJar.jar");
private static FileSystem getLocalFileSystem(){
try {
return FileSystem.getLocal(new Configuration());
} catch (IOException e) {
throw new Error("Can't access local file system. MR cluster can't be started", e);
}
}
/**
* Always provide path to log catalog.
* Default is: ${project.build.directory}/{@link AbstractClusterMapReduceTest#DEFAULT_LOG_CATALOG}
* */
protected String getPathToLogCatalog(){
return getPathToOutputDirectory()+ SLASH + DEFAULT_LOG_CATALOG;
}
private String getPathToOutputDirectory(){
return System.getProperty("project.build.directory");
}
private void checkAppJar(){
if (!(new File(MiniMRYarnCluster.APPJAR)).exists()) {
throw new Error("MRAppJar " + MiniMRYarnCluster.APPJAR+ " not found. Not running test.");
}else{
LOG.info(MiniMRYarnCluster.APPJAR + " is at the right place. Can continue to setup Env...");
}
}
public void setupEnv() throws IOException{
checkAppJar();
System.setProperty("hadoop.log.dir", getPathToLogCatalog());
System.setProperty("javax.xml.parsers.SAXParserFactory",
"com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl");
dfsCluster = buildMiniDFSCluster();
//dfsCluster.getFileSystem().makeQualified(createPath(getHDFSPathToInputData()));
//dfsCluster.getFileSystem().makeQualified(createPath(getOutputPath()));
mrCluster = new MiniMRYarnCluster(this.getClass().getName(), 1);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", getFileSystem().getUri().toString()); // use HDFS
//conf.set(MRJobConfig.MR_AM_STAGING_DIR, getPathToOutputDirectory()+"/tmp-mapreduce");
conf.set(MRJobConfig.MR_AM_STAGING_DIR, "/apps_staging_dir");
mrCluster.init(conf);
mrCluster.start();
//Cloudera tricks :)
// Copy MRAppJar and make it private. TODO: FIXME. This is a hack to
// workaround the absent public discache.
getLocalFileSystem().copyFromLocalFile(new Path(MiniMRYarnCluster.APPJAR), APP_JAR);
getLocalFileSystem().setPermission(APP_JAR, new FsPermission("700"));
}
public void tearDown() {
if (mrCluster != null) {
mrCluster.stop();
mrCluster = null;
}
if (dfsCluster != null) {
dfsCluster.shutdown();
dfsCluster = null;
}
}
public boolean createAndSubmitJob() throws IOException, ClassNotFoundException, InterruptedException{
LOG.info("createAndSubmitJob: enter");
checkAppJar();
LOG.info("MRAppJar has been found. Can start to create Job");
Configuration configuration = mrCluster.getConfig();
configuration.set(MRConfig.MASTER_ADDRESS, "local");
Job job = Job.getInstance(configuration);
job.setJobName(this.getClass().getSimpleName()+"-job");
job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
job.setJarByClass(getMRJobClass());
job.setJobName(getMRJobClass().getSimpleName());
job.setNumReduceTasks(getReduceNumTasks());
job.setOutputKeyClass(getOutputKeyClass());
job.setOutputValueClass(getOutputValueClass());
job.setMapperClass(getMapperClass());
job.setReducerClass(getReducerClass());
job.setInputFormatClass(getInputFormat());
job.setOutputFormatClass(getOutputFormat());
FileInputFormat.setInputPaths(job, getHDFSPathToInputData());
FileOutputFormat.setOutputPath(job, createPath(getOutputPath()));
job.setSpeculativeExecution(false);
job.setMaxMapAttempts(1); // speed up failures
LOG.info("Submitting job...");
job.submit();
LOG.info("Job has been submitted.");
String trackingUrl = job.getTrackingURL();
String jobId = job.getJobID().toString();
LOG.info("trackingUrl:" +trackingUrl);
LOG.info("jobId:" +jobId);
return job.waitForCompletion(true);
}
protected FileSystem getFileSystem() throws IOException {
return dfsCluster.getFileSystem();
}
protected int getReduceNumTasks(){
return DEFAULT_REDUCE_NUM_TASKS;
}
/**
* @return InputStream instance to file you want to run with your MR job
* */
protected InputStream getInputStreamForInputData() {
return this.getClass().getClassLoader().getResourceAsStream(this.getClass().getSimpleName()+"/"+getInputDatasetName());
//return getPathToOutputDirectory()+ SLASH + DEFAULT_INPUT_CATALOG+"/mr-input-data";
}
protected String getHDFSPathToInputData() throws IOException{
InputStream inputStream = getInputStreamForInputData();
Path hdfsInputPath = new Path(DEFAULT_MR_INPUT_DATA_FILE);
FSDataOutputStream fsDataOutputStream = getFileSystem().create(hdfsInputPath);
copyStream(inputStream, fsDataOutputStream);
fsDataOutputStream.close();
inputStream.close();
return hdfsInputPath.toString();
}
private void copyStream(InputStream input, OutputStream output) throws IOException {
byte[] buffer = new byte[1024]; // Adjust if you want
int bytesRead;
while ((bytesRead = input.read(buffer)) != -1)
{
output.write(buffer, 0, bytesRead);
}
}
/**
* Dataset should be placed in resources/ConcreteClusterMapReduceTest
* @return a name of a file from catalog.
* */
protected abstract String getInputDatasetName();
/**
* @return path reducer output
* default is: @{link AbstractClusterMapReduceTest#DEFAULT_OUTPUT_CATALOG}
* */
protected String getOutputPath(){
return "mr-data-output";
}
/**
* Creates @{link Path} using absolute path to some FS resource
* @return new Path instance.
* */
protected Path createPath(String pathToFSResource){
return new Path(pathToFSResource);
}
/**
* Builds new instance of MiniDFSCluster
* Default: @{link DEFAULT_NAMENODE_PORT}, @{link DEFAULT_NAMENODE_PORT}
* @return MiniDFSCluster instance.
* */
protected MiniDFSCluster buildMiniDFSCluster() throws IOException {
return new MiniDFSCluster.Builder(new Configuration())
.nameNodePort(DEFAULT_NAMENODE_PORT)
.numDataNodes(ONE_DATANODE)
.build();
}
protected abstract Class<? extends Configured> getMRJobClass();
protected abstract Class<? extends Mapper> getMapperClass();
protected abstract Class<? extends Reducer> getReducerClass();
protected abstract Class<? extends InputFormat> getInputFormat();
protected abstract Class<? extends OutputFormat> getOutputFormat();
protected abstract Class<?> getOutputKeyClass();
protected abstract Class<?> getOutputValueClass();
}
公共抽象类AbstractClusterMapReduceTest{
私有静态最终日志日志=LogFactory.getLog(AbstractClusterMapReduceTest.class);
公共静态最终字符串DEFAULT\u LOG\u CATALOG=“local mr logs”;
私有静态final int DEFAULT_NAMENODE_PORT=50123;
私有静态final int ONE_DATANODE=1;
私有静态final int DEFAULT\u REDUCE\u NUM\u TASKS=1;
私有静态最终字符串斜杠=“/”;
私有静态最终字符串默认值\u MR\u INPUT\u DATA\u FILE=“MR INPUT DATA FILE”;
私有最小arncluster-mrCluster;
私有小型集群;
/**来自基本Cloudera示例的糟糕代码*/
私有静态路径测试\u ROOT\u DIR=新路径(“目标”,
AbstractClusterMapReduceTest.class.getName()+“-tmpDir”).makeQualified(getLocalFileSystem());
静态路径APP_JAR=新路径(TEST_ROOT_DIR,“MRAppJar.JAR”);
私有静态文件系统getLocalFileSystem(){
试一试{
返回FileSystem.getLocal(新配置());
}捕获(IOE异常){
抛出新错误(“无法访问本地文件系统。无法启动MR群集”,e);
}
}
/**
*始终提供日志目录的路径。
*默认值为:${project.build.directory}/{@link AbstractClusterMapReduceTest#Default_LOG_CATALOG}
* */
受保护的字符串getPathToLogCatalog(){
返回getPathToOutputDirectory()+斜杠+默认日志目录;
}
私有字符串getPathToOutputDirectory(){
返回System.getProperty(“project.build.directory”);
}
私有void checkAppJar(){
如果(!(新文件(MiniMRYarnCluster.APPJAR)).exists()){
抛出新错误(“MRAppJar”+MiniMRYarnCluster.APPJAR+“未找到。未运行测试”);
}否则{
LOG.info(MiniMRYarnCluster.APPJAR+“位于正确的位置。可以继续设置Env…”);
}
}
public void setupEnv()引发IOException{
checkAppJar();
setProperty(“hadoop.log.dir”,getPathToLogCatalog());
System.setProperty(“javax.xml.parsers.SAXParserFactory”,
“com.sun.org.apache.xerces.internal.jaxp.saxparserfactorympl”);
dfsCluster=buildMiniDFSCluster();
//dfsCluster.getFileSystem().makeQualified(createPath(getHDFSPathToInputData());
//dfsCluster.getFileSystem().makeQualified(createPath(getOutputPath());
mrCluster=new MiniMRYarnCluster(this.getClass().getName(),1);
Configuration conf=新配置();
conf.set(“fs.defaultFS”,getFileSystem().getUri().toString());//使用HDFS
//conf.set(MRJobConfig.MR_AM_STAGING_DIR,getPathToOutputDirectory()+“/tmp mapreduce”);
conf.set(MRJobConfig.MR_AM_STAGING_DIR,“/apps_STAGING_DIR”);
mrCluster.init(conf);
mrCluster.start();
//Cloudera技巧:)
//复制MRAppJar并将其私有化。TODO:修复我。这是对
//绕过缺席的公众讨论会。
getLocalFileSystem().copyFromLocalFile(新路径(MiniMRYarnCluster.APPJAR),APP\u JAR);
getLocalFileSystem().setPermission(APP_JAR,新的FsPermission(“700”);
}
公共无效拆卸(){
if(mrCluster!=null){
mrCluster.stop();
mrCluster=null;
}
如果(dfsCluster!=null){
dfsCluster.shutdown();
dfsCluster=null;
}
}
公共布尔createAndSubmitJob()引发IOException、ClassNotFoundException、InterruptedException{
LOG.info(“createAndSubmitJob:enter”);
checkAppJar();
LOG.info(“已找到MRAppJar。可以开始创建作业”);
Configuration=mrCluster.getConfig();
set(MRConfig.MASTER_地址,“本地”);
Job Job=Job.getInstance(配置);
job.setJobName(this.getClass().getSimpleName()+“-job”);
job.addFileToClassPath(APP_JAR);//AppMaster JAR本身。
setJarByClass(getMRJobClass());
job.setJobName(getMRJobClass().getSimpleName());
setNumReduceTasks(getReduceEnumTasks());
setOutputKeyClass(getOutputKeyClass());
setOutputValueClass(getOutputValueClass());
setMapperClass(getMapperClass());
setReducerClass(getReducerClass());
setInputFormatClass(getInputFormat());
setOutputFormatClass(getOutputFormat());
setInputPath(作业,getHDFSPathToInputData());
setOutputPath(作业,createPath(getOutputPath());
job.setSpecificationExecution(false);
job.setMaxMapAttempts(1);//加速失败
LOG.info(“提交作业…”);
job.submit();
LOG.info(“作业已提交”);
String trackingUrl=job.getTrackingURL();
字符串jobId=job.getJobID().toString();
LOG.info(“trackingUrl:+trackingUrl”);
LOG.info(“jobId:+jobId”);
返回作业。waitForCompletion(true);
}
受保护的文件系统getFileSystem()引发IOException{
返回dfsCluster.getFileSystem();
}
受保护的int getReduceEnumTasks(){
返回默认任务数量;
}
/**
*@return InputStream实例到要与MR作业一起运行的文件
* */
受保护的InputStream getInputStreamForInputData(){
返回此.getClass().getClassLoader().getResourceAsStream(此.getClass().getSimpleName()+“/”+getInputDatasetName());
//返回getPathToOutputDirectory()+斜杠+默认值\u输入\u目录+”/mr
/**
 * Abstract base class for running MR jobs on an in-process MiniMRYarnCluster
 * (YARN) backed by a MiniDFSCluster — an integration-test harness for a CI
 * server or a developer machine. Subclasses supply the job/mapper/reducer
 * classes and an input dataset resource name.
 */
public abstract class AbstractClusterMapReduceTest {
private static final Log LOG = LogFactory.getLog(AbstractClusterMapReduceTest.class);
// Log catalog name under ${project.build.directory}.
public static final String DEFAULT_LOG_CATALOG = "local-mr-logs";
private static final int DEFAULT_NAMENODE_PORT = 50123;
private static final int ONE_DATANODE = 1;
private static final int DEFAULT_REDUCE_NUM_TASKS = 1;
private static final String SLASH = "/";
// Name the input dataset gets when staged into the mini HDFS.
private static final String DEFAULT_MR_INPUT_DATA_FILE = "mr-input-data-file";
private MiniMRYarnCluster mrCluster;
private MiniDFSCluster dfsCluster;
/** Staging layout borrowed from the base Cloudera example (TestMRJobs). */
private static Path TEST_ROOT_DIR = new Path("target",
AbstractClusterMapReduceTest.class.getName() + "-tmpDir").makeQualified(getLocalFileSystem());
static Path APP_JAR = new Path(TEST_ROOT_DIR, "MRAppJar.jar");
// Local FS is required for staging the MRAppMaster jar; fail hard otherwise.
private static FileSystem getLocalFileSystem(){
try {
return FileSystem.getLocal(new Configuration());
} catch (IOException e) {
throw new Error("Can't access local file system. MR cluster can't be started", e);
}
}
/**
 * Always provide path to log catalog.
 * Default is: ${project.build.directory}/{@link AbstractClusterMapReduceTest#DEFAULT_LOG_CATALOG}
 * */
protected String getPathToLogCatalog(){
return getPathToOutputDirectory()+ SLASH + DEFAULT_LOG_CATALOG;
}
// Resolved from a system property the build is expected to set
// (e.g. via surefire systemPropertyVariables); may be null if unset.
private String getPathToOutputDirectory(){
return System.getProperty("project.build.directory");
}
// Fails fast when the MRAppMaster jar has not been built/staged yet.
private void checkAppJar(){
if (!(new File(MiniMRYarnCluster.APPJAR)).exists()) {
throw new Error("MRAppJar " + MiniMRYarnCluster.APPJAR+ " not found. Not running test.");
}else{
LOG.info(MiniMRYarnCluster.APPJAR + " is at the right place. Can continue to setup Env...");
}
}
// Boots mini DFS + mini YARN and stages the MRAppMaster jar via the local FS.
public void setupEnv() throws IOException{
checkAppJar();
System.setProperty("hadoop.log.dir", getPathToLogCatalog());
// Pin the JDK-bundled SAX parser to avoid a conflicting parser on the test classpath.
System.setProperty("javax.xml.parsers.SAXParserFactory",
"com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl");
dfsCluster = buildMiniDFSCluster();
//dfsCluster.getFileSystem().makeQualified(createPath(getHDFSPathToInputData()));
//dfsCluster.getFileSystem().makeQualified(createPath(getOutputPath()));
mrCluster = new MiniMRYarnCluster(this.getClass().getName(), 1);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", getFileSystem().getUri().toString()); // use HDFS
//conf.set(MRJobConfig.MR_AM_STAGING_DIR, getPathToOutputDirectory()+"/tmp-mapreduce");
conf.set(MRJobConfig.MR_AM_STAGING_DIR, "/apps_staging_dir");
mrCluster.init(conf);
mrCluster.start();
//Cloudera tricks :)
// Copy MRAppJar and make it private. TODO: FIXME. This is a hack to
// workaround the absent public discache.
getLocalFileSystem().copyFromLocalFile(new Path(MiniMRYarnCluster.APPJAR), APP_JAR);
getLocalFileSystem().setPermission(APP_JAR, new FsPermission("700"));
}
// Stops both clusters; idempotent and safe when setup never ran.
public void tearDown() {
if (mrCluster != null) {
mrCluster.stop();
mrCluster = null;
}
if (dfsCluster != null) {
dfsCluster.shutdown();
dfsCluster = null;
}
}
/**
 * Builds the job from subclass-provided classes, submits it to the mini
 * cluster and blocks until completion.
 * @return true when the job completed successfully.
 * */
public boolean createAndSubmitJob() throws IOException, ClassNotFoundException, InterruptedException{
LOG.info("createAndSubmitJob: enter");
checkAppJar();
LOG.info("MRAppJar has been found. Can start to create Job");
Configuration configuration = mrCluster.getConfig();
configuration.set(MRConfig.MASTER_ADDRESS, "local");
Job job = Job.getInstance(configuration);
job.setJobName(this.getClass().getSimpleName()+"-job");
job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
job.setJarByClass(getMRJobClass());
// NOTE(review): this second setJobName() silently overrides the one above.
job.setJobName(getMRJobClass().getSimpleName());
job.setNumReduceTasks(getReduceNumTasks());
job.setOutputKeyClass(getOutputKeyClass());
job.setOutputValueClass(getOutputValueClass());
job.setMapperClass(getMapperClass());
job.setReducerClass(getReducerClass());
job.setInputFormatClass(getInputFormat());
job.setOutputFormatClass(getOutputFormat());
FileInputFormat.setInputPaths(job, getHDFSPathToInputData());
FileOutputFormat.setOutputPath(job, createPath(getOutputPath()));
job.setSpeculativeExecution(false);
job.setMaxMapAttempts(1); // speed up failures
LOG.info("Submitting job...");
job.submit();
LOG.info("Job has been submitted.");
String trackingUrl = job.getTrackingURL();
String jobId = job.getJobID().toString();
LOG.info("trackingUrl:" +trackingUrl);
LOG.info("jobId:" +jobId);
return job.waitForCompletion(true);
}
protected FileSystem getFileSystem() throws IOException {
return dfsCluster.getFileSystem();
}
// Number of reduce tasks; override to change from the default of 1.
protected int getReduceNumTasks(){
return DEFAULT_REDUCE_NUM_TASKS;
}
/**
 * @return InputStream instance to file you want to run with your MR job
 *         (classpath resource SimpleClassName/getInputDatasetName());
 *         may be null when the resource is missing
 * */
protected InputStream getInputStreamForInputData() {
return this.getClass().getClassLoader().getResourceAsStream(this.getClass().getSimpleName()+"/"+getInputDatasetName());
//return getPathToOutputDirectory()+ SLASH + DEFAULT_INPUT_CATALOG+"/mr-input-data";
}
// Stages the input dataset into the mini HDFS and returns its HDFS path.
// NOTE(review): if the resource is missing, copyStream() below fails with an NPE.
protected String getHDFSPathToInputData() throws IOException{
InputStream inputStream = getInputStreamForInputData();
Path hdfsInputPath = new Path(DEFAULT_MR_INPUT_DATA_FILE);
FSDataOutputStream fsDataOutputStream = getFileSystem().create(hdfsInputPath);
copyStream(inputStream, fsDataOutputStream);
fsDataOutputStream.close();
inputStream.close();
return hdfsInputPath.toString();
}
// Copies all bytes from input to output; neither stream is closed here.
private void copyStream(InputStream input, OutputStream output) throws IOException {
byte[] buffer = new byte[1024]; // Adjust if you want
int bytesRead;
while ((bytesRead = input.read(buffer)) != -1)
{
output.write(buffer, 0, bytesRead);
}
}
/**
 * Dataset should be placed in resources/ConcreteClusterMapReduceTest
 * @return a name of a file from catalog.
 * */
protected abstract String getInputDatasetName();
/**
 * @return path reducer output
 * default is: "mr-data-output"
 * */
protected String getOutputPath(){
return "mr-data-output";
}
/**
 * Creates {@link Path} using absolute path to some FS resource
 * @return new Path instance.
 * */
protected Path createPath(String pathToFSResource){
return new Path(pathToFSResource);
}
/**
 * Builds new instance of MiniDFSCluster with one datanode on
 * {@link #DEFAULT_NAMENODE_PORT}.
 * @return MiniDFSCluster instance.
 * */
protected MiniDFSCluster buildMiniDFSCluster() throws IOException {
return new MiniDFSCluster.Builder(new Configuration())
.nameNodePort(DEFAULT_NAMENODE_PORT)
.numDataNodes(ONE_DATANODE)
.build();
}
protected abstract Class<? extends Configured> getMRJobClass();
protected abstract Class<? extends Mapper> getMapperClass();
protected abstract Class<? extends Reducer> getReducerClass();
protected abstract Class<? extends InputFormat> getInputFormat();
protected abstract Class<? extends OutputFormat> getOutputFormat();
protected abstract Class<?> getOutputKeyClass();
protected abstract Class<?> getOutputValueClass();
}
/**
 * Concrete integration test: runs the POI MapReduce job on the in-process
 * YARN + HDFS mini clusters provided by {@link AbstractClusterMapReduceTest}.
 * TestNG lifecycle: the cluster is started once per class and torn down after.
 */
public class POIClusterMapreduceTest extends AbstractClusterMapReduceTest{
    private static final String INTEGRATION = "integration";
    /** The first (and, with one reducer, only) reducer output file name. */
    private static final String FIRST_REDUCER_OUTPUT_FILE = "part-r-00000";

    @BeforeClass(groups = INTEGRATION)
    public void setup() throws IOException {
        super.setupEnv();
    }
    //@Test(groups = INTEGRATION)
    public void runJob() throws InterruptedException, IOException, ClassNotFoundException {
        boolean result = createAndSubmitJob();
        MatcherAssert.assertThat(result, Matchers.is(true));
        // BUG FIX: getOutputPath() is a directory; open the reducer's part file
        // inside it instead of the directory itself. TextOutputFormat emits
        // plain text, so read it with a Reader rather than readUTF() (which
        // expects DataOutput.writeUTF framing).
        Path resultFile = createPath(getOutputPath() + "/" + FIRST_REDUCER_OUTPUT_FILE);
        java.io.BufferedReader reader = new java.io.BufferedReader(
                new java.io.InputStreamReader(getFileSystem().open(resultFile), "UTF-8"));
        StringBuilder outputResult = new StringBuilder();
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                outputResult.append(line).append('\n');
            }
        } finally {
            reader.close();
        }
        MatcherAssert.assertThat(outputResult.length(), Matchers.greaterThan(0));
    }
    @AfterClass(groups = INTEGRATION)
    public void tearDown(){
        super.tearDown();
    }
    @Override
    protected Class<Main> getMRJobClass() {
        return Main.class;
    }
    @Override
    protected Class<POIMapper> getMapperClass() {
        return POIMapper.class;
    }
    @Override
    protected Class<Reducer> getReducerClass() {
        // Identity reducer: the base Hadoop Reducer passes values through.
        return Reducer.class;
    }
    @Override
    protected Class<TextInputFormat> getInputFormat() {
        return TextInputFormat.class;
    }
    @Override
    protected Class<TextOutputFormat> getOutputFormat() {
        return TextOutputFormat.class;
    }
    @Override
    protected Class<LongWritable> getOutputKeyClass() {
        return LongWritable.class;
    }
    @Override
    protected Class<XVLRDataWritable> getOutputValueClass() {
        return XVLRDataWritable.class;
    }
    @Override
    protected String getInputDatasetName() {
        // Resolved as classpath resource POIClusterMapreduceTest/mr-input-data.
        return "mr-input-data";
    }
}
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.common.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.common.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<classifier>tests</classifier>
<version>${hadoop.common.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-test</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.version>2.0.0-mr1-cdh4.1.1</hadoop.version>
<hadoop.common.version>2.0.0-cdh4.1.1</hadoop.common.version>
</properties>
/**
 * Abstract base class for MR integration tests against the classic MR1
 * {@code MiniMRCluster}, reading input from and writing output to the local
 * file system. Subclasses supply the input dataset name and, optionally, an
 * "etalon" (expected) output file for result comparison.
 */
public abstract class AbstractClusterMRTest {
    private static final Log LOG = LogFactory.getLog(AbstractClusterMRTest.class);
    /** Catalog (under ${project.build.directory}) where Hadoop writes its logs. */
    public static final String DEFAULT_LOG_CATALOG = "local-mr-logs";
    public static final String SLASH = "/";
    /** Catalog name for reducer output under the build directory. */
    public static final String MR_DATA_OUTPUT = "mr-data-output";
    /** Name of the first reducer's output file inside the output catalog. */
    public static final String DEFAULT_OUTPUT_FILE_NAME = "part-r-00000";
    private static final int DEFAULT_REDUCE_NUM_TASKS = 1;
    private Configuration configuration;
    private FileSystem localFileSystem;
    private MiniMRCluster mrCluster;
    private JobConf mrClusterConf;

    /**
     * Always provide path to log catalog, creating it when absent.
     */
    protected String getPathToLogCatalog() {
        File logCatalog = new File(getPathToOutputDirectory() + SLASH + DEFAULT_LOG_CATALOG);
        // BUG FIX: mkdirs() (not mkdir()) also creates a missing parent build
        // directory, and a failure is now reported instead of swallowed.
        if (!logCatalog.exists() && !logCatalog.mkdirs()) {
            throw new Error("Can't create log catalog at: " + logCatalog.getAbsolutePath());
        }
        LOG.info("Path to log catalog is: " + logCatalog.getAbsolutePath());
        return logCatalog.getAbsolutePath();
    }

    private String getPathToOutputDirectory() {
        // Expected to be set by the build (e.g. surefire systemPropertyVariables).
        return System.getProperty("project.build.directory");
    }

    /** Boots a single-tracker MiniMRCluster on top of the local file system. */
    public void setup() throws IOException {
        System.setProperty("hadoop.log.dir", getPathToLogCatalog());
        configuration = new Configuration(true);
        localFileSystem = FileSystem.get(configuration);
        mrCluster = new MiniMRCluster(1, localFileSystem.getUri().toString(), 1, null, null, new JobConf(configuration));
        mrClusterConf = mrCluster.createJobConf();
    }

    /** Shuts the mini cluster down; safe to call when setup never ran. */
    public void tearDown() {
        if (mrCluster != null) {
            mrCluster.shutdown();
            mrCluster = null;
        }
    }

    /**
     * Use this method to get JobBuilder configured for testing purposes.
     * @return JobBuilder instance ready for further configuration.
     */
    public JobBuilder createTestableJobInstance() throws IOException {
        return new JobBuilder(mrClusterConf, this.getClass().getSimpleName() + "-mrjob")
                .withNumReduceTasks(DEFAULT_REDUCE_NUM_TASKS);
    }

    /**
     * Pass configured JobBuilder and wait for completion.
     * @param jobBuilder is a JobBuilder ready to submit
     * @return job completion result.
     */
    public boolean buildSubmitAndWaitForCompletion(JobBuilder jobBuilder)
            throws InterruptedException, IOException, ClassNotFoundException {
        String pathToInputFile = getPathToInputData();
        checkThatFileExists(pathToInputFile);
        Job job = jobBuilder.build();
        FileInputFormat.setInputPaths(job, pathToInputFile);
        FileOutputFormat.setOutputPath(job, createPath(getOutputPath()));
        LOG.info("Submitting job...");
        job.submit();
        LOG.info("Job has been submitted.");
        String trackingUrl = job.getTrackingURL();
        String jobId = job.getJobID().toString();
        LOG.info("trackingUrl:" + trackingUrl);
        LOG.info("jobId:" + jobId);
        return job.waitForCompletion(true);
    }

    /**
     * By declaration input data should be stored in test/resources folder:
     * ConcreteTestClassName/in/getInputDatasetName()
     * Don't forget to override {@link AbstractClusterMRTest#getInputDatasetName()}
     * @return path to input data
     */
    protected String getPathToInputData() {
        String pathFile = this.getClass().getSimpleName() + SLASH + "in" + SLASH + getInputDatasetName();
        LOG.info("Path for getting URL to file:" + pathFile);
        URL urlToFile = this.getClass().getClassLoader().getResource(pathFile);
        if (urlToFile == null) {
            // BUG FIX: FileUtils.toFile(null) returns null, so the original
            // code failed later with a bare NPE; report the missing resource.
            throw new Error("Input dataset not found on classpath: " + pathFile);
        }
        File file = FileUtils.toFile(urlToFile);
        return file.getAbsolutePath();
    }

    /**
     * Dataset should be placed in resources/ConcreteClusterMapReduceTest
     * @return a name of a file from catalog.
     */
    protected abstract String getInputDatasetName();

    /**
     * @return path reducer output; default is
     *         ${project.build.directory}/{@link #MR_DATA_OUTPUT}
     */
    protected String getOutputPath() {
        return getPathToOutputDirectory() + SLASH + MR_DATA_OUTPUT;
    }

    /**
     * @return text lines from reducer output file.
     */
    protected List<String> getLinesFromOutputFile() throws IOException {
        String pathToResult = getOutputPath() + SLASH + DEFAULT_OUTPUT_FILE_NAME;
        File resultFile = new File(pathToResult);
        return FileUtils.readLines(resultFile);
    }

    /** @return classpath file name of the expected ("etalon") output. */
    public abstract String getEtalonOutputFileName();

    /**
     * Loads the expected output lines from the classpath resource
     * SimpleClassName/out/getEtalonOutputFileName().
     */
    protected List<String> getLinesFromEtalonOutputFile() throws IOException {
        String pathFile = this.getClass().getSimpleName() + SLASH + "out" + SLASH + getEtalonOutputFileName();
        LOG.debug("path to etalon file: " + pathFile);
        URL urlToFile = this.getClass().getClassLoader().getResource(pathFile);
        if (urlToFile == null) {
            // Same NPE-avoidance as getPathToInputData().
            throw new Error("Etalon output file not found on classpath: " + pathFile);
        }
        File file = FileUtils.toFile(urlToFile);
        return FileUtils.readLines(file);
    }

    /**
     * Creates {@link Path} using absolute path to some FS resource.
     * @return new Path instance.
     */
    protected Path createPath(String pathToFSResource) {
        return new Path(pathToFSResource);
    }

    /** Fails fast with a descriptive message when the input file is absent. */
    public void checkThatFileExists(String absolutePathToFile) {
        if (!new File(absolutePathToFile).exists()) {
            throw new Error("Path to input file is incorrect. Can't run MR job. Incorrect path is:" + absolutePathToFile);
        }
    }
}