从mapreduce中的DistributedCache读取HAR文件
我已经编写了一个oozie工作流,它先创建HAR归档,然后运行MR作业,该作业需要从该归档中读取数据。 1.归档已创建 2.作业运行时,映射器可以在分布式缓存中看到该归档。 3.我该如何读取它?从这个归档中逐行读取数据应该使用什么API(我的HAR中包含一批以换行符分隔的文本文件)? 注意:当我处理存储在DistributedCache中的普通文件(不是HAR归档)时,一切工作正常。问题只出现在尝试从HAR读取数据时。 以下是一段代码片段:从mapreduce中的DistributedCache读取HAR文件,mapreduce,hdfs,cloudera,distributed-cache,Mapreduce,Hdfs,Cloudera,Distributed Cache,我已经编写了一个oozie工作流,它先创建HAR归档,然后运行MR作业,该作业需要从该归档中读取数据。 1.归档已创建 2.作业运行时,映射器可以在分布式缓存中看到该归档。 3.我该如何读取它?从这个归档中逐行读取数据应该使用什么API(我的HAR中包含一批以换行符分隔的文本文件)? 注意:当我处理存储在DistributedCache中的普通文件(不是HAR归档)时,一切工作正常。问题只出现在尝试从HAR读取数据时。 以下是一段代码片段:
// Question snippet: tries to open a HAR archive found in the DistributedCache as a stream.
// The enclosing method header is not part of this snippet.
InputStream inputStream;
// The expected archive name is read from the JVM system property named by DIST_CACHE_FILE_NAME.
String cachedDatafileName = System.getProperty(DIST_CACHE_FILE_NAME);
LOG.info(String.format("Looking for[%s]=[%s] in DistributedCache",DIST_CACHE_FILE_NAME, cachedDatafileName));
// All archive URIs registered in the DistributedCache for this job configuration.
URI[] uris = DistributedCache.getCacheArchives(getContext().getConfiguration());
URI uriToCachedDatafile = null;
// Pick the first cached archive whose URI string ends with the expected name.
for(URI uri : uris){
if(uri.toString().endsWith(cachedDatafileName)){
uriToCachedDatafile = uri;
break;
}
}
// No matching archive: fail with a message naming the property and the value searched for.
if(uriToCachedDatafile == null){
throw new RuntimeConfigurationException(String.format("Looking for[%s]=[%s] in DistributedCache failed. There is no such file",
DIST_CACHE_FILE_NAME, cachedDatafileName));
}
// The path is built from the archive URI itself, not from an entry inside the archive.
Path pathToFile = new Path(uriToCachedDatafile);
LOG.info(String.format("[%s] has been found. Uri is: [%s]. The path is:[%s]",cachedDatafileName, uriToCachedDatafile, pathToFile));
FileSystem fileSystem = pathToFile.getFileSystem(getContext().getConfiguration());
// The underlying file system is wrapped in a HarFileSystem; initialize() is never called on the wrapper here.
HarFileSystem harFileSystem = new HarFileSystem(fileSystem);
// NOTE(review): the author reports a NullPointerException on the next line. Presumably the cause is the
// un-initialized wrapper and/or opening the archive itself instead of a har:// path to a file inside it - confirm.
inputStream = harFileSystem.open(pathToFile); //NULL POINTER EXCEPTION IS HERE!
return inputStream;
正如你所看到的,这种做法很糟糕。您必须手动读取存储在归档中的索引文件,并根据索引文件的元数据重建路径。如果您知道归档中所存文件的确切名称(如我的示例中那样),则可以手动构造路径。
这并不方便。我原本期望有类似 Zip -> ZipEntry 的机制,可以在不了解归档内部结构的情况下遍历其中的条目。
/**
 * Opens a stream onto a single text file stored inside a HAR archive.
 *
 * <p>The method first verifies that an archive whose URI ends with the value of the
 * {@code DIST_CACHE_FILE_NAME} system property is registered in the DistributedCache.
 * It then opens a fixed entry of the archive through a {@code har:///} path, which makes
 * {@code Path.getFileSystem} hand back a file system able to look inside the archive.
 *
 * <p>NOTE(review): the archive location and the entry name are still hard-coded to a
 * developer test directory; they should eventually be derived from the located cache URI.
 *
 * @return an open stream positioned at the start of the archived file; the caller must close it
 * @throws IOException if the archive cannot be listed or the entry cannot be opened
 * @throws RuntimeConfigurationException if no archive in the DistributedCache matches the
 *         configured name (including when the property or the archive list is missing)
 */
protected InputStream getInputStreamToDistCacheFile() throws IOException{
    String cachedDatafileName = System.getProperty(DIST_CACHE_FILE_NAME);
    LOG.info(String.format("Looking for[%s]=[%s] in DistributedCache",DIST_CACHE_FILE_NAME, cachedDatafileName));

    URI[] uris = DistributedCache.getCacheArchives(getContext().getConfiguration());
    URI uriToCachedDatafile = null;
    // Both the property and the archive list may be absent; treat that as "not found"
    // so the caller gets the descriptive configuration error instead of a bare NPE.
    if(cachedDatafileName != null && uris != null){
        for(URI uri : uris){
            if(uri.toString().endsWith(cachedDatafileName)){
                uriToCachedDatafile = uri;
                break;
            }
        }
    }
    if(uriToCachedDatafile == null){
        throw new RuntimeConfigurationException(String.format("Looking for[%s]=[%s] in DistributedCache failed. There is no such file",
                DIST_CACHE_FILE_NAME, cachedDatafileName));
    }

    // The har:/// scheme is what selects the archive-aware file system for this path.
    String archiveRoot = "har:///"+"home/ssa/devel/megalabs/kyc-solution/kyc-mrjob/target/test-classes/GSMCellSubscriberHomeIntersectionJobDescriptionClusterMRTest/in/gsm_cell_location_stf.har";
    Path pathToFile = new Path(archiveRoot +"/stf/db_bts_stf.txt");
    LOG.info(String.format("[%s] has been found. Uri is: [%s]. The path is:[%s]",cachedDatafileName, uriToCachedDatafile, pathToFile));

    FileSystem harFileSystem = pathToFile.getFileSystem(getContext().getConfiguration());

    // Diagnostic listing of the archive root. It runs before the stream is opened so that
    // a failure here cannot leak an already opened stream.
    FileStatus[] statuses = harFileSystem.listStatus(new Path(archiveRoot));
    if(statuses != null){
        for(FileStatus fileStatus : statuses){
            LOG.info("fileStatus isDir"+fileStatus.isDirectory() +" len:" + fileStatus.getLen());
        }
    }

    FSDataInputStream fin = harFileSystem.open(pathToFile);
    LOG.info("fin: " + fin);
    return fin;
}