Java 在Hadoop 2.2.0中打开缓存文件
使用Java 在Hadoop 2.2.0中打开缓存文件,java,hadoop,mapreduce,yarn,Java,Hadoop,Mapreduce,Yarn,使用job.addCacheFile()将缓存文件添加到作业中,并使用我的映射器使用context.getCacheFiles()将其下拉后。如何打开缓存文件。我试过使用: BufferedReader reader=new BufferedReader(new FileReader(filename))(注释如下) 其中filename是URI的toString()。有人能帮我吗 import java.io.*; import java.net.*; import java.util.*;
job.addCacheFile()
将缓存文件添加到作业中,并使用我的映射器使用context.getCacheFiles()
将其下拉后。如何打开缓存文件。我试过使用:
BufferedReader reader=new BufferedReader(new FileReader(filename))代码>(注释如下)
其中filename是URI
的toString()。有人能帮我吗
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.chain.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.mapreduce.lib.reduce.*;
public class UFOLocation2
{
public static class MapClass extends Mapper<LongWritable, Text, Text, LongWritable>
{
private final static LongWritable one = new LongWritable(1);
private static Pattern locationPattern = Pattern.compile("[a-zA-Z]{2}[^a-zA-Z]*$");
private Map<String, String> stateNames;
@Override
public void setup(Context context)
{
try
{
URI[] cacheFiles = context.getCacheFiles();
setupStateMap(cacheFiles[0].toString());
}
catch (IOException ioe)
{
System.err.println("Error reading state file.");
ioe.printStackTrace();
System.exit(1);
}
}
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String line = value.toString();
String[] fields = line.split("\t");
String location = fields[2].trim();
if (location.length() >= 2)
{
Matcher matcher = locationPattern.matcher(location);
if (matcher.find())
{
int start = matcher.start();
String state = location.substring(start, start + 2);
context.write(new Text(lookupState(state.toUpperCase())), one);
}
}
}
private void setupStateMap(String filename) throws IOException
{
Map<String, String> states = new HashMap<String, String>();
// the following line causes an IOException
BufferedReader reader = new BufferedReader(new FileReader(filename));
String line = reader.readLine();
while (line != null)
{
String[] split = line.split("\t");
states.put(split[0], split[1]);
line = reader.readLine();
}
stateNames = states;
}
private String lookupState(String state)
{
String fullName = stateNames.get(state);
return fullName == null ? "Other" : fullName;
}
}
public static void main(String[] args) throws Exception
{
Configuration config = new Configuration();
Job job = Job.getInstance(config, "UFO Location 2");
job.setJarByClass(UFOLocation2.class);
job.addCacheFile(new URI("/user/kevin/data/states.txt"));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
Configuration mapconf1 = new Configuration(false);
ChainMapper.addMapper(job, UFORecordValidationMapper.class, LongWritable.class,
Text.class, LongWritable.class, Text.class, mapconf1);
Configuration mapconf2 = new Configuration(false);
ChainMapper.addMapper(job, MapClass.class, LongWritable.class,
Text.class, Text.class, LongWritable.class, mapconf2);
job.setMapperClass(ChainMapper.class);
job.setCombinerClass(LongSumReducer.class);
job.setReducerClass(LongSumReducer.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
import java.io.*;
导入java.net。*;
导入java.util.*;
导入java.util.regex.*;
导入org.apache.hadoop.conf.*;
导入org.apache.hadoop.fs.Path;
导入org.apache.hadoop.io.*;
导入org.apache.hadoop.mapreduce.*;
导入org.apache.hadoop.mapreduce.lib.chain.*;
导入org.apache.hadoop.mapreduce.lib.input.*;
导入org.apache.hadoop.mapreduce.lib.output.*;
导入org.apache.hadoop.mapreduce.lib.reduce.*;
公共类UFOLocation2
{
公共静态类映射器类扩展映射器
{
private final static LongWritable one=新的LongWritable(1);
私有静态模式locationPattern=Pattern.compile(“[a-zA-Z]{2}[^a-zA-Z]*$”;
私有地图州名;
@凌驾
公共无效设置(上下文)
{
尝试
{
URI[]cacheFiles=context.getCacheFiles();
setupStateMap(缓存文件[0].toString());
}
捕获(ioe异常ioe)
{
System.err.println(“读取状态文件时出错”);
ioe.printStackTrace();
系统出口(1);
}
}
公共void映射(可长写键、文本值、上下文)
抛出IOException、InterruptedException
{
字符串行=value.toString();
String[]fields=line.split(“\t”);
字符串位置=字段[2]。trim();
if(location.length()>=2)
{
Matcher Matcher=locationPattern.Matcher(位置);
if(matcher.find())
{
int start=matcher.start();
字符串状态=位置.子字符串(开始,开始+2);
write(新文本(lookupState(state.toUpperCase()),一个);
}
}
}
私有void setupStateMap(字符串文件名)引发IOException
{
映射状态=新的HashMap();
//以下行导致IOException
BufferedReader reader=新的BufferedReader(新文件读取器(文件名));
字符串行=reader.readLine();
while(行!=null)
{
String[]split=line.split(“\t”);
出售(分割[0],分割[1]);
line=reader.readLine();
}
州名=州;
}
私有字符串lookupState(字符串状态)
{
字符串fullName=stateNames.get(state);
返回fullName==null?“其他”:fullName;
}
}
公共静态void main(字符串[]args)引发异常
{
配置配置=新配置();
Job Job=Job.getInstance(配置,“UFO位置2”);
job.setJarByClass(UFOLocation2.class);
addCacheFile(新URI(“/user/kevin/data/states.txt”);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
Configuration mapconf1=新配置(错误);
ChainMapper.addMapper(作业、UForeRecordValidationMapper.class、LongWritable.class、,
Text.class、LongWritable.class、Text.class、mapconf1);
Configuration mapconf2=新配置(错误);
ChainMapper.addMapper(作业,MapClass.class,LongWritable.class,
Text.class、Text.class、LongWritable.class、mapconf2);
setMapperClass(ChainMapper.class);
job.setCombinerClass(LongSumReducer.class);
job.setReducerClass(LongSumReducer.class);
addInputPath(作业,新路径(args[0]);
setOutputPath(作业,新路径(args[1]);
系统退出(作业等待完成(真)?0:1;
}
}
这是代码问题还是配置问题?我在一个假定为psuedo的分布式集群上运行您可以使用类似的东西
Path path = new Path(uri[0].getPath().toString());
if (fileSystem.exists(path)) {
FSDataInputStream dataInputStream = fileSystem.open(path);
byte[] data = new byte[1024];
while (dataInputStream.read(data) > 0) {
//do your stuff here
}
dataInputStream.close();
}