Hadoop 用于单行和多行日志的自定义RecordReader
我正在尝试创建一个MR作业,该作业将更改通过Flume加载到HDFS中的日志文件的格式。我正在尝试将日志转换为字段以“:::”分隔的格式。例如:
date/timestamp:::log-level:::rest-of-log
我遇到的问题是,有些日志是单行的,有些是多行的,我需要在其余的日志字段中保持多行日志的完整性。我编写了一个自定义的InputFormat和RecordReader来尝试这样做(基本上就是把NLineRecordReader修改为追加行,直到它到达一个日期戳,而不是追加固定数量的行)。我用来格式化日志的MR作业似乎工作正常,但是RecordReader似乎无法正确地处理多行记录,我不知道为什么。
这是我的RecordReader类:
public class LogRecordReader extends RecordReader<LongWritable, Text> {
private LineReader in;
private LongWritable key;
private Text value = new Text();
private long start = 0;
private long end = 0;
private long pos = 0;
private int maxLineLength;
private Text line = new Text(); // working line
private Text lineHasDate = new Text(); // if line encounters a date stamp, hold it here
public void close() throws IOException {
if (in != null) {
in.close();
}
}
public LongWritable getCurrentKey() throws IOException,InterruptedException {
return key;
}
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
public float getProgress() throws IOException, InterruptedException {
if (start == end) {
return 0.0f;
}
else {
return Math.min(1.0f, (pos - start) / (float)(end - start));
}
}
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
final Path file = split.getPath();
Configuration conf = context.getConfiguration();
this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength",Integer.MAX_VALUE);
FileSystem fs = file.getFileSystem(conf);
start = split.getStart();
end = start + split.getLength();
boolean skipFirstLine = false;
FSDataInputStream filein = fs.open(split.getPath());
// if we're not starting at the beginning, we should skip the first line
if (start != 0){
skipFirstLine = true;
--start;
filein.seek(start);
}
in = new LineReader(filein, conf);
// if we should skip the first line
if(skipFirstLine){
start += in.readLine(new Text(), 0, (int)Math.min((long)Integer.MAX_VALUE, end - start));
}
this.pos = start;
}
/**
* create a complete log message from individual lines using date/time stamp as a breakpoint
*/
public boolean nextKeyValue() throws IOException, InterruptedException {
// if key has not yet been initialized
if (key == null) {
key = new LongWritable();
}
key.set(pos);
// if value has not yet been initialized
if (value == null) {
value = new Text();
}
value.clear();
final Text endline = new Text("\n");
int newSize = 0;
// if a line with a date was encountered on the previous call
if (lineHasDate.getLength() > 0) {
while (pos < end) {
value.append(lineHasDate.getBytes(), 0, lineHasDate.getLength()); // append the line
value.append(endline.getBytes(), 0, endline.getLength()); // append a line break
pos += newSize;
if (newSize == 0) break;
}
lineHasDate.clear(); // clean up
}
// to check buffer 'line' for date/time stamp
Pattern regexDateTime = Pattern.compile("^\\d{2}\\s\\S+\\s\\d{4}\\s\\d{2}:\\d{2}:\\d{2},\\d{3}\\s");
Matcher matcherDateTime = regexDateTime.matcher(line.toString());
// read in a new line to the buffer 'line'
newSize = in.readLine(line, maxLineLength, Math.max((int)Math.min(Integer.MAX_VALUE, end-pos), maxLineLength));
// if the line in the buffer contains a date/time stamp, append it
if (matcherDateTime.find()) {
while (pos < end) {
newSize = in.readLine(line, maxLineLength, Math.max((int)Math.min(Integer.MAX_VALUE, end-pos), maxLineLength));
value.append(line.getBytes(), 0, line.getLength()); // append the line
value.append(endline.getBytes(), 0, endline.getLength()); // append a line break
if (newSize == 0) break;
pos += newSize;
if (newSize < maxLineLength) break;
}
// read in the next line to the buffer 'line'
newSize = in.readLine(line, maxLineLength, Math.max((int)Math.min(Integer.MAX_VALUE, end-pos), maxLineLength));
}
// while lines in the buffer do not contain date/time stamps, append them
while(!matcherDateTime.find()) {
newSize = in.readLine(line, maxLineLength, Math.max((int)Math.min(Integer.MAX_VALUE, end-pos), maxLineLength));
value.append(line.getBytes(), 0, line.getLength()); // append the line
value.append(endline.getBytes(), 0, endline.getLength()); // append a line break
if (newSize == 0) break;
pos += newSize;
if (newSize < maxLineLength) break;
// read in the next line to the buffer 'line', and continue looping
newSize = in.readLine(line, maxLineLength, Math.max((int)Math.min(Integer.MAX_VALUE, end-pos), maxLineLength));
}
// if the line in the buffer contains a date/time stamp (which it should since the loop broke) save it for next call
if (matcherDateTime.find()) lineHasDate = line;
// if there is no new line
if (newSize == 0) {
// TODO: if lineHasDate is the last line in the file, it must be appended (?)
key = null;
value = null;
return false;
}
return true;
}
}
公共类LogRecordReader扩展了RecordReader{
专用线路阅读器;
私钥;
私有文本值=新文本();
专用长启动=0;
专用长端=0;
私人长pos=0;
私有整数maxLineLength;
私有文本行=新文本();//工作行
私有文本lineHasDate=new Text();//如果行遇到日期戳,请将其保留在此处
public void close()引发IOException{
if(in!=null){
in.close();
}
}
public LongWritable getCurrentKey()引发IOException、InterruptedException{
返回键;
}
公共文本getCurrentValue()引发IOException、InterruptedException{
返回值;
}
public float getProgress()引发IOException、InterruptedException{
如果(开始==结束){
返回0.0f;
}
否则{
返回数学最小值(1.0f,(pos-start)/(float)(end-start));
}
}
public void initialize(InputSplit genericSplit,TaskAttemptContext上下文)引发IOException、InterruptedException{
FileSplit split=(FileSplit)genericSplit;
最终路径文件=split.getPath();
conf=context.getConfiguration();
this.maxLineLength=conf.getInt(“mapred.linerecordreader.maxlength”,Integer.MAX_值);
FileSystem fs=file.getFileSystem(conf);
start=split.getStart();
end=start+split.getLength();
布尔skipFirstLine=false;
FSDataInputStream filein=fs.open(split.getPath());
//如果我们不是从头开始,我们应该跳过第一行
如果(开始!=0){
skipFirstLine=true;
--开始;
filein.seek(开始);
}
in=新的LineReader(文件输入,配置);
//如果我们跳过第一行
if(skipFirstLine){
start+=in.readLine(new Text(),0,(int)Math.min((long)Integer.MAX_值,end-start));
}
this.pos=开始;
}
/**
*使用日期/时间戳作为断点,从各行创建完整的日志消息
*/
公共布尔值nextKeyValue()引发IOException、InterruptedException{
//如果密钥尚未初始化
如果(key==null){
key=新的LongWritable();
}
按键设置(pos);
//如果值尚未初始化
如果(值==null){
值=新文本();
}
value.clear();
最终文本结束行=新文本(“\n”);
int newSize=0;
//如果在上一次通话中遇到带有日期的线路
如果(lineHasDate.getLength()>0){
while(pos
下面是格式化日志的MR作业:
public class FlumeLogFormat extends Configured implements Tool {
/**
* Map class
*/
public static class Map extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
private Text formattedLog = new Text();
private Text keyDateTime = new Text(); // no value
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String log = value.toString();
StringBuffer buffer = new StringBuffer();
Pattern regex = Pattern.compile("^(\\d{2}\\s\\S+\\s\\d{4}\\s\\d{2}:\\d{2}:\\d{2},\\d{3})\\s([A-Z]{4,5})\\s([\\s\\S]+)");
Matcher matcher = regex.matcher(log);
if (matcher.find()) {
buffer.append(matcher.group(1)+":::"+matcher.group(2)+":::"+matcher.group(3)); // insert ":::" between fields to serve as a delimiter
formattedLog.set(buffer.toString());
keyDateTime.set(matcher.group(1));
output.collect(keyDateTime, formattedLog);
}
}
}
/**
* run method
* @param args
* @return int
* @throws Exception
*/
public int run(String[] args) throws Exception {
JobConf conf = new JobConf(getConf(), FlumeLogFormat.class); // class is LogFormat
conf.setJobName("FlumeLogFormat");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(Map.class);
List<String> other_args = new ArrayList<String>();
for(int i=0; i < args.length; ++i) {
try {
if ("-m".equals(args[i])) {
conf.setNumMapTasks(Integer.parseInt(args[++i]));
} else if ("-r".equals(args[i])) {
conf.setNumReduceTasks(Integer.parseInt(args[++i]));
} else {
other_args.add(args[i]);
}
} catch (NumberFormatException exception) {
System.out.println("Give int value instead of " + args[i]);
//return printUsage();
} catch (ArrayIndexOutOfBoundsException exception) {
System.out.println("Parameter missing " + args[i-1]);
//return printUsage();
}
}
if (other_args.size() != 2) {
//return printUsage();
}
FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
conf.setInputFormat(LogInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
JobClient.runJob(conf);
return 0;
}
/**
* Main method
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new FlumeLogFormat(), args);
System.exit(res);
}
}
21 July 2013 17:35:51,334 INFO [conf-file-poller-0] (org.apache.flume.node.Application.startAllComponents:173) - Starting Sink k1
25 May 2013 06:33:36,795 ERROR [lifecycleSupervisor-1-7] (org.apache.flume.lifecycle.LifecycleSupervisor$MonitorRunnable.run:253) - Unable to start EventDrivenSourceRunner: { source:org.apache.flume.source.SpoolDirectorySource{name:r1,state:IDLE} } - Exception follows.
java.lang.IllegalStateException: Directory does not exist: /root/FlumeTest
at com.google.common.base.Preconditions.checkState(Preconditions.java:145)
at org.apache.flume.client.avro.ReliableSpoolingFileEventReader.<init>(ReliableSpoolingFileEventReader.java:129)
at org.apache.flume.client.avro.ReliableSpoolingFileEventReader.<init>(ReliableSpoolingFileEventReader.java:72)
at org.apache.flume.client.avro.ReliableSpoolingFileEventReader$Builder.build(ReliableSpoolingFileEventReader.java:556)
at org.apache.flume.source.SpoolDirectorySource.start(SpoolDirectorySource.java:75)
at org.apache.flume.source.EventDrivenSourceRunner.start(EventDrivenSourceRunner.java:44)
at org.apache.flume.lifecycle.LifecycleSupervisor$MonitorRunnable.run(LifecycleSupervisor.java:251)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask$Sync.innerRunAndReset(FutureTask.java:351)
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:178)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:165)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:267)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1146)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:679)
01 June 2012 12:35:22,222 INFO noiweoqierwnvoirenvoiernv iorenvoiernve irnvoirenv
公共类FlumeLogFormat扩展Conf