
Java: How do I use CombineFileInputFormat in Hadoop?


I want to use the CombineFileInputFormat of Hadoop 0.20.0/0.20.2 so that it processes 1 file per record and does not compromise on data locality (which it normally takes care of).

It is mentioned in Tom White's Hadoop: The Definitive Guide, but he does not show how to do it. Instead, he moves on to sequence files.

I am rather confused about the meaning of the processed variable in the record reader. Any code example would be a huge help.


Thanks in advance.

Have a look at the following input format, which is built on CombineFileInputFormat:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;


/**
 * CustomInputformat which implements the createRecordReader of abstract class CombineFileInputFormat
 */

public class MyCombineFileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    public static class MyRecordReader extends RecordReader<LongWritable,Text>{
        private LineRecordReader delegate=null;
        private int idx;

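        // CombineFileRecordReader creates one instance of this reader per file
        // in the combined split, passing the index of that file via reflection.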
        public MyRecordReader(CombineFileSplit split,TaskAttemptContext taskcontext ,Integer idx) throws IOException {
            this.idx=idx;
            delegate = new LineRecordReader();
        }

        @Override
        public void close() throws IOException {
            delegate.close();
        }

        @Override
        public float getProgress() {
            try {
                return delegate.getProgress();
            }
            catch(Exception e) {
                return 0;
            }
        }

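        // Map the idx-th file of the combined split to an ordinary FileSplit
        // and hand it to the wrapped LineRecordReader.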
        @Override
        public void initialize(InputSplit split, TaskAttemptContext taskcontext) throws IOException {
            CombineFileSplit csplit=(CombineFileSplit)split;
            FileSplit fileSplit = new FileSplit(csplit.getPath(idx), csplit.getOffset(idx), csplit.getLength(idx), csplit.getLocations());
            delegate.initialize(fileSplit, taskcontext);
        }

        @Override
        public LongWritable getCurrentKey() throws IOException,
                InterruptedException {
            return delegate.getCurrentKey();
        }


        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return delegate.getCurrentValue();
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return delegate.nextKeyValue();
        }

    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext taskcontext) throws IOException {
        return new CombineFileRecordReader<LongWritable, Text>((CombineFileSplit) split, taskcontext, MyRecordReader.class);
    }
}
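For completeness, here is a sketch of a driver that wires the format above into a job. The class name CombineDriver, the pass-through mapper, the 128 MB split cap, and the command-line input/output paths are all illustrative assumptions, not part of the answer; FileInputFormat here is org.apache.hadoop.mapreduce.lib.input.FileInputFormat.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombineDriver {

    // Illustrative pass-through mapper: emits each line with its offset key.
    public static class PassThroughMapper
            extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "combine-small-files");
        job.setJarByClass(CombineDriver.class);

        // Use the combining input format so many small files share a map task.
        job.setInputFormatClass(MyCombineFileInputFormat.class);
        job.setMapperClass(PassThroughMapper.class);
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Cap the combined split size so Hadoop does not pack the entire
        // input into a single split; 128 MB is an arbitrary example value.
        FileInputFormat.setMaxInputSplitSize(job, 128L * 1024 * 1024);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}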

Here is the simplest way to use CombineFileInputFormat from the so-called new API. Suppose your actual input format is MyFormat, and that it works with keys of MyKey and values of MyValue (it could, for example, be some subclass of SequenceFileInputFormat).

public class CombinedMyFormat extends CombineFileInputFormat<MyKey, MyValue> {
    // exists merely to fix the key/value types and
    // inject the delegate format into the superclass
    // if MyFormat does not use state, consider a constant instead
    private static class CombineMyKeyMyValueReaderWrapper
            extends CombineFileRecordReaderWrapper<MyKey, MyValue> {
        protected CombineMyKeyMyValueReaderWrapper(
                CombineFileSplit split, TaskAttemptContext ctx, Integer idx
        ) throws IOException, InterruptedException {
            super(new MyFormat(), split, ctx, idx);
        }
    }

    @Override
    public RecordReader<MyKey, MyValue> createRecordReader(
            InputSplit split, TaskAttemptContext ctx
    ) throws IOException {
        return new CombineFileRecordReader<MyKey, MyValue>(
                (CombineFileSplit) split, ctx, CombineMyKeyMyValueReaderWrapper.class
        );
    }
}

In your job driver, you should now be able to simply drop in CombinedMyFormat for MyFormat. You should also set a maximum split size to keep Hadoop from combining the entire input into a single split.
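As a rough illustration (assuming a Job object named job already exists in the driver, using an arbitrary 128 MB cap, and with FileInputFormat being org.apache.hadoop.mapreduce.lib.input.FileInputFormat), the change might look like this:

// Drop in the combining wrapper where MyFormat was used before.
job.setInputFormatClass(CombinedMyFormat.class);
// Cap combined splits so the whole input does not end up in a single split.
FileInputFormat.setMaxInputSplitSize(job, 128L * 1024 * 1024);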

Could you elaborate on what you mean by 1 file per record?