Java Hadoop Map Reduce CustomSplit/CustomRecordReader_Java_Hadoop

Java Hadoop Map Reduce CustomSplit/CustomRecordReader

java hadoop

Java Hadoop Map Reduce CustomSplit/CustomRecordReader,java,hadoop,Java,Hadoop,我有一个巨大的文本文件，我想分割文件，使每个块有5行。我实现了自己的GWASInputFormat和GWASRecordReader类。然而，我的问题是，在下面的代码（我从中复制）中，initialize（）方法中有以下几行代码 FileSplit split = (FileSplit) genericSplit; final Path file = split.getPath(); Configuration conf = context.getConfiguration(); 我的问题是，

我有一个巨大的文本文件，我想分割文件，使每个块有5行。我实现了自己的GWASInputFormat和GWASRecordReader类。然而，我的问题是，在下面的代码（我从中复制）中，initialize（）方法中有以下几行代码

FileSplit split = (FileSplit) genericSplit;
final Path file = split.getPath();
Configuration conf = context.getConfiguration();

我的问题是，当我的GWASRecordReader类中的initialize（）方法被调用时，文件是否已经被分割？我原以为拆分是在GWASRecordReader类中进行的。请告诉我我的这个思路是否正确。

package com.test;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

public class GWASRecordReader extends RecordReader<LongWritable, Text> {

private final int NLINESTOPROCESS = 5;
private LineReader in;
private LongWritable key;
private Text value = new Text();
private long start = 0;
private long pos = 0;
private long end = 0;
private int maxLineLength;

public void close() throws IOException {
    if(in != null) {
        in.close();
    }
}

public LongWritable getCurrentKey() throws IOException, InterruptedException {
    return key;
}

public Text getCurrentValue() throws IOException, InterruptedException {
    return value;
}

public float getProgress() throws IOException, InterruptedException {
    if(start == end) {
        return 0.0f;
    }
    else {
        return Math.min(1.0f, (pos - start)/(float) (end - start));
    }
}

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength",Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    System.out.println("---------------SPLIT LENGTH---------------------" + split.getLength());
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());

    if(start != 0) {
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }

    in = new LineReader(filein, conf);
    if(skipFirstLine) {
        start += in.readLine(new Text(),0,(int)Math.min((long)Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

public boolean nextKeyValue() throws IOException, InterruptedException {
    if (key == null) {
        key = new LongWritable();
    }

    key.set(pos);

    if (value == null) {
        value = new Text();
    }
    value.clear();
    final Text endline = new Text("\n");
    int newSize = 0;
    for(int i=0; i<NLINESTOPROCESS;i++) {
        Text v = new Text();
        while( pos < end) {
            newSize = in.readLine(v ,maxLineLength, Math.max((int)Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
            value.append(v.getBytes(), 0, v.getLength());
            value.append(endline.getBytes(),0,endline.getLength());
            if(newSize == 0) {
                break;
            }
            pos += newSize;
            if(newSize < maxLineLength) {
                break;
            }
        }
    }

    if(newSize == 0) {
        key = null;
        value = null;
        return false;
    } else {
        return true;
    }
}
}

package.com.test；
导入java.io.IOException；
导入org.apache.hadoop.conf.Configuration；
导入org.apache.hadoop.fs.FSDataInputStream；
导入org.apache.hadoop.fs.FileSystem；
导入org.apache.hadoop.fs.Path；
导入org.apache.hadoop.io.LongWritable；
导入org.apache.hadoop.io.Text；
导入org.apache.hadoop.mapreduce.InputSplit；
导入org.apache.hadoop.mapreduce.RecordReader；
导入org.apache.hadoop.mapreduce.TaskAttemptContext；
导入org.apache.hadoop.mapreduce.lib.input.FileSplit；
导入org.apache.hadoop.util.LineReader；
公共类GWASRecordReader扩展了RecordReader{
私有final int NLINESTOPROCESS=5；
专用线路阅读器；
私钥；
私有文本值=新文本（）；
专用长启动=0；
私人长pos=0；
专用长端=0；
私有整数maxLineLength；
public void close（）引发IOException{
if（in！=null）{
in.close（）；
}
}
public LongWritable getCurrentKey（）引发IOException、InterruptedException{
返回键；
}
公共文本getCurrentValue（）引发IOException、InterruptedException{
返回值；
}
public float getProgress（）引发IOException、InterruptedException{
如果（开始==结束）{
返回0.0f；
}
否则{
返回数学最小值（1.0f，（pos-start）/（float）（end-start））；
}
}
public void initialize（InputSplit genericSplit，TaskAttemptContext上下文）引发IOException{
FileSplit split=（FileSplit）genericSplit；
最终路径文件=split.getPath（）；
conf=context.getConfiguration（）；
this.maxLineLength=conf.getInt（“mapred.linerecordreader.maxlength”，Integer.MAX_值）；
FileSystem fs=file.getFileSystem（conf）；
start=split.getStart（）；
end=start+split.getLength（）；
System.out.println（“--------------拆分长度-----------------”+SPLIT.getLength（））；
布尔skipFirstLine=false；
FSDataInputStream filein=fs.open（split.getPath（））；
如果（开始！=0）{
skipFirstLine=true；
--开始；
filein.seek（开始）；
}
in=新的LineReader（文件输入，配置）；
if（skipFirstLine）{
start+=in.readLine（new Text（），0，（int）Math.min（（long）Integer.MAX_值，end-start））；
}
this.pos=开始；
}
公共布尔值nextKeyValue（）引发IOException、InterruptedException{
if（key==null）{
key=新的LongWritable（）；
}
按键设置（pos）；
如果（值==null）{
值=新文本（）；
}
value.clear（）；
最终文本结束行=新文本（“\n”）；
int newSize=0；
对于（int i=0；i<NLINESTOPROCESS；i++）{ ……（翻译后的代码在此处被截断，完整代码见上文）

是的，输入文件已经被拆分。基本上如下所示：
您的输入文件->InputSplit->RecordReader->Mapper…

基本上，InputSplit 将输入分成块，RecordReader 将这些块分成键/值对。请注意，InputSplit 和 RecordReader 将由您使用的 InputFormat 确定。例如，TextInputFormat 使用 FileSplit 对输入进行分块，然后将这些块交给 LineRecordReader，后者以位置为键、以行本身为值来处理每一行。
因此，在您的 GWASInputFormat 中，您需要查看它使用哪种 FileSplit，才能知道它传递给 GWASRecordReader 的内容。
我建议研究一下 NLineInputFormat，它“将 N 行输入作为一个分片”。它可能完全能够完成您自己想要做的事情。
如果您试图一次获取5行作为值，并以第一行的行号作为键，我想您可以使用定制的 NLineInputFormat 和定制的 LineRecordReader 来实现这一点。我认为您不必太担心输入拆分，因为输入格式可以将其拆分为这些5行的块。您的 RecordReader 与 LineRecordReader 非常相似，但不是获取数据块开头的字节位置，而是获取行号。因此，除了微小的更改外，代码几乎相同。您可以复制并粘贴 NLineInputFormat 和 LineRecordReader，但要让输入格式使用您那个获取行号的记录阅读器。代码将非常相似。
非常感谢。这就澄清了一些内容。我想跟踪输入文件的行号，并将行号与输入记录一起作为值输入映射器。因此，看起来我必须使用我自己的拆分，是吗？因为我现在使用的方法已经拆分了文件。请告诉我我有哪些选项（我想我需要重写computeSplitSize（）方法）。1.我在网上搜遍了，但找不到具体的答案。

@user1707141 我更新了我的答案来解决这个问题。让我知道这是否合理，或者我需要更好地解释。

这个答案的第一部分救了我的命，我有一个类似的问题，我知道我关心的是RecordReader。

我想查找由字符串分隔的记录，所以我找到了这篇很棒的文章。我知道这是一个迟来的答案，但它可能对任何需要它的人都有用：