Hadoop 如何使用相同的程序将MapReduce输出插入到HBASE中
我写了一个程序，将 PDF 作为输入，并整体生成文本输出。我想使用同一个程序把这些文本加载到 HBase 中，有什么方法可以做到吗？任何帮助都将非常有用。（标签：hadoop、pdf、mapreduce、hbase）
//Driver Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class PdfInputDriver {

    /**
     * Configures and submits the PDF-to-text MapReduce job.
     *
     * <p>After generic Hadoop options are stripped, {@code args[0]} is the
     * input path and {@code args[1]} is the output path.
     *
     * @throws IOException            if job submission fails
     * @throws InterruptedException   if the wait for completion is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        args = parser.getRemainingArgs();
        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor, so the @SuppressWarnings("deprecation") is no longer needed.
        Job job = Job.getInstance(conf, "Pdftext");
        job.setJarByClass(PdfInputDriver.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setInputFormatClass(PdfInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        boolean succeeded = job.waitForCompletion(true);
        System.out.println(succeeded);
        // Propagate success/failure to the shell instead of only printing it.
        System.exit(succeeded ? 0 : 1);
    }
}
//InputFormatClass
package com.tcs;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class PdfInputFormat extends FileInputFormat<LongWritable, Text> {

    /**
     * PDF is a whole-document binary format: the record reader parses the
     * entire file regardless of split boundaries, so a file must never be
     * divided into multiple splits (each split would re-emit every record).
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }

    /**
     * Returns a reader that turns one PDF file into
     * (line number, line text) records.
     */
    @SuppressWarnings("unchecked")
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        // Double cast keeps this class compilable no matter which type
        // parameters PdfRecordReader declares.
        return (RecordReader<LongWritable, Text>) (RecordReader<?, ?>) new PdfRecordReader();
    }
}
//PDF Record Reader class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class PdfRecordReader extends RecordReader<LongWritable, Text> {

    // All text lines extracted from the PDF; populated once in initialize().
    private String[] lines = new String[0];
    // Number of lines already emitted; doubles as the 0-based index of the next line.
    private int current = 0;
    private LongWritable key = null;
    private Text value = null;

    /**
     * Opens the split's PDF file, extracts its full text with PDFBox and
     * splits it into lines. The whole document is parsed here, which is why
     * the input format must mark PDF files as non-splitable.
     */
    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        final Path file = split.getPath();
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(file);
        PDDocument pdf = null;
        try {
            pdf = PDDocument.load(fileIn);
            PDFTextStripper stripper = new PDFTextStripper();
            String parsedText = stripper.getText(pdf);
            // Bug fix: the original split on the literal string "/n", which
            // almost never occurs; split on real line terminators instead
            // (optionally preceded by a carriage return).
            this.lines = parsedText.split("\r?\n");
        } finally {
            // Bug fix: the document and stream were never closed (resource leak).
            if (pdf != null) {
                pdf.close();
            }
            fileIn.close();
        }
    }

    /**
     * Advances to the next line, keyed by its 1-based line number.
     *
     * Bug fix: the original loop condition ({@code temp < lines.length - 1})
     * stopped one record early and silently dropped the document's last line.
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (current >= lines.length) {
            return false;
        }
        key = new LongWritable(current + 1); // 1-based, matching the original numbering
        value = new Text(lines[current]);
        current++;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException,
            InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /** Fraction of lines already emitted (the original always reported 0). */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lines.length == 0 ? 1.0f : (float) current / lines.length;
    }

    @Override
    public void close() throws IOException {
        // All resources are released at the end of initialize(); nothing is held open here.
    }
}
//Mapper Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>
{
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, key);
}
}
//Reducer Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Object, Object, Object, Object> {
protected void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
context.write(key, new WordCountReducer());
}
}
（注：原文此处是上述 Java 代码的机器翻译副本，关键字被半翻译、标识符被破坏（如 “公共类”、“@凌驾”、“可长写”），与上文代码完全重复且无法编译，属于抓取/翻译产生的乱码，故以此说明代替。页面末尾的回答以 “我想你……” 开头即被截断，原始答案内容缺失。）