Hadoop 如何使用相同的程序将MapReduce输出插入到HBASE中
我写了一个程序，将 PDF 作为输入，并整体生成文本输出。我想使用同一个程序把这些文本加载到 HBase 中，有什么方法可以做到吗？任何帮助都将非常有用。（标签：hadoop、pdf、mapreduce、hbase）
//Driver Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class PdfInputDriver {

    /**
     * Configures and submits the PDF-to-text MapReduce job.
     *
     * <p>After generic Hadoop options are stripped, {@code args[0]} is the
     * input path and {@code args[1]} is the output path.
     *
     * @throws IOException            if job submission fails
     * @throws InterruptedException   if the wait for completion is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        args = parser.getRemainingArgs();
        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor, so the @SuppressWarnings("deprecation") is no longer needed.
        Job job = Job.getInstance(conf, "Pdftext");
        job.setJarByClass(PdfInputDriver.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setInputFormatClass(PdfInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        boolean succeeded = job.waitForCompletion(true);
        System.out.println(succeeded);
        // Propagate success/failure to the shell instead of only printing it.
        System.exit(succeeded ? 0 : 1);
    }
}
//InputFormatClass
package com.tcs;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class PdfInputFormat extends FileInputFormat<LongWritable, Text> {

    /**
     * PDF is a whole-document binary format: the record reader parses the
     * entire file regardless of split boundaries, so a file must never be
     * divided into multiple splits (each split would re-emit every record).
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }

    /**
     * Returns a reader that turns one PDF file into
     * (line number, line text) records.
     */
    @SuppressWarnings("unchecked")
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        // Double cast keeps this class compilable no matter which type
        // parameters PdfRecordReader declares.
        return (RecordReader<LongWritable, Text>) (RecordReader<?, ?>) new PdfRecordReader();
    }
}
//PDF Record Reader class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class PdfRecordReader extends RecordReader<LongWritable, Text> {

    // All text lines extracted from the PDF; populated once in initialize().
    private String[] lines = new String[0];
    // Number of lines already emitted; doubles as the 0-based index of the next line.
    private int current = 0;
    private LongWritable key = null;
    private Text value = null;

    /**
     * Opens the split's PDF file, extracts its full text with PDFBox and
     * splits it into lines. The whole document is parsed here, which is why
     * the input format must mark PDF files as non-splitable.
     */
    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        final Path file = split.getPath();
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(file);
        PDDocument pdf = null;
        try {
            pdf = PDDocument.load(fileIn);
            PDFTextStripper stripper = new PDFTextStripper();
            String parsedText = stripper.getText(pdf);
            // Bug fix: the original split on the literal string "/n", which
            // almost never occurs; split on real line terminators instead
            // (optionally preceded by a carriage return).
            this.lines = parsedText.split("\r?\n");
        } finally {
            // Bug fix: the document and stream were never closed (resource leak).
            if (pdf != null) {
                pdf.close();
            }
            fileIn.close();
        }
    }

    /**
     * Advances to the next line, keyed by its 1-based line number.
     *
     * Bug fix: the original loop condition ({@code temp < lines.length - 1})
     * stopped one record early and silently dropped the document's last line.
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (current >= lines.length) {
            return false;
        }
        key = new LongWritable(current + 1); // 1-based, matching the original numbering
        value = new Text(lines[current]);
        current++;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException,
            InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /** Fraction of lines already emitted (the original always reported 0). */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lines.length == 0 ? 1.0f : (float) current / lines.length;
    }

    @Override
    public void close() throws IOException {
        // All resources are released at the end of initialize(); nothing is held open here.
    }
}
//Mapper Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>
{
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, key);
}
}
//Reducer Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Object, Object, Object, Object> {
protected void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
context.write(key, new WordCountReducer());
}
}
（注：原文此处是上述 Java 代码的机器翻译副本，关键字被半翻译、标识符被破坏（如 “公共类”、“@凌驾”、“可长写”），与上文代码完全重复且无法编译，属于抓取/翻译产生的乱码，故以此说明代替。页面末尾的回答以 “我想你……” 开头即被截断，原始答案内容缺失。）