在 com.itextpdf.text.pdf.parser.RenderListener 上出现 ClassNotFoundException(找不到类)异常
我正在尝试使用自定义输入格式类在 MapReduce 中解析 PDF 文件,代码如下:
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class PdfFileInputFormat extends FileInputFormat<LongWritable, Text> {

    /**
     * Builds the {@link RecordReader} that turns one whole PDF file
     * into line-oriented (offset, text) records.
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        System.out.println("Entered PdfFileInputFormat class");
        return new PdfRecordReader();
    }

    /**
     * A PDF cannot be parsed from an arbitrary byte offset, so every file
     * must be handed to a single mapper as one unsplit unit.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
import java.io.IOException;
导入org.apache.hadoop.fs.Path;
导入org.apache.hadoop.io.LongWritable;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapreduce.InputSplit;
导入org.apache.hadoop.mapreduce.JobContext;
导入org.apache.hadoop.mapreduce.RecordReader;
导入org.apache.hadoop.mapreduce.TaskAttemptContext;
导入org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
公共类 PdfFileInputFormat 扩展了 FileInputFormat{
@凌驾
公共RecordReader createRecordReader(InputSplit split,
TaskAttemptContext(上下文)引发IOException、InterruptedException{
System.out.println(“输入的PdfFileInputFormat类”);
返回新的PdfRecordReader();
}
@凌驾
受保护的布尔isSplitable(JobContext上下文,路径文件){
返回false;
}
}
我的 PDF RecordReader 类如下:
package com.pdf.prac;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.RenderListener;
public class PdfRecordReader extends RecordReader<LongWritable, Text> {
// Count of nextKeyValue() calls; doubles as the cursor into `records`.
private int flag = 0;
// Current record number (0-based line index), lazily created on first nextKeyValue().
private LongWritable key = null;
// Current extracted text line, lazily created on first nextKeyValue().
private Text value = null;
// iText handle on the PDF being read; closed inside readRecords().
private PdfReader reader;
// Per-page content parser driving the text-extraction strategy.
private PdfReaderContentParser parser;
// Strategy holding the text of the page most recently processed.
private TextExtractionStrategy strategy;
// Raw HDFS stream backing `reader`; released in close().
private FSDataInputStream fileIn;
// All text lines of the PDF, extracted eagerly during initialize().
private List<String> records = new ArrayList<String>();
/**
 * Opens the PDF file backing this split and eagerly extracts every text
 * line into {@code records} via iText, so that nextKeyValue() only has
 * to iterate an in-memory list.
 *
 * @param genericSplit the file split assigned to this task — the whole
 *                     file, since the input format marks PDFs non-splittable
 * @param context      task context supplying the Hadoop configuration
 * @throws IOException if the file cannot be opened or parsed
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException {
    System.out.println("Executing initialize........");
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    // Reuse the already-resolved path instead of calling split.getPath() twice.
    this.fileIn = fs.open(file);
    this.reader = new PdfReader(fileIn);
    this.parser = new PdfReaderContentParser(reader);
    readRecords();
}
/**
 * Advances to the next extracted PDF line. The key is the 0-based line
 * index and the value is the line's text; both are reused across calls.
 *
 * <p>Fixes two defects in the previous version: the final record was
 * loaded but never emitted (the method returned {@code false} on the call
 * that reached the last index), and an empty PDF threw
 * {@code IndexOutOfBoundsException} from {@code records.get(0)}.
 *
 * @return {@code true} if a record was loaded into key/value,
 *         {@code false} when the list is exhausted (or empty)
 */
public synchronized boolean nextKeyValue() throws IOException {
    System.out.println("Executing nextKey........Total Records : "
            + records.size() + "; Flag : " + flag);
    // Bail out before touching `records` so empty input and exhaustion
    // are both handled without loading a stale record.
    if (flag >= records.size()) {
        return false;
    }
    if (key == null) {
        key = new LongWritable(flag);
    } else {
        key.set(flag);
    }
    if (value == null) {
        value = new Text(records.get(flag));
    } else {
        value.set(records.get(flag));
    }
    flag++;
    return true;
}
/** Returns the current record number (null before the first nextKeyValue()). */
@Override
public LongWritable getCurrentKey() {
return key;
}
/** Returns the current text line (null before the first nextKeyValue()). */
@Override
public Text getCurrentValue() {
return value;
}
/**
 * Reports progress through the split as the fraction of extracted lines
 * already emitted, clamped to [0, 1]. The previous version always
 * returned 0, which breaks progress reporting in the job UI.
 *
 * @return 0 when nothing was extracted, otherwise emitted/total
 */
public float getProgress() {
    if (records.isEmpty()) {
        return 0.0f;
    }
    return Math.min(1.0f, flag / (float) records.size());
}
/**
 * Releases the HDFS stream backing this reader, if one was opened.
 * The iText PdfReader itself is closed at the end of readRecords().
 */
public synchronized void close() throws IOException {
    if (fileIn == null) {
        return;
    }
    fileIn.close();
}
/**
 * Extracts the text of every page of the PDF and appends it to
 * {@code records}, one entry per non-empty line (StringTokenizer on
 * {@code "\n"} skips blank lines). Closes the PdfReader when done —
 * now in a finally block, so a parse failure on one page no longer
 * leaks the reader.
 *
 * @throws IOException if iText fails while extracting page content
 */
private void readRecords() throws IOException {
    if (reader == null) {
        return;
    }
    try {
        // Hoist the page count: it is invariant across the loop.
        final int pages = reader.getNumberOfPages();
        for (int i = 1; i <= pages; i++) {
            strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            if (strategy != null) {
                StringTokenizer tokens =
                        new StringTokenizer(strategy.getResultantText(), "\n");
                while (tokens.hasMoreTokens()) {
                    records.add(tokens.nextToken());
                }
            }
        }
    } finally {
        reader.close();
    }
}
package com.pdf.prac;
导入java.io.IOException;
导入java.util.ArrayList;
导入java.util.List;
导入java.util.StringTokenizer;
导入org.apache.hadoop.conf.Configuration;
导入org.apache.hadoop.fs.FSDataInputStream;
导入org.apache.hadoop.fs.FileSystem;
导入org.apache.hadoop.fs.Path;
导入org.apache.hadoop.io.LongWritable;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapreduce.InputSplit;
导入org.apache.hadoop.mapreduce.RecordReader;
导入org.apache.hadoop.mapreduce.TaskAttemptContext;
导入org.apache.hadoop.mapreduce.lib.input.FileSplit;
导入com.itextpdf.text.pdf.PdfReader;
导入com.itextpdf.text.pdf.parser.PdfReaderContentParser;
导入com.itextpdf.text.pdf.parser.simpletextractionstrategy;
导入com.itextpdf.text.pdf.parser.TextExtractionStrategy;
导入com.itextpdf.text.pdf.parser.RenderListener;
公共类PdfRecordReader扩展了RecordReader{
私有int标志=0;
私有长可写密钥=null;
私有文本值=null;
私人PDF阅读器;
私有PdfReaderContentParser;
私有文本抽取策略;
私有FSDataInputStream文件;
私有列表记录=新的ArrayList();
公共void初始化(InputSplit genericSplit,TaskAttemptContext上下文)
抛出IOException{
System.out.println(“执行初始化……”);
FileSplit split=(FileSplit)genericSplit;
conf=context.getConfiguration();
最终路径文件=split.getPath();
FileSystem fs=file.getFileSystem(conf);
this.fileIn=fs.open(split.getPath());
this.reader=新的PdfReader(fileIn);
this.parser=新的PdfReaderContentParser(读取器);
readRecords();
}
公共同步布尔值nextKeyValue()引发IOException{
System.out.println(“正在执行nextKey…….总记录数:“+Records.size()+”;标志:“+(标志++)”;
int指数=0;
if(key==null){
键=新的长可写(索引);
}否则{
index=(int)key.get();
key.set(++索引);
}
如果(值==null){
值=新文本(records.get(index));
}否则{
value.set(records.get(index));
}
if(flag==records.size()){
返回false;
}否则{
返回true;
}
}
@凌驾
公共长可写getCurrentKey(){
返回键;
}
@凌驾
公共文本getCurrentValue(){
返回值;
}
/**
*在拆分中获取进度
*/
公共进度(){
返回0;
}
public synchronized void close()引发IOException{
if(fileIn!=null){
fileIn.close();
}
}
私有void readRecords()引发IOException{
if(读卡器!=null){
对于(int i=1; i<=reader.getNumberOfPages(); i++){……}

(评论区)你有没有将 iText 添加到类路径中?如果有,是哪个版本?——是的,我在构建路径中以外部 jar 的形式添加了 iText 5.4.2 版。——你确定只有一个对 iText 的引用吗?如果类路径上有两个不同版本的 iText jar,JVM 可能会混淆:当存在歧义、JVM 不知道选择哪个 jar 时,它也会报找不到类。——是的,我知道这一点,而且在我引用的库中只看到一个 iText jar。还有其他地方需要检查吗?——如果您已经向类路径添加了一个 iText jar,并且该 jar 包含类 com.itextpdf.text.pdf.parser.RenderListener(我假设您已经检查过),则不应发生该错误。如果错误仍然发生,则说明您的某个假设是错误的。