ClassNotFoundException on com.itextpdf.text.pdf.parser.RenderListener while parsing

Tags: parsing, pdf, hadoop, mapreduce

I am trying to parse PDF files in MapReduce using a custom input format class, shown below:

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class PdfFileInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        System.out.println("Entered PdfFileInputFormat class");
        return new PdfRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // A PDF cannot be split at arbitrary byte offsets, so each
        // file is processed as a whole by a single mapper.
        return false;
    }
}
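For context, an input format like this would typically be wired into a job from the driver. The question does not show the driver, so the following is a minimal sketch under stated assumptions: the class names PdfParseDriver and PdfMapper (sketched further below) and the use of command-line arguments for the paths are illustrative, not taken from the question.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver, for illustration only.
public class PdfParseDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "pdf parse");
        job.setJarByClass(PdfParseDriver.class);
        // Plug in the custom input format so each PDF is read
        // through PdfRecordReader instead of the default line reader.
        job.setInputFormatClass(PdfFileInputFormat.class);
        job.setMapperClass(PdfMapper.class);   // hypothetical mapper, sketched below
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}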
My PDF record reader class is:

package com.pdf.prac;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

public class PdfRecordReader extends RecordReader<LongWritable, Text> {

    // Index of the record that the next call to nextKeyValue() will serve.
    private int flag = 0;
    private LongWritable key = null;
    private Text value = null;

    private PdfReader reader;
    private PdfReaderContentParser parser;
    private TextExtractionStrategy strategy;
    private FSDataInputStream fileIn;

    private List<String> records = new ArrayList<String>();

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException {
        System.out.println("Executing initialize........");

        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();
        final Path file = split.getPath();

        FileSystem fs = file.getFileSystem(conf);
        this.fileIn = fs.open(split.getPath());

        this.reader = new PdfReader(fileIn);
        this.parser = new PdfReaderContentParser(reader);

        // Extract every line of text up front; nextKeyValue() then
        // simply walks the in-memory list.
        readRecords();
    }

    @Override
    public synchronized boolean nextKeyValue() throws IOException {
        System.out.println("Executing nextKeyValue........ Total records : "
                + records.size() + "; Flag : " + flag);

        // Check the bound before touching the list; the original version
        // incremented the counter ahead of the comparison and silently
        // dropped the last record.
        if (flag >= records.size()) {
            return false;
        }

        if (key == null) {
            key = new LongWritable(flag);
        } else {
            key.set(flag);
        }

        if (value == null) {
            value = new Text(records.get(flag));
        } else {
            value.set(records.get(flag));
        }

        flag++;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public Text getCurrentValue() {
        return value;
    }

    /**
     * Get the progress within the split.
     */
    @Override
    public float getProgress() {
        return records.isEmpty() ? 0 : (float) flag / records.size();
    }

    @Override
    public synchronized void close() throws IOException {
        if (fileIn != null) {
            fileIn.close();
        }
    }

    private void readRecords() throws IOException {
        if (reader != null) {
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                if (strategy != null) {
                    // Split the extracted page text into one record per line.
                    StringTokenizer tokens =
                            new StringTokenizer(strategy.getResultantText(), "\n");
                    while (tokens.hasMoreTokens()) {
                        records.add(tokens.nextToken());
                    }
                }
            }
            reader.close();
        }
    }
}
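The question does not include the mapper that consumes these records. For completeness, here is a minimal sketch of one; the class name PdfMapper and the word-count style body are assumptions, shown only to illustrate how the (LongWritable, Text) pairs from the reader would be used.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper, for illustration only: tokenizes each line of PDF
// text produced by PdfRecordReader and emits (word, 1) pairs.
public class PdfMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private static final LongWritable ONE = new LongWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer tokens = new StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            word.set(tokens.nextToken());
            context.write(word, ONE);
        }
    }
}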
Comments:

Q: Did you add iText to the classpath? If so, which version?
A: Yes, I added iText 5.4.2 as an external jar on the build path.
Q: Are you sure you have only one reference to iText? If two iText jars of different versions are on the classpath, the JVM can get confused: when there is an ambiguity and it cannot decide which jar to pick, it may also report that the class cannot be found.
A: Yes, I am aware of that, and I see only one iText jar among my referenced libraries. Is there anywhere else I should check?

Answer: If you have added an iText jar to the classpath, and that jar contains the class
com.itextpdf.text.pdf.parser.RenderListener
(which I assume you have checked), then the error should not occur. If it does occur, one of those assumptions is wrong.
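One assumption worth checking in a Hadoop job specifically: a jar on the Eclipse build path is visible only to the client JVM, not automatically to the task JVMs on the cluster, and a dependency missing there surfaces as exactly this kind of ClassNotFoundException at runtime. As a hedged sketch, the jar can be shipped with the job from the driver; the HDFS location /libs/itextpdf-5.4.2.jar below is an assumption, not something from the question.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

// Sketch only: puts a dependency jar on the task classpath via the
// distributed cache. The HDFS path of the iText jar is an assumption.
public class ClasspathSetup {
    public static void addIText(Job job) throws Exception {
        // The jar must already be in HDFS, e.g. uploaded beforehand with
        // "hdfs dfs -put itextpdf-5.4.2.jar /libs/".
        job.addFileToClassPath(new Path("/libs/itextpdf-5.4.2.jar"));
        // To confirm the jar really contains the class, it can be listed
        // locally with: jar tf itextpdf-5.4.2.jar | grep RenderListener
    }
}

Bundling iText inside the job jar's lib/ directory, or passing it with the generic -libjars option when submitting the job, achieves the same effect.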