在 com.itextpdf.text.pdf.parser.RenderListener 上出现 ClassNotFoundException(找不到类)异常
我正在尝试使用自定义输入格式类在 MapReduce 中解析 PDF 文件,代码如下:
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class PdfFileInputFormat extends FileInputFormat<LongWritable, Text> {

    /**
     * Builds the {@link RecordReader} that turns one whole PDF file
     * into line-oriented (offset, text) records.
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        System.out.println("Entered PdfFileInputFormat class");
        return new PdfRecordReader();
    }

    /**
     * A PDF cannot be parsed from an arbitrary byte offset, so every file
     * must be handed to a single mapper as one unsplit unit.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
import java.io.IOException;
导入org.apache.hadoop.fs.Path;
导入org.apache.hadoop.io.LongWritable;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapreduce.InputSplit;
导入org.apache.hadoop.mapreduce.JobContext;
导入org.apache.hadoop.mapreduce.RecordReader;
导入org.apache.hadoop.mapreduce.TaskAttemptContext;
导入org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
公共类 PdfFileInputFormat 扩展了 FileInputFormat{
@凌驾
公共RecordReader createRecordReader(InputSplit split,
TaskAttemptContext(上下文)引发IOException、InterruptedException{
System.out.println(“输入的PdfFileInputFormat类”);
返回新的PdfRecordReader();
}
@凌驾
受保护的布尔isSplitable(JobContext上下文,路径文件){
返回false;
}
}
我的 PDF RecordReader 类如下:
package com.pdf.prac;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.RenderListener;
public class PdfRecordReader extends RecordReader<LongWritable, Text> {
// Count of nextKeyValue() calls; doubles as the cursor into `records`.
private int flag = 0;
// Current record number (0-based line index), lazily created on first nextKeyValue().
private LongWritable key = null;
// Current extracted text line, lazily created on first nextKeyValue().
private Text value = null;
// iText handle on the PDF being read; closed inside readRecords().
private PdfReader reader;
// Per-page content parser driving the text-extraction strategy.
private PdfReaderContentParser parser;
// Strategy holding the text of the page most recently processed.
private TextExtractionStrategy strategy;
// Raw HDFS stream backing `reader`; released in close().
private FSDataInputStream fileIn;
// All text lines of the PDF, extracted eagerly during initialize().
private List<String> records = new ArrayList<String>();
/**
 * Opens the PDF file backing this split and eagerly extracts every text
 * line into {@code records} via iText, so that nextKeyValue() only has
 * to iterate an in-memory list.
 *
 * @param genericSplit the file split assigned to this task — the whole
 *                     file, since the input format marks PDFs non-splittable
 * @param context      task context supplying the Hadoop configuration
 * @throws IOException if the file cannot be opened or parsed
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException {
    System.out.println("Executing initialize........");
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    // Reuse the already-resolved path instead of calling split.getPath() twice.
    this.fileIn = fs.open(file);
    this.reader = new PdfReader(fileIn);
    this.parser = new PdfReaderContentParser(reader);
    readRecords();
}
/**
 * Advances to the next extracted PDF line. The key is the 0-based line
 * index and the value is the line's text; both are reused across calls.
 *
 * <p>Fixes two defects in the previous version: the final record was
 * loaded but never emitted (the method returned {@code false} on the call
 * that reached the last index), and an empty PDF threw
 * {@code IndexOutOfBoundsException} from {@code records.get(0)}.
 *
 * @return {@code true} if a record was loaded into key/value,
 *         {@code false} when the list is exhausted (or empty)
 */
public synchronized boolean nextKeyValue() throws IOException {
    System.out.println("Executing nextKey........Total Records : "
            + records.size() + "; Flag : " + flag);
    // Bail out before touching `records` so empty input and exhaustion
    // are both handled without loading a stale record.
    if (flag >= records.size()) {
        return false;
    }
    if (key == null) {
        key = new LongWritable(flag);
    } else {
        key.set(flag);
    }
    if (value == null) {
        value = new Text(records.get(flag));
    } else {
        value.set(records.get(flag));
    }
    flag++;
    return true;
}
/** Returns the current record number (null before the first nextKeyValue()). */
@Override
public LongWritable getCurrentKey() {
return key;
}
/** Returns the current text line (null before the first nextKeyValue()). */
@Override
public Text getCurrentValue() {
return value;
}
/**
 * Reports progress through the split as the fraction of extracted lines
 * already emitted, clamped to [0, 1]. The previous version always
 * returned 0, which breaks progress reporting in the job UI.
 *
 * @return 0 when nothing was extracted, otherwise emitted/total
 */
public float getProgress() {
    if (records.isEmpty()) {
        return 0.0f;
    }
    return Math.min(1.0f, flag / (float) records.size());
}
/**
 * Releases the HDFS stream backing this reader, if one was opened.
 * The iText PdfReader itself is closed at the end of readRecords().
 */
public synchronized void close() throws IOException {
    if (fileIn == null) {
        return;
    }
    fileIn.close();
}
/**
 * Extracts the text of every page of the PDF and appends it to
 * {@code records}, one entry per non-empty line (StringTokenizer on
 * {@code "\n"} skips blank lines). Closes the PdfReader when done —
 * now in a finally block, so a parse failure on one page no longer
 * leaks the reader.
 *
 * @throws IOException if iText fails while extracting page content
 */
private void readRecords() throws IOException {
    if (reader == null) {
        return;
    }
    try {
        // Hoist the page count: it is invariant across the loop.
        final int pages = reader.getNumberOfPages();
        for (int i = 1; i <= pages; i++) {
            strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            if (strategy != null) {
                StringTokenizer tokens =
                        new StringTokenizer(strategy.getResultantText(), "\n");
                while (tokens.hasMoreTokens()) {
                    records.add(tokens.nextToken());
                }
            }
        }
    } finally {
        reader.close();
    }
}
package com.pdf.prac;
导入java.io.IOException;
导入java.util.ArrayList;
导入java.util.List;
导入java.util.StringTokenizer;
导入org.apache.hadoop.conf.Configuration;
导入org.apache.hadoop.fs.FSDataInputStream;
导入org.apache.hadoop.fs.FileSystem;
导入org.apache.hadoop.fs.Path;
导入org.apache.hadoop.io.LongWritable;
导入org.apache.hadoop.io.Text;
导入org.apache.hadoop.mapreduce.InputSplit;
导入org.apache.hadoop.mapreduce.RecordReader;
导入org.apache.hadoop.mapreduce.TaskAttemptContext;
导入org.apache.hadoop.mapreduce.lib.input.FileSplit;
导入com.itextpdf.text.pdf.PdfReader;
导入com.itextpdf.text.pdf.parser.PdfReaderContentParser;
导入com.itextpdf.text.pdf.parser.simpletextractionstrategy;
导入com.itextpdf.text.pdf.parser.TextExtractionStrategy;
导入com.itextpdf.text.pdf.parser.RenderListener;
公共类PdfRecordReader扩展了RecordReader{
私有int标志=0;
私有长可写密钥=null;
私有文本值=null;
私人PDF阅读器;
私有PdfReaderContentParser;
私有文本抽取策略;
私有FSDataInputStream文件;
私有列表记录=新的ArrayList();
公共void初始化(InputSplit genericSplit,TaskAttemptContext上下文)
抛出IOException{
System.out.println(“执行初始化……”);
FileSplit split=(FileSplit)genericSplit;
conf=context.getConfiguration();
最终路径文件=split.getPath();
FileSystem fs=file.getFileSystem(conf);
this.fileIn=fs.open(split.getPath());
this.reader=新的PdfReader(fileIn);
this.parser=新的PdfReaderContentParser(读取器);
readRecords();
}
公共同步布尔值nextKeyValue()引发IOException{
System.out.println(“正在执行nextKey…….总记录数:“+Records.size()+”;标志:“+(标志++)”;
int指数=0;
if(key==null){
键=新的长可写(索引);
}否则{
index=(int)key.get();
key.set(++索引);
}
如果(值==null){
值=新文本(records.get(index));
}否则{
value.set(records.get(index));
}
if(flag==records.size()){
返回false;
}否则{
返回true;
}
}
@凌驾
公共长可写getCurrentKey(){
返回键;
}
@凌驾
公共文本getCurrentValue(){
返回值;
}
/**
*在拆分中获取进度
*/
公共进度(){
返回0;
}
public synchronized void close()引发IOException{
if(fileIn!=null){
fileIn.close();
}
}
私有void readRecords()引发IOException{
if(读卡器!=null){
对于(int i=1; i<=reader.getNumberOfPages(); i++){……}

(评论区)你有没有将 iText 添加到类路径中?如果有,是哪个版本?——是的,我在构建路径中以外部 jar 的形式添加了 iText 5.4.2 版。——你确定只有一个对 iText 的引用吗?如果类路径上有两个不同版本的 iText jar,JVM 可能会混淆:当存在歧义、JVM 不知道选择哪个 jar 时,它也会报找不到类。——是的,我知道这一点,而且在我引用的库中只看到一个 iText jar。还有其他地方需要检查吗?——如果您已经向类路径添加了一个 iText jar,并且该 jar 包含类 com.itextpdf.text.pdf.parser.RenderListener(我假设您已经检查过),则不应发生该错误。如果错误仍然发生,则说明您的某个假设是错误的。