Java 是否有任何函数可以识别iTEXt呈现给定PDF中的所有文本_Java_Pdf_Itext_Extract

Java 中是否有函数可以获取 iText 在给定 PDF 中渲染的所有文本

java pdf itext

Java 是否有任何函数可以识别iTEXt呈现给定PDF中的所有文本,java,pdf,itext,extract,Java,Pdf,Itext,Extract,我使用itext 5.0.6从给定的PDF中提取文本。我重写了TextExtractionStrategy中的renderText（）方法以获取PDF中的文本，并将提取的信息存储在StringBuilder中。实际上，我想在提取的文本中附加相应的字体信息。但有些单词被分成两个单词。请帮忙。提前谢谢我在c#->中找到了一些有用的代码。这里我更新了这个答案的Java版本 import com.itextpdf.text.pdf.parser.ImageRenderInfo; import com.

我使用 iText 5.0.6 从给定的 PDF 中提取文本。我重写了 TextExtractionStrategy 中的 renderText() 方法来获取 PDF 中的文本，并将提取到的内容存储在 StringBuilder 中。实际上，我想在提取的文本后附加相应的字体信息，但有些单词被拆成了两个单词。请帮忙，先谢谢了。

我找到了一些有用的 C# 代码，下面是我根据该答案改写的 Java 版本：

import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import com.itextpdf.text.pdf.parser.Vector;
import com.itextpdf.text.Rectangle;
/**
 * Named constants for the integer text render mode codes 0 through 7.
 *
 * <p>Each constant carries the numeric code that is compared against the value
 * returned by {@code TextRenderInfo.getTextRenderMode()}.
 *
 * <p>NOTE(review): {@code AddTextToPaddForClipping} is presumably a misspelling of
 * "AddTextToPathForClipping"; the name is kept unchanged so existing references
 * continue to compile.
 */
enum TextRenderMode {
    FillText(0),
    StrokeText(1),
    FillThenStrokeText(2),
    Invisible(3),
    FillTextAndAddToPathForClipping(4),
    StrokeTextAndAddToPathForClipping(5),
    FillThenStrokeTextAndAddToPathForClipping(6),
    AddTextToPaddForClipping(7);

    /** Numeric code associated with this render mode. */
    private final int code;

    // Enum constructors are implicitly private.
    TextRenderMode(int code) {
        this.code = code;
    }

    /**
     * Returns the numeric code of this render mode.
     *
     * @return the integer code given at declaration (0-7)
     */
    public int getValue() {
        return code;
    }
}
public class CustomizedTextExtractionStrategy implements   TextExtractionStrategy {
// Accumulates the output: font/size header lines followed by the text of each chunk.
private StringBuilder result = new StringBuilder();
// Properties of the most recently rendered chunk, used by renderText to detect
// when the baseline, font, or font size has changed.
// Baseline start point of the previous chunk; null until the first chunk is rendered.
private Vector lastBaseLine;
// Font name of the previous chunk (with "-Bold" appended for faux bold).
private String lastFont;
// Font size of the previous chunk, measured as baseline-to-ascent height.
private float lastFontSize;
/** Called at the end of a text block; this strategy takes no action here. */
@Override
public void endTextBlock() {
    // Intentionally empty: nothing needs to be finalized per text block.
}
/**
 * Called when an image is rendered; this strategy ignores images.
 *
 * @param imageRenderInfo information about the rendered image (unused)
 */
@Override
public void renderImage(ImageRenderInfo imageRenderInfo) {
    // Intentionally empty: only text is collected by this strategy.
}
/**
 * Appends one rendered text chunk to the accumulated result.
 *
 * <p>Whenever the baseline, the font name, or the font size differs from the
 * previous chunk (or this is the first chunk), a header line of the form
 * {@code <font>-<size>} followed by a line separator is written before the
 * chunk's text. A baseline change additionally emits a line separator first,
 * on the assumption that a new baseline means a new line.
 *
 * <p>Font names are compared by content with {@code equals}. Comparing the
 * references with {@code !=} reports a change for every chunk whose name was
 * built by concatenation (the "-Bold" case), which inserts header lines in the
 * middle of running text.
 *
 * @param renderInfo information about the text chunk being rendered
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    String curFont = renderInfo.getFont().getPostscriptFontName();
    // Fill-then-stroke rendering is treated as faux bold.
    if (renderInfo.getTextRenderMode() == TextRenderMode.FillThenStrokeText.getValue()) {
        curFont += "-Bold";
    }

    Vector curBaseline = renderInfo.getBaseline().getStartPoint();
    Vector topRight = renderInfo.getAscentLine().getEndPoint();
    Rectangle rect = new Rectangle(curBaseline.get(Vector.I1),
            curBaseline.get(Vector.I2), topRight.get(Vector.I1),
            topRight.get(Vector.I2));
    // Font size is approximated by the distance from the baseline to the ascent line.
    float curFontSize = rect.getHeight();

    boolean firstChunk = (this.lastBaseLine == null);
    boolean baselineChanged = !firstChunk
            && curBaseline.get(Vector.I2) != this.lastBaseLine.get(Vector.I2);
    // Null-safe content comparison; reference comparison of Strings is unreliable.
    boolean fontChanged = (curFont == null)
            ? (this.lastFont != null)
            : !curFont.equals(this.lastFont);

    if (firstChunk || baselineChanged || curFontSize != this.lastFontSize || fontChanged) {
        // A changed baseline is assumed to mean the text moved to a new line.
        if (baselineChanged) {
            this.result.append(System.getProperty("line.separator"));
        }
        // Write the "<font>-<size>" header line for the text that follows.
        this.result.append(curFont + "-" + curFontSize
                + System.getProperty("line.separator"));
    }

    // Append the current text
    this.result.append(renderInfo.getText());

    // Remember the properties of this chunk for the next comparison.
    this.lastBaseLine = curBaseline;
    this.lastFontSize = curFontSize;
    this.lastFont = curFont;
}
/**
 * Prints the accumulated output to standard output, one space-separated token
 * per line, and returns a fixed status message.
 *
 * <p>NOTE(review): the returned string is a status message, not the extracted
 * text; callers that need the text itself only see it on the console. Confirm
 * whether returning {@code result.toString()} was intended.
 *
 * @return the literal message {@code "Texts written on console successfully"}
 */
@Override
public String getResultantText() {
    final String[] tokens = this.result.toString().split(" ");
    int index = 0;
    while (index < tokens.length) {
        System.out.println(tokens[index]);
        index++;
    }
    return "Texts written on console successfully";
}

/** Called at the start of a text block; prints a banner line to standard output. */
@Override
public void beginTextBlock() {
    final String banner = "************** PDF Extraction Starts **************";
    System.out.println(banner);
}

import com.itextpdf.text.pdf.parser.ImageRenderInfo；
导入com.itextpdf.text.pdf.parser.TextExtractionStrategy；
导入com.itextpdf.text.pdf.parser.TextRenderInfo；
导入com.itextpdf.text.pdf.parser.Vector；
导入com.itextpdf.text.Rectangle；
枚举TextRenderMode{FillText（0）、StrokeText（1）、FillThenStrokeText（2）、Invisible（3）、FillTextAndAddToPathForClipping（4）、StrokeTextAndAddToPathForClipping（5）、FillThenStrokeTextAndAddToPathForClipping（6）、AddTextTopAddTopForClipping（7）；
私有int值；
私有TextRenderMode（int值）{
这个值=值；
}
public int getValue（）{
返回值；
}
}
公共类CustomizedTextExtractionStrategy实现TextExtractionStrategy{
私有StringBuilder结果=新建StringBuilder（）；
//存储上次使用的属性
专用向量机基线；
私有字符串字体；
私人股本规模；
@凌驾
public void endTextBlock（）{
//TODO自动生成的方法存根
}
@凌驾
公共void渲染图像（ImageRenderInfo ImageRenderInfo）{
//TODO自动生成的方法存根
}
@凌驾
公共void renderText（TextRenderInfo renderInfo）{
//TODO自动生成的方法存根
字符串curFont=renderInfo.getFont（）.getPostscriptFontName（）；
//检查是否使用了faux bold
TextRenderMode模式=TextRenderMode.FillThenStroketText；
int modeValue=mode.getValue（）；
if（（renderInfo.getTextEnderMode（）==modeValue））{
curFont+=“-粗体”；
}
//这段代码假设，如果基线发生变化，那么我们就处于
//新线
向量路缘基线=renderInfo.getBaseline（）.getStartPoint（）；
Vector topRight=renderInfo.getAscentLine（）.getEndPoint（）；
矩形rect=新矩形（curBaseline.get（Vector.I1），
路缘基线.get（Vector.I2），右上角.get（Vector.I1），
get（Vector.I2））；
float curFontSize=rect.getHeight（）；
//查看是否有更改，基线、字体或
//字号
if（（this.lastBaseLine==null）
||（curBaseline.get（Vector.I2）！=lastBaseLine.get（Vector.I2））
||（curFontSize！=lastFontSize）| |（curFont！=lastFont））{
//如果我们至少放下一个跨度标签，关闭它
如果（（this.lastBaseLine！=null））{
//this.result.AppendLine（“”）；
}
//如果基线已更改，则插入换行符
if（（this.lastBaseLine！=null）
&&curBaseline.get（Vector.I2）！=lastBaseLine
.get（Vector.I2））{
this.result.append（System.getProperty（“line.separator”）；
}
//创建具有适当样式的HTML标记
this.result.append（curFont+“-”+curFontSize
+System.getProperty（“line.separator”）；
}
//附加当前文本
append（renderInfo.getText（））；
//设置当前使用的属性
this.lastBaseLine=路缘基线；
this.lastFontSize=curFontSize；
this.lastFont=curFont；
}
@凌驾
公共字符串getResultantText（）{
//TODO自动生成的方法存根
字符串字[]=result.toString（）.split（“”）；
for（字符串字：字）{
System.out.println（word）；
}
返回“控制台写入文本成功”；
}
@凌驾
public void beginTextBlock（）{
//TODO自动生成的方法存根
系统输出
.println（“****************PDF提取开始****************”）；
}

}

请分享您的代码，并举例说明出错的地方和您希望实现的目标