Java PDFbox未在android上提取区域
我试图只提取PDF文档上突出显示的文本。它在PC上工作,但当我在Android上使用它时,它失败了。PDFBox不能直接在android上工作,所以我在android上使用Birdbrain2/PDFBox android 下面是PC代码Java PDFbox未在android上提取区域,java,android,pdfbox,Java,Android,Pdfbox,我试图只提取PDF文档上突出显示的文本。它在PC上工作,但当我在Android上使用它时,它失败了。PDFBox不能直接在android上工作,所以我在android上使用Birdbrain2/PDFBox android 下面是PC代码 import java.awt.geom.Rectangle2D; import java.io.File; import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; import
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;
/**
 * Extracts the text covered by the annotations of a PDF document using
 * PDFBox's {@code PDFTextStripperByArea}.
 *
 * <p>Every annotation on every page is processed; the code does not filter by
 * annotation subtype, so non-highlight annotations contribute text as well.
 */
public class ExtractHighlights {

    /** Name under which the single per-annotation region is registered. */
    private static final String REGION_NAME = "0";

    public static void main(String args[]) {
        System.out.println(extractHighlights("sample.pdf"));
    }

    /**
     * Collects the text found under each annotation rectangle of the document.
     *
     * @param fileName path of the PDF file to read
     * @return the extracted snippets, each followed by a single space; if an
     *         error occurs, the stack trace is printed and whatever was
     *         collected up to that point is returned (possibly empty)
     */
    public static String extractHighlights(String fileName) {
        StringBuilder extractedText = new StringBuilder();
        PDDocument pddDocument = null;
        try {
            pddDocument = PDDocument.load(new File(fileName));
            List<?> allPages = pddDocument.getDocumentCatalog().getAllPages();
            for (Object pageObject : allPages) {
                PDPage page = (PDPage) pageObject;
                List<PDAnnotation> la = page.getAnnotations();
                if (la.isEmpty()) {
                    continue;
                }
                for (PDAnnotation pda : la) {
                    PDRectangle rect = pda.getRectangle();
                    if (rect == null) {
                        // An annotation without a rectangle has no area to extract from.
                        continue;
                    }
                    String highlight = extractRegionText(page, rect);
                    if (highlight.length() == 0) {
                        continue;
                    }
                    // The last two characters are dropped, as in the original code.
                    // Math.max keeps a one-character snippet from throwing
                    // StringIndexOutOfBoundsException and aborting the whole run.
                    int end = Math.max(0, highlight.length() - 2);
                    extractedText.append(highlight, 0, end).append(' ');
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Close the document even when extraction failed half-way.
            if (pddDocument != null) {
                try {
                    pddDocument.close();
                } catch (Exception ex) {
                    ex.printStackTrace();
                }
            }
        }
        return extractedText.toString();
    }

    /** Returns the trimmed text located inside the given annotation rectangle of the page. */
    private static String extractRegionText(PDPage page, PDRectangle rect) throws Exception {
        // A fresh stripper per annotation so regions never accumulate between calls.
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.setSortByPosition(true);

        float x = rect.getLowerLeftX();
        float y = rect.getUpperRightY();
        if (page.findRotation() == 0) {
            // Annotation rectangles are measured from the bottom of the page,
            // the extraction region from the top: flip the y coordinate.
            y = page.findMediaBox().getHeight() - y;
        }
        Rectangle2D.Float awtRect =
                new Rectangle2D.Float(x, y, rect.getWidth(), rect.getHeight());

        stripper.addRegion(REGION_NAME, awtRect);
        stripper.extractRegions(page);
        return stripper.getTextForRegion(REGION_NAME).trim();
    }
}
导入java.awt.geom.Rectangle2D;
导入java.io.File;
导入java.util.List;
导入org.apache.pdfbox.pdmodel.PDDocument;
导入org.apache.pdfbox.pdmodel.PDPage;
导入org.apache.pdfbox.pdmodel.common.PDRectangle;
导入org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
导入org.apache.pdfbox.util.PDFTextStripperByArea;
公开课精选{
公共静态void main(字符串args[]){
System.out.println(extractHighlights(“sample.pdf”);
}
公共静态字符串提取器突出显示(字符串文件名){
字符串extractedText=“”;
试一试{
PDDocument pddDocument=PDDocument.load(新文件(文件名));
List allPages=pddDocument.getDocumentCatalog().getAllPages();
对于(inti=0;i
以下是无法正常工作的 Android 代码:
/**
 * Extracts the text covered by every annotation of a PDF file.
 *
 * <p>Before each page is processed, progress is published as
 * (zero-based page index, total page count).
 *
 * @param strings {@code strings[0]} is the path of the PDF file to read
 * @return the extracted snippets, each followed by a single space; if an
 *         error occurs, the stack trace is printed and whatever was collected
 *         up to that point is returned (possibly empty)
 */
@Override
protected String doInBackground(String... strings) {
    StringBuilder extractedText = new StringBuilder();
    PDDocument pddDocument = null;
    try {
        Log.i("ExtractHighlights", "Started");
        pddDocument = PDDocument.load(new File(strings[0]));
        PDPageTree allPages = pddDocument.getDocumentCatalog().getPages();
        int totalPages = allPages.getCount();
        int pageNumber = 0;
        for (PDPage page : allPages) {
            publishProgress(pageNumber++, totalPages);
            Log.i("ExtractHighlights", "Reading page");
            List<PDAnnotation> la = page.getAnnotations();
            if (la.isEmpty()) {
                continue;
            }
            for (PDAnnotation pda : la) {
                Log.i("ExtractHighlights", "Annotation found");
                // A fresh stripper per annotation so regions never accumulate.
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition(true);
                Log.i("ExtractHighlights", "Getting rectangle");
                PDRectangle rect = pda.getRectangle();
                if (rect == null) {
                    // An annotation without a rectangle has no area to extract from.
                    continue;
                }
                float left = rect.getLowerLeftX();
                float top = rect.getUpperRightY();
                if (page.getRotation() == 0) {
                    // Annotation rectangles are measured from the bottom of the
                    // page, the extraction region from the top: flip the y
                    // coordinate (same conversion as the desktop version).
                    top = page.getMediaBox().getHeight() - top;
                }
                // RectF takes (left, top, right, bottom) edges, not
                // (x, y, width, height) like java.awt's Rectangle2D.Float.
                RectF region = new RectF(left, top,
                        left + rect.getWidth(), top + rect.getHeight());
                stripper.addRegion("0", region);
                Log.i("ExtractHighlights", "Extracting regions");
                stripper.extractRegions(page);
                Log.i("ExtractHighlights", "Getting text from region");
                String highlight = stripper.getTextForRegion("0").trim();
                Log.i("ExtractHighlights", highlight);
                if (highlight.length() == 0) {
                    continue;
                }
                // The last two characters are dropped, as in the original code.
                // Math.max keeps a one-character snippet from throwing
                // StringIndexOutOfBoundsException and aborting the whole run.
                int end = Math.max(0, highlight.length() - 2);
                extractedText.append(highlight, 0, end).append(' ');
            }
            Log.i("ExtractHighlights", "Page done");
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        // Close the document even when extraction failed half-way.
        if (pddDocument != null) {
            try {
                pddDocument.close();
                Log.i("ExtractHighlights", "Document closed");
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    }
    return extractedText.toString();
}
@覆盖
受保护的字符串背景(字符串…字符串){
字符串extractedText=“”;
试一试{
Log.i(“提取亮点”、“开始”);
PDDocument pddDocument=PDDocument.load(新文件(字符串[0]);
PDPageTree allPages=pddDocument.getDocumentCatalog().getPages();
int totalPages=allPages.getCount();
int pageNumber=0;
用于(PDPage页面:所有页面){
出版进度(页码++,总页数);
Log.i(“摘录摘要”、“阅读页”);
List la=page.getAnnotations();
如果(la.size()<1){
继续;
}
用于(pda注释pda:la){
Log.i(“提取亮点”、“找到注释”);
PDFTextStripperByArea剥离器=新的PDFTextStripperByArea();
脱扣器。设置端口BYPOSITION(真);
i(“提取高光”、“获取矩形”);
PDRectangle rect=pda.getRectangle();
float x=rect.getLowerLeftX();
float y=rect.getUpperRightY();
float width=rect.getWidth();
浮动高度=rect.getHeight();
RectF区域=新的RectF(x,y,宽度,高度);
剥离器。添加区域(“0”,区域);
Log.i(“提取高光”、“提取区域”);
汽提塔区域(第页);
Log.i(“提取突出显示”、“从区域获取文本”);
String highlight=stripper.getTextForRegion(“0”).trim();
Log.i(“提取高光”,高光);
如果(highlight.length()==0)继续;
extractedText+=highlight.substring(0,highlight.length()-2)+”;
}
Log.i(“摘录亮点”、“页面完成”);
}
pddDocument.close();
Log.i(“摘录摘要”、“文件关闭”);
}捕获(例外情况除外){
例如printStackTrace();
}
返回提取的文本;
}
而且它在 Android 上运行需要很长时间,以至于 Android 认为程序已经崩溃。
我可以尝试将整个PDF转换为文本,但如何知道突出显示了哪些文本?正如@mkl所指出的,这是一个已知的错误。
所以我用了盖伊的 PDFBox 移植版。成功了。 Birdbrain2/PdfBox 存在一个问题:“#5 PDFTextStripperByArea 无法提取文本(字体问题?)”,由 RainHeart257 于 7 个月前提交。也许你的问题与此相关? 是的,这就是问题所在。你能给我推荐一些 PDFBox 以外的 API 吗? 我很高兴它对你有用。然而,该项目已经停止维护一年了。所以,也要密切关注 Birdbrain2,也许几个月后再试试 :-)