Java PDFBox在特定pdf文档中获取错误的文本位置
上下文 我一直在开发一个程序,该程序可以获取pdf,突出显示一些单词(通过pdfbox标记注释),并保存新的pdf 为此,我扩展了PDFTextStripper类,以覆盖writeString()方法并获取每个单词(框)的坐标,这样我就可以准确地知道文本在PDF文档中的坐标位置(TextPosition对象为我提供每个单词框的坐标)。然后,在此基础上,我绘制一个矩形(PDRectangle)来突出显示我想要的单词 问题 它适用于我迄今为止尝试过的所有文档,但有一个文档除外,我从TextPosition获得的位置似乎是错误的,导致了错误的突出显示 这是原始文档:Java PDFBox在特定pdf文档中获取错误的文本位置,java,pdf,pdfbox,Java,Pdf,Pdfbox,上下文 我一直在开发一个程序,该程序可以获取pdf,突出显示一些单词(通过pdfbox标记注释),并保存新的pdf 为此,我扩展了PDFTextStripper类,以覆盖writeString()方法并获取每个单词(框)的坐标,这样我就可以准确地知道文本在PDF文档中的坐标位置(TextPosition对象为我提供每个单词框的坐标)。然后,在此基础上,我绘制一个矩形(PDRectangle)来突出显示我想要的单词 问题 它适用于我迄今为止尝试过的所有文档,但有一个文档除外,我从TextPosition获得的位置似乎是错误的,导致了错误的突出显示 这是原始文档: 这是writeS
这是writeString()提供给我的第一个字框中有突出显示的文档,它是MicroRNA:
它应该高亮显示MicroRNA,但它高亮显示了其上方的空白区域(粉色HL矩形) 这是writeString()提供给我的第一个字框中有突出显示的文档,该单词是"Original":
它应该突出显示单词"Original",但它突出显示了PDF文档最开始的空白区域(粉色HL矩形) 我想,这个PDF可能包含PDFBox很难找到正确位置的内容,或者这可能是PDFBox中的一个bug 技术规格: PDFBox 2.0.17
Java 11.0.6+10,采用OpenJDK
MacOS Catalina 10.15.4、16gb、x86_64 坐标值 例如,对于MicroRNA单词框的开始和结束,writeString()给我的TextPosition坐标是: 字母M 字母A 它会导致我在上面分享的错误HL注释,而对于所有其他PDF文档,这是非常精确的,我已经测试了许多不同的文档。我在这里毫无头绪,也不是PDF定位方面的专家。我尝试使用PDFBox调试器工具,但无法正确阅读。在此,任何帮助都将不胜感激。如果我能提供更多的证据,请告诉我。谢谢 编辑 请注意,文本提取工作正常 我的代码 首先,我创建了一个坐标数组,其中包含了我要突出显示的单词的第一个和最后一个字符的TextPosition对象中的一些值:
/**
 * Packs the position data of one word into a flat {@code double[]} so it can be
 * used later to draw a highlight.
 *
 * @param firstPosition text position of the word's first character
 * @param lastPosition  text position of the word's last character
 * @param pageNumber    page the word belongs to; the drawing code compares it
 *                      against {@code pageIndex + 1}, i.e. it is treated as 1-based
 */
private void extractHLCoordinates(TextPosition firstPosition, TextPosition lastPosition, int pageNumber) {
// Left edge and Y come from the first character ...
double firstPositionX = firstPosition.getX();
double firstPositionY = firstPosition.getY();
// ... while the right edge (end X) and second Y come from the last character.
double lastPositionEndX = lastPosition.getEndX();
double lastPositionY = lastPosition.getY();
// Height, width and rotation are read from the first character only.
double height = firstPosition.getHeight();
double width = firstPosition.getWidth();
int rotation = firstPosition.getRotation();
// Slot layout: [0] firstX, [1] firstY, [2] lastEndX, [3] lastY, [4] pageNumber,
// [5] height, [6] width, [7] rotation (the int values are widened to double).
double[] wordCoordinates = {firstPositionX, firstPositionY, lastPositionEndX, lastPositionY, pageNumber,
height, width, rotation};
// The rest of the method is elided in this excerpt.
...
}
现在根据提取的坐标进行绘制:
// Walks every page of the document and, for each stored word box that belongs
// to that page, builds a highlight (text markup) annotation from its coordinates.
for (int pageIndex = 0; pageIndex < pdDocument.getNumberOfPages(); pageIndex++) {
    PDPage page = pdDocument.getPage(pageIndex);
    List<PDAnnotation> annotations = page.getAnnotations();
    int rotation;
    // Page size is taken from the MediaBox; Y values are flipped against this height below.
    double pageHeight = page.getMediaBox().getHeight();
    double pageWidth = page.getMediaBox().getWidth();
    // Each CoordinatePoint holds the double array with the coordinates of one
    // word to highlight (layout: firstX, firstY, lastEndX, lastY, pageNumber,
    // height, width, rotation).
    for (CoordinatePoint coordinate : coordinates) {
        double[] wordCoordinates = coordinate.getCoordinates();
        int pageNumber = (int) wordCoordinates[4];
        // Skip coordinates that do not belong to the current page
        // (stored page numbers are compared against pageIndex + 1).
        if (pageNumber == (pageIndex + 1)) {
            // Rotation recorded with the word (slot 7).
            rotation = (int) wordCoordinates[7];
            // NOTE(review): these four variables are not declared in this excerpt;
            // presumably they are declared outside it - confirm.
            firstPositionX = wordCoordinates[0];
            firstPositionY = wordCoordinates[1];
            lastPositionEndX = wordCoordinates[2];
            lastPositionY = wordCoordinates[3];
            // Declared and initialised in one step; the original assigned the
            // value before the declaration, leaving the local unassigned.
            double height = wordCoordinates[5];
            double minX;
            double maxX;
            double minY;
            double maxY;
            if (rotation == 90) {
                double width = wordCoordinates[6];
                // Scale the stored width by the page's height/width ratio.
                width = (pageHeight * width) / pageWidth;
                // For rotation 90 the stored X and Y swap roles.
                maxX = firstPositionY;
                minX = firstPositionY - height;
                minY = firstPositionX;
                maxY = firstPositionX + width;
            } else {
                minX = firstPositionX;
                maxX = lastPositionEndX;
                // Stored Y values are subtracted from the page height to flip the axis.
                minY = pageHeight - firstPositionY;
                maxY = pageHeight - lastPositionY + height;
            }
            // Rectangle of the annotation.
            PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
            PDRectangle pdRectangle = new PDRectangle();
            pdRectangle.setLowerLeftX((float) minX);
            pdRectangle.setLowerLeftY((float) minY);
            pdRectangle.setUpperRightX((float) maxX);
            pdRectangle.setUpperRightY((float) ((float) maxY + height));
            txtMark.setRectangle(pdRectangle);
            // QuadPoints derived from the rectangle: top edge first, then bottom
            // edge, both shifted down by 2 units.
            float[] quads = new float[8];
            quads[0] = pdRectangle.getLowerLeftX(); // x1
            quads[1] = pdRectangle.getUpperRightY() - 2; // y1
            quads[2] = pdRectangle.getUpperRightX(); // x2
            quads[3] = quads[1]; // y2
            quads[4] = quads[0]; // x3
            quads[5] = pdRectangle.getLowerLeftY() - 2; // y3
            quads[6] = quads[2]; // x4
            quads[7] = quads[5]; // y4
            txtMark.setQuadPoints(quads);
            // The rest of the annotation setup is elided in this excerpt.
            ...
        }
    }
}
for(int pageIndex=0;pageIndex
您的四点(QuadPoints)坐标是相对于CropBox计算的,但它们需要相对于MediaBox。对于本文档,CropBox比MediaBox小,因此高亮不在正确的位置。使用CropBox.LLX-MediaBox.LLX调整x,使用MediaBox.URY-CropBox.URY调整y,高亮就会位于正确的位置。上面的调整适用于Rotate=0的页面。如果Rotate!=0,则可能需要根据PDFBox返回坐标的方式进行进一步调整(我对PDFBox不太熟悉)
/**
 * Packs the position data of one word into a flat {@code double[]} so it can be
 * used later to draw a highlight.
 *
 * @param firstPosition text position of the word's first character
 * @param lastPosition  text position of the word's last character
 * @param pageNumber    page the word belongs to; the drawing code compares it
 *                      against {@code pageIndex + 1}, i.e. it is treated as 1-based
 */
private void extractHLCoordinates(TextPosition firstPosition, TextPosition lastPosition, int pageNumber) {
// Left edge and Y come from the first character ...
double firstPositionX = firstPosition.getX();
double firstPositionY = firstPosition.getY();
// ... while the right edge (end X) and second Y come from the last character.
double lastPositionEndX = lastPosition.getEndX();
double lastPositionY = lastPosition.getY();
// Height, width and rotation are read from the first character only.
double height = firstPosition.getHeight();
double width = firstPosition.getWidth();
int rotation = firstPosition.getRotation();
// Slot layout: [0] firstX, [1] firstY, [2] lastEndX, [3] lastY, [4] pageNumber,
// [5] height, [6] width, [7] rotation (the int values are widened to double).
double[] wordCoordinates = {firstPositionX, firstPositionY, lastPositionEndX, lastPositionY, pageNumber,
height, width, rotation};
// The rest of the method is elided in this excerpt.
...
}
// Walks every page of the document and, for each stored word box that belongs
// to that page, builds a highlight (text markup) annotation from its coordinates.
for (int pageIndex = 0; pageIndex < pdDocument.getNumberOfPages(); pageIndex++) {
    PDPage page = pdDocument.getPage(pageIndex);
    List<PDAnnotation> annotations = page.getAnnotations();
    int rotation;
    // Page size is taken from the MediaBox; Y values are flipped against this height below.
    double pageHeight = page.getMediaBox().getHeight();
    double pageWidth = page.getMediaBox().getWidth();
    // Each CoordinatePoint holds the double array with the coordinates of one
    // word to highlight (layout: firstX, firstY, lastEndX, lastY, pageNumber,
    // height, width, rotation).
    for (CoordinatePoint coordinate : coordinates) {
        double[] wordCoordinates = coordinate.getCoordinates();
        int pageNumber = (int) wordCoordinates[4];
        // Skip coordinates that do not belong to the current page
        // (stored page numbers are compared against pageIndex + 1).
        if (pageNumber == (pageIndex + 1)) {
            // Rotation recorded with the word (slot 7).
            rotation = (int) wordCoordinates[7];
            // NOTE(review): these four variables are not declared in this excerpt;
            // presumably they are declared outside it - confirm.
            firstPositionX = wordCoordinates[0];
            firstPositionY = wordCoordinates[1];
            lastPositionEndX = wordCoordinates[2];
            lastPositionY = wordCoordinates[3];
            // Declared and initialised in one step; the original assigned the
            // value before the declaration, leaving the local unassigned.
            double height = wordCoordinates[5];
            double minX;
            double maxX;
            double minY;
            double maxY;
            if (rotation == 90) {
                double width = wordCoordinates[6];
                // Scale the stored width by the page's height/width ratio.
                width = (pageHeight * width) / pageWidth;
                // For rotation 90 the stored X and Y swap roles.
                maxX = firstPositionY;
                minX = firstPositionY - height;
                minY = firstPositionX;
                maxY = firstPositionX + width;
            } else {
                minX = firstPositionX;
                maxX = lastPositionEndX;
                // Stored Y values are subtracted from the page height to flip the axis.
                minY = pageHeight - firstPositionY;
                maxY = pageHeight - lastPositionY + height;
            }
            // Rectangle of the annotation.
            PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
            PDRectangle pdRectangle = new PDRectangle();
            pdRectangle.setLowerLeftX((float) minX);
            pdRectangle.setLowerLeftY((float) minY);
            pdRectangle.setUpperRightX((float) maxX);
            pdRectangle.setUpperRightY((float) ((float) maxY + height));
            txtMark.setRectangle(pdRectangle);
            // QuadPoints derived from the rectangle: top edge first, then bottom
            // edge, both shifted down by 2 units.
            float[] quads = new float[8];
            quads[0] = pdRectangle.getLowerLeftX(); // x1
            quads[1] = pdRectangle.getUpperRightY() - 2; // y1
            quads[2] = pdRectangle.getUpperRightX(); // x2
            quads[3] = quads[1]; // y2
            quads[4] = quads[0]; // x3
            quads[5] = pdRectangle.getLowerLeftY() - 2; // y3
            quads[6] = quads[2]; // x4
            quads[7] = quads[5]; // y4
            txtMark.setQuadPoints(quads);
            // The rest of the annotation setup is elided in this excerpt.
            ...
        }
    }
}
...
// Computes the highlight rectangle bounds from the stored word coordinates,
// with a separate mapping for text whose recorded rotation is 90.
if (rotation == 90) {
    // Stored width scaled by the page's height/width ratio.
    double scaledWidth = (pageHeight * wordCoordinates[6]) / pageWidth;
    // The stored Y drives the horizontal extent, the stored X the vertical one.
    minX = firstPositionY - height;
    maxX = firstPositionY;
    minY = firstPositionX;
    maxY = firstPositionX + scaledWidth;
} else {
    // Vertical extent: stored Y values flipped against the page height.
    minY = pageHeight - firstPositionY;
    maxY = pageHeight - lastPositionY + height;
    // Horizontal extent: from the first character's X to the last character's end X.
    minX = firstPositionX;
    maxX = lastPositionEndX;
}
...
...
// Computes the highlight rectangle bounds, shifting the text coordinates by the
// offset between the page's CropBox and MediaBox so the highlight lands on the
// text even when the two boxes differ.
PDRectangle mediaBox = page.getMediaBox();
PDRectangle cropBox = page.getCropBox();
if (rotation == 90) {
    double width = wordCoordinates[6];
    // Stored width scaled by the page's height/width ratio.
    width = (pageHeight * width) / pageWidth;
    // NOTE(review): no CropBox/MediaBox offset is applied in this branch;
    // rotated pages may need a further adjustment - confirm.
    maxX = firstPositionY;
    minX = firstPositionY - height;
    minY = firstPositionX;
    maxY = firstPositionX + width;
} else {
    // Horizontal offset between the left edges of the two boxes. The original
    // subtracted mediaBox.getLowerLeftY() here, mixing a Y value into an X offset.
    double cropOffsetX = (double) cropBox.getLowerLeftX() - mediaBox.getLowerLeftX();
    // Vertical offset between the top edges of the two boxes.
    double cropOffsetY = (double) mediaBox.getUpperRightY() - cropBox.getUpperRightY();
    // NOTE(review): the result is relative to the MediaBox's lower-left corner;
    // this presumably assumes that corner is the origin - confirm.
    minX = firstPositionX + cropOffsetX;
    maxX = lastPositionEndX + cropOffsetX;
    minY = pageHeight - firstPositionY - cropOffsetY;
    maxY = pageHeight - lastPositionY + height - cropOffsetY;
}
...