Java 使用PDFClown突出显示文本,而不使用PDF批注

Java 使用PDFClown突出显示文本,而不使用PDF批注,java,pdf,pdfclown,Java,Pdf,Pdfclown,几周前我开始使用PDFClown。我的目的是多词突出,主要是在报纸上。从org.pdfclown.samples.cli.TextHighlightSample示例开始,我成功地提取了多个单词的位置并突出显示它们。在大多数情况下,由于文本排序和匹配,我甚至解决了一些问题 遗憾的是,我的框架包含了它并没有考虑 PDFANNETMENT/。因此,页面内容流之外的所有内容(如文本注释和其他所谓的标记注释)都会丢失 因此,有没有关于使用PdfClown创建“文本突出显示”而不使用PDF批注的建议?在批注

几周前我开始使用PDFClown。我的目的是多词突出,主要是在报纸上。从
org.pdfclown.samples.cli.TextHighlightSample
示例开始,我成功地提取了多个单词的位置并突出显示它们。在大多数情况下,由于文本排序和匹配,我甚至解决了一些问题

遗憾的是,我的框架所包含的组件并不处理 PDF 批注(PdfAnnotation)。因此,页面内容流之外的所有内容(如文本批注和其他所谓的标记批注)都会丢失


因此,有没有关于使用PdfClown创建“文本突出显示”而不使用PDF批注的建议?

要让突出显示不出现在批注中,而是出现在实际的页面内容流中,就必须把图形命令放入页面内容流;而在
org.pdfclown.samples.cli.TextHighlightSample
示例中,这些图形命令是被隐式放入批注的正常外观流中的

这可以这样实现:

// Load the document from `resource` and prepare a case-insensitive search for "S".
org.pdfclown.files.File file = new org.pdfclown.files.File(resource);
Pattern pattern = Pattern.compile("S", Pattern.CASE_INSENSITIVE);
TextExtractor textExtractor = new TextExtractor(true, true);

for (final Page page : file.getDocument().getPages())
{
    // Quads of every match on this page; filled by the filter callback, drawn afterwards.
    final List<Quad> pageQuads = new ArrayList<Quad>();

    Map<Rectangle2D, List<ITextString>> extractedText = textExtractor.extract(page);
    final Matcher hits = pattern.matcher(TextExtractor.toString(extractedText));

    // Hands the regex hits to the extractor as intervals and records the area of each match.
    TextExtractor.IIntervalFilter hitCollector = new TextExtractor.IIntervalFilter()
    {
        @Override
        public boolean hasNext()
        {
            // Advances the matcher; next() then reports the hit found here.
            return hits.find();
        }

        @Override
        public Interval<Integer> next()
        {
            return new Interval<Integer>(hits.start(), hits.end());
        }

        @Override
        public void process(Interval<Integer> interval, ITextString match)
        {
            // Grow one rectangle over consecutive characters. A character whose Y lies
            // beyond the current rectangle's max Y (presumably a line break) closes the
            // current rectangle and opens a new one.
            Rectangle2D runBox = null;
            for (TextChar matchedChar : match.getTextChars())
            {
                Rectangle2D charBox = matchedChar.getBox();
                if (runBox == null)
                {
                    runBox = (Rectangle2D) charBox.clone();
                }
                else if (charBox.getY() > runBox.getMaxY())
                {
                    pageQuads.add(Quad.get(runBox));
                    runBox = (Rectangle2D) charBox.clone();
                }
                else
                {
                    runBox.add(charBox);
                }
            }
            // Record the rectangle that was still open when the characters ran out.
            pageQuads.add(Quad.get(runBox));
        }

        @Override
        public void remove()
        {
            throw new UnsupportedOperationException();
        }
    };
    textExtractor.filter(extractedText, hitCollector);

    // Graphics state for the highlight: alpha-shape flag off, Multiply blend mode.
    ExtGState multiplyState = new ExtGState(file.getDocument());
    multiplyState.setAlphaShape(false);
    multiplyState.setBlendMode(Arrays.asList(BlendModeEnum.Multiply));

    // Draw directly into the page content, positioned at the end of the existing stream.
    PrimitiveComposer composer = new PrimitiveComposer(page);
    composer.getScanner().moveEnd();
    // TODO: reset graphics state here.
    composer.applyState(multiplyState);
    // Fill colour components (1, 1, 0) in DeviceRGB.
    composer.setFillColor(new DeviceRGBColor(1, 1, 0));
    for (Quad quad : pageQuads)
    {
        Point2D[] corners = quad.getPoints();
        // The control points are pushed outwards by a quarter of the quad's height.
        double margin = (corners[3].getY() - corners[0].getY()) * .25;

        // Curve along the side spanned by corners 3 and 0.
        Point2D.Double p3Copy = new Point2D.Double(corners[3].getX(), corners[3].getY());
        Point2D.Double p0Copy = new Point2D.Double(corners[0].getX(), corners[0].getY());
        Point2D.Double p3Offset = new Point2D.Double(corners[3].getX() - margin, corners[3].getY() - margin);
        Point2D.Double p0Offset = new Point2D.Double(corners[0].getX() - margin, corners[0].getY() + margin);
        composer.drawCurve(p3Copy, p0Copy, p3Offset, p0Offset);

        // Straight segment to corner 1.
        Point2D.Double p1Copy = new Point2D.Double(corners[1].getX(), corners[1].getY());
        composer.drawLine(p1Copy);

        // Curve along the side spanned by corners 1 and 2.
        Point2D.Double p2Copy = new Point2D.Double(corners[2].getX(), corners[2].getY());
        Point2D.Double p1Offset = new Point2D.Double(corners[1].getX() + margin, corners[1].getY() + margin);
        Point2D.Double p2Offset = new Point2D.Double(corners[2].getX() + margin, corners[2].getY() - margin);
        composer.drawCurve(p2Copy, p1Offset, p2Offset);

        composer.fill();
    }
    composer.flush();
}

// Write the result into RESULT_FOLDER using the Incremental serialization mode.
File outputFile = new File(RESULT_FOLDER, "multiPage-highlight-content.pdf");
file.save(outputFile, SerializationModeEnum.Incremental);
org.pdfclown.files.File File=new org.pdfclown.files.File(资源);
Pattern=Pattern.compile(“S”,Pattern.CASE\u不区分大小写);
TextExtractor TextExtractor=新的TextExtractor(真,真);
对于(最终页面:file.getDocument().getPages())
{
最终列表highlightQuads=新的ArrayList();
Map textStrings=textextextractor.extract(第页);
final Matcher Matcher=pattern.Matcher(textextextractor.toString(textStrings));
textExtractor.filter(TextString,新的textExtractor.IIntervalFilter()
{
@凌驾
公共布尔hasNext()
{
返回matcher.find();
}
@凌驾
公共间隔下一个()
{
返回新的间隔(matcher.start(),matcher.end());
}
@凌驾
公共作废进程(间隔、ITextString匹配)
{
{
矩形2D文本框=空;
for(TextChar:match.getTextChars())
{
矩形2D textCharBox=textChar.getBox();
if(textBox==null)
{
textBox=(矩形2D)textCharBox.clone();
}
其他的
{
if(textCharBox.getY()>textBox.getMaxY())
{
highlightQuads.add(Quad.get(textBox));
textBox=(矩形2D)textCharBox.clone();
}
其他的
{
textBox.add(textCharBox);
}
}
}
highlightQuads.add(Quad.get(textBox));
}
}
@凌驾
公共空间删除()
{
抛出新的UnsupportedOperationException();
}
});
//突出显示文本模式匹配!
ExtGState defaultExtGState=新的ExtGState(file.getDocument());
defaultExtGState.setAlphaShape(false);
defaultExtGState.setBlendMode(Arrays.asList(BlendModeEnum.Multiply));
PrimitiveComposer composer=新的PrimitiveComposer(第页);
composer.getScanner().moveEnd();
//TODO:在此重置图形状态。
composer.applyState(defaultExtGState);
setFillColor(新设备GBColor(1,1,0));
{
用于(四边形标记框:高亮四边形)
{
Point2D[]points=markupBox.getPoints();
双标记框高度=点[3]。getY()-点[0]。getY();
双标记边界=标记边界高度*.25;
composer.drawCurve(新的Point2D.Double(点[3].getX(),点[3].getY()),
新的Point2D.Double(点[0].getX(),点[0].getY()),
新的Point2D.Double(点[3].getX()-markupBoxMargin,点[3].getY()-markupBoxMargin),
新的Point2D.Double(点[0].getX()-markupBoxMargin,点[0].getY()+markupBoxMargin));
绘制线(新的Point2D.Double(点[1].getX(),点[1].getY());
composer.drawCurve(新的Point2D.Double(点[2].getX(),点[2].getY()),
新的Point2D.Double(点[1].getX()+markupBoxMargin,点[1].getY()+markupBoxMargin),
新的Point2D.Double(点[2].getX()+markupBoxMargin,点[2].getY()-markupBoxMargin));
composer.fill();
}
}
composer.flush();
}
保存(新文件(RESULT_文件夹,“multiPage highlight content.pdf”)、SerializationModeEnum.Incremental);
(方法testHighlightInContent)

您会认出原始示例中的文本提取框架。区别只是:现在先收集整个页面中的所有四边形(quad),然后再统一处理;处理代码(主要借用自
TextMarkup.refreshAppearance()
)把表示这些四边形的图形绘制到页面内容中


注意,要使其正常工作,必须在插入新指令之前重置图形状态(该位置标有
TODO
注释)。这可以通过应用保存/恢复状态,或者逐项抵消不需要的图形状态更改来实现。不幸的是,我没有找到在 PDF Clown 中实现前者的办法,也没有时间实现后者。

要让突出显示不出现在批注中,而是出现在实际的页面内容流中,就必须把图形命令放入页面内容流;而在
org.pdfclown.samples.cli.TextHighlightSample
示例中,这些图形命令是被隐式放入批注的正常外观流中的

这可以这样实现:

// Load the document from `resource` and prepare a case-insensitive search for "S".
org.pdfclown.files.File file = new org.pdfclown.files.File(resource);
Pattern pattern = Pattern.compile("S", Pattern.CASE_INSENSITIVE);
TextExtractor textExtractor = new TextExtractor(true, true);

for (final Page page : file.getDocument().getPages())
{
    // Quads of every match on this page; filled by the filter callback, drawn afterwards.
    final List<Quad> pageQuads = new ArrayList<Quad>();

    Map<Rectangle2D, List<ITextString>> extractedText = textExtractor.extract(page);
    final Matcher hits = pattern.matcher(TextExtractor.toString(extractedText));

    // Hands the regex hits to the extractor as intervals and records the area of each match.
    TextExtractor.IIntervalFilter hitCollector = new TextExtractor.IIntervalFilter()
    {
        @Override
        public boolean hasNext()
        {
            // Advances the matcher; next() then reports the hit found here.
            return hits.find();
        }

        @Override
        public Interval<Integer> next()
        {
            return new Interval<Integer>(hits.start(), hits.end());
        }

        @Override
        public void process(Interval<Integer> interval, ITextString match)
        {
            // Grow one rectangle over consecutive characters. A character whose Y lies
            // beyond the current rectangle's max Y (presumably a line break) closes the
            // current rectangle and opens a new one.
            Rectangle2D runBox = null;
            for (TextChar matchedChar : match.getTextChars())
            {
                Rectangle2D charBox = matchedChar.getBox();
                if (runBox == null)
                {
                    runBox = (Rectangle2D) charBox.clone();
                }
                else if (charBox.getY() > runBox.getMaxY())
                {
                    pageQuads.add(Quad.get(runBox));
                    runBox = (Rectangle2D) charBox.clone();
                }
                else
                {
                    runBox.add(charBox);
                }
            }
            // Record the rectangle that was still open when the characters ran out.
            pageQuads.add(Quad.get(runBox));
        }

        @Override
        public void remove()
        {
            throw new UnsupportedOperationException();
        }
    };
    textExtractor.filter(extractedText, hitCollector);

    // Graphics state for the highlight: alpha-shape flag off, Multiply blend mode.
    ExtGState multiplyState = new ExtGState(file.getDocument());
    multiplyState.setAlphaShape(false);
    multiplyState.setBlendMode(Arrays.asList(BlendModeEnum.Multiply));

    // Draw directly into the page content, positioned at the end of the existing stream.
    PrimitiveComposer composer = new PrimitiveComposer(page);
    composer.getScanner().moveEnd();
    // TODO: reset graphics state here.
    composer.applyState(multiplyState);
    // Fill colour components (1, 1, 0) in DeviceRGB.
    composer.setFillColor(new DeviceRGBColor(1, 1, 0));
    for (Quad quad : pageQuads)
    {
        Point2D[] corners = quad.getPoints();
        // The control points are pushed outwards by a quarter of the quad's height.
        double margin = (corners[3].getY() - corners[0].getY()) * .25;

        // Curve along the side spanned by corners 3 and 0.
        Point2D.Double p3Copy = new Point2D.Double(corners[3].getX(), corners[3].getY());
        Point2D.Double p0Copy = new Point2D.Double(corners[0].getX(), corners[0].getY());
        Point2D.Double p3Offset = new Point2D.Double(corners[3].getX() - margin, corners[3].getY() - margin);
        Point2D.Double p0Offset = new Point2D.Double(corners[0].getX() - margin, corners[0].getY() + margin);
        composer.drawCurve(p3Copy, p0Copy, p3Offset, p0Offset);

        // Straight segment to corner 1.
        Point2D.Double p1Copy = new Point2D.Double(corners[1].getX(), corners[1].getY());
        composer.drawLine(p1Copy);

        // Curve along the side spanned by corners 1 and 2.
        Point2D.Double p2Copy = new Point2D.Double(corners[2].getX(), corners[2].getY());
        Point2D.Double p1Offset = new Point2D.Double(corners[1].getX() + margin, corners[1].getY() + margin);
        Point2D.Double p2Offset = new Point2D.Double(corners[2].getX() + margin, corners[2].getY() - margin);
        composer.drawCurve(p2Copy, p1Offset, p2Offset);

        composer.fill();
    }
    composer.flush();
}

// Write the result into RESULT_FOLDER using the Incremental serialization mode.
File outputFile = new File(RESULT_FOLDER, "multiPage-highlight-content.pdf");
file.save(outputFile, SerializationModeEnum.Incremental);
org.pdfclown.files.File File=new org.pdfclown.files.File(资源);
Pattern=Pattern.compile(“S”,Pattern.CASE\u不区分大小写);
TextExtractor TextExtractor=新文本