C# 这种方法隐含了一些假设,特别是:(a) 图像标题在图像之前绘制,以及 (b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF格式允许以任意顺序绘制内容,文本行也可以分段绘制。因此,这种方法可能适用于很多PDF,但也有很多PDF并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像。

C# 这种方法隐含了一些假设,特别是:(a) 图像标题在图像之前绘制,以及 (b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF格式允许以任意顺序绘制内容,文本行也可以分段绘制。因此,这种方法可能适用于很多PDF,但也有很多PDF并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像。,c#,itext,C#,Itext,这种方法隐含了一些假设,特别是:(a) 图像标题在图像之前绘制,以及 (b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF格式允许以任意顺序绘制内容,文本行也可以分段绘制。因此,这种方法可能适用于很多PDF,但也有很多PDF并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像。除此之外,我同意你的看法。我们处理的PDF都是标准格式的,所以这种方法对它们应该有效。我并不是说它适用于所有类型的PDF。“图像标题不必紧挨在图像之前。”——好的,更准确地说:必须先绘制标题,然后再绘制图像;并且在标题与其对应的图像之间,不得绘制其他图像。


这种方法隐含了一些假设,特别是:(a) 图像标题在图像之前绘制,以及 (b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF格式允许以任意顺序绘制内容,文本行也可以分段绘制。因此,这种方法可能适用于很多PDF,但也有很多PDF并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像。除此之外,我同意你的看法。我们处理的PDF都是标准格式的,所以这种方法对它们应该有效。我并不是说它适用于所有类型的PDF。“图像标题不必紧挨在图像之前。”——好的,更准确地说:必须先绘制标题,然后再绘制图像;并且在标题与其对应的图像之间,不得绘制其他图像。(有些PDF生成工具会在开头或结尾一次性绘制所有图像,因此即使是这些条件也不一定成立。)
/// <summary>
/// Render listener that writes the images encountered on a PDF page to disk.
/// Extraction can be restricted to a set of file types and, when an index greater
/// than zero is supplied, to the n-th image (1-based) that matches those types.
/// </summary>
public class ImageExtractor : IRenderListener
{
    int _currentPage = 1;
    int _imageCount = 0;
    int _index = 0;
    int _count = 0;
    readonly string _outputFilePrefix;
    readonly string _outputFolder;
    readonly bool _overwriteExistingFiles;
    string[] _fileTypes;

    /// <summary>Creates a listener with the given output settings.</summary>
    /// <param name="outputFilePrefix">Prefix used for every generated file name.</param>
    /// <param name="outputFolder">Folder the image files are written to.</param>
    /// <param name="overwriteExistingFiles">When false, files that already exist are left untouched.</param>
    /// <param name="fileTypes">File types to accept (compared case-insensitively); null accepts every type.</param>
    /// <param name="index">1-based position of the single matching image to extract; 0 extracts all matching images.</param>
    public ImageExtractor(string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes, int index)
    {
        _outputFilePrefix = outputFilePrefix;
        _outputFolder = outputFolder;
        _overwriteExistingFiles = overwriteExistingFiles;
        _fileTypes = fileTypes;
        _index = index;
    }

    /// <summary>
    /// Processes one page of the PDF at <paramref name="pdfPath"/> and extracts its images.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">File name prefix; when null, the PDF's file name without extension is used.</param>
    /// <param name="outputFolder">Target folder; when null or empty, the PDF's own directory is used.</param>
    /// <param name="overwriteExistingFiles">When false, existing files are not rewritten.</param>
    /// <param name="pageNumber">Number of the page to process.</param>
    /// <param name="index">1-based position of the matching image to extract; 0 extracts all matching images.</param>
    /// <param name="fileTypes">File types to accept; null accepts every type.</param>
    /// <returns>The listener's image counter after processing the page; 0 when the document has no pages.</returns>
    /// <exception cref="ApplicationException">The PDF is encrypted.</exception>
    public static int ExtractImageByIndex(string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, int pageNumber, int index, string[] fileTypes = null)
    {
        // Handle setting of any default values
        outputFilePrefix = outputFilePrefix ?? System.IO.Path.GetFileNameWithoutExtension(pdfPath);
        outputFolder = String.IsNullOrEmpty(outputFolder) ? System.IO.Path.GetDirectoryName(pdfPath) : outputFolder;

        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, index);
        instance._currentPage = pageNumber;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.NumberOfPages == 0)
                return 0;

            if (pdfReader.IsEncrypted())
                throw new ApplicationException(pdfPath + " is encrypted.");

            var pdfParser = new PdfReaderContentParser(pdfReader);

            pdfParser.ProcessContent(instance._currentPage, instance);
        }

        return instance._imageCount;
    }

    public void BeginTextBlock() { }
    public void EndTextBlock() { }
    public void RenderText(TextRenderInfo renderInfo) { }

    /// <summary>
    /// Called by the content parser for each image on the page. Writes the image to
    /// the output folder when it passes the file-type filter and, in index mode, when
    /// it is the requested n-th match.
    /// </summary>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // If _index is greater than 0, we're looking for a specific image. If _count is
        // equal to _index, we've already found it, so don't go any farther.
        if (_index > 0 && _count == _index)
            return;

        var imageObject = renderInfo.GetImage();

        // Same guard as SimpleMixedExtractionStrategy.RenderImage: nothing to extract
        // when no image object is available.
        if (imageObject == null)
            return;

        var fileType = imageObject.GetFileType();
        if (!IsAcceptedFileType(fileType))
            return;

        // If _index is 0, multiple images may be extracted. If _index is greater than 0,
        // _count is incremented for every image that matches the file-type filter and the
        // image is only extracted when _count equals _index.
        if (_index > 0)
        {
            _count++;
            if (_count != _index)
                return;
        }

        var imageFileName = String.Format("{0}_{1}_{2}.{3}", _outputFilePrefix, _currentPage, _imageCount, fileType);
        var imagePath = System.IO.Path.Combine(_outputFolder, imageFileName);

        if (_overwriteExistingFiles || !File.Exists(imagePath))
        {
            var imageRawBytes = imageObject.GetImageAsBytes();

            File.WriteAllBytes(imagePath, imageRawBytes);
        }

        // Subtle: Always increment even if file is not written. This keeps the numbering
        //   consistent when only some of a PDF file's images already exist on disk.
        _imageCount++;
    }

    // Returns true when no filter is configured or when fileType equals one of the
    // configured types. Ordinal, case-insensitive comparison avoids culture-specific
    // casing rules; null filter entries and a null file type never match.
    private bool IsAcceptedFileType(string fileType)
    {
        if (_fileTypes == null)
            return true;

        if (fileType == null)
            return false;

        foreach (var accepted in _fileTypes)
        {
            if (String.Equals(accepted, fileType, StringComparison.OrdinalIgnoreCase))
                return true;
        }

        return false;
    }
}
/// <summary>
/// Text extraction strategy that also saves every image it encounters and inserts a
/// "[file name]" marker chunk into the extracted text at the image's position.
/// </summary>
public class SimpleMixedExtractionStrategy : LocationTextExtractionStrategy
{
    // The base class keeps its chunk list in a non-public field, so it is reached via
    // reflection. The lookup is identical for every instance, hence done once.
    static readonly FieldInfo field = typeof(LocationTextExtractionStrategy).GetField("locationalResult", BindingFlags.Instance | BindingFlags.NonPublic);

    // Unit segment that is transformed by the image matrix to obtain the marker position.
    LineSegment UNIT_LINE = new LineSegment(new Vector(0, 0, 1), new Vector(1, 0, 1));
    String outputPath;
    String name;
    int counter = 0;

    /// <summary>Creates a strategy writing images into a folder.</summary>
    /// <param name="outputPath">Folder the image files are written to.</param>
    /// <param name="name">Base name of the generated image files.</param>
    public SimpleMixedExtractionStrategy(String outputPath, String name)
    {
        this.outputPath = outputPath;
        this.name = name;
    }

    /// <summary>
    /// Saves the image as "name-counter.type" in the output folder and adds a text chunk
    /// containing the bracketed file name to the chunks collected by the base class.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The base class has no non-public instance field named "locationalResult".
    /// </exception>
    public override void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null) return;
        int number = counter++;
        String filename = name + "-" + number + "." + image.GetFileType();

        // Path.Combine inserts the directory separator when outputPath lacks a trailing
        // one; plain concatenation would fuse the folder name into the file name.
        String target = Path.Combine(outputPath ?? String.Empty, filename);
        File.WriteAllBytes(target, image.GetImageAsBytes());

        LineSegment segment = UNIT_LINE.TransformBy(renderInfo.GetImageCTM());
        TextChunk location = new TextChunk("[" + filename + "]", segment.GetStartPoint(), segment.GetEndPoint(), 0f);

        if (field == null)
            throw new InvalidOperationException("LocationTextExtractionStrategy has no 'locationalResult' field; cannot add the image marker.");

        List<TextChunk> locationalResult = (List<TextChunk>)field.GetValue(this);
        locationalResult.Add(location);
    }
}
// Driver: runs SimpleMixedExtractionStrategy over every page of SOURCE.pdf, writing the
// images below the "extract" folder and printing each page's text to the console.
string sourceFile = @"SOURCE.pdf";
string imagePath = @"extract\";
string imageBaseName = "SOURCE-";
Directory.CreateDirectory(imagePath);

using (var pdfReader = new PdfReader(sourceFile))
{
    var parser = new PdfReaderContentParser(pdfReader);
    int pageNumber = 1;
    while (pageNumber <= pdfReader.NumberOfPages)
    {
        // One strategy per page; the page number becomes part of the image base name.
        var pageListener = new SimpleMixedExtractionStrategy(imagePath, imageBaseName + pageNumber);
        parser.ProcessContent(pageNumber, pageListener);

        string pageText = pageListener.GetResultantText();
        Console.Write("Text of page {0}:\n---\n{1}\n\n", pageNumber, pageText);

        pageNumber++;
    }
}
/// <summary>
/// Caption-driven variant of the image extractor (outline): an image is extracted only
/// when a text chunk containing the caption was rendered before it.
/// </summary>
public class ImageExtractor : IRenderListener
{
    private string _caption;
    private bool _captionFound;
    private string _outputFolder;

    // ... remaining members are elided in this outline. The code below relies on a
    // _currentPage field, an _imageCount field and the five-argument constructor
    // (outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, index)
    // being declared here.

    public void BeginTextBlock() { }

    public void EndTextBlock() { }

    /// <summary>Flags the caption as found when the rendered text chunk contains it.</summary>
    public void RenderText(TextRenderInfo renderInfo) {
        // No caption configured: nothing to look for (also avoids Contains(null) throwing).
        if (String.IsNullOrEmpty(_caption))
            return;

        // If this chunk of text contains the caption, set _captionFound to true
        if (renderInfo.GetText().Contains(_caption))
            _captionFound = true;
    }

    /// <summary>Extracts the first image rendered after the caption was seen.</summary>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // Skip the image if _captionFound is false
        if (!_captionFound)
            return;

        // _captionFound is true, so extract the image

        // Code to extract image

        // Set _captionFound back to false, so that only the first image found is
        // extracted.
        _captionFound = false;

    }

    /// <summary>
    /// Walks all pages of the PDF and extracts the first image rendered after a text
    /// chunk containing <paramref name="caption"/>.
    /// </summary>
    /// <param name="caption">Caption text to search for; must not be null or empty.</param>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">File name prefix; when null, the PDF's file name without extension is used.</param>
    /// <param name="outputFolder">Target folder; when null or empty, the PDF's own directory is used.</param>
    /// <param name="overwriteExistingFiles">When false, existing files are not rewritten.</param>
    /// <param name="fileTypes">File types to accept; null accepts every type.</param>
    /// <returns>The listener's image counter after all pages were processed.</returns>
    /// <exception cref="ArgumentException"><paramref name="caption"/> is null or empty.</exception>
    /// <exception cref="ApplicationException">The PDF is encrypted.</exception>
    public static int ExtractImageByCaption(string caption, string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes = null)
    {
        // An empty caption is contained in every string and would match any text chunk.
        if (String.IsNullOrEmpty(caption))
            throw new ArgumentException("A non-empty caption is required.", "caption");

        // Same defaults as ExtractImageByIndex
        outputFilePrefix = outputFilePrefix ?? System.IO.Path.GetFileNameWithoutExtension(pdfPath);
        outputFolder = String.IsNullOrEmpty(outputFolder) ? System.IO.Path.GetDirectoryName(pdfPath) : outputFolder;

        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, 0);

        instance._caption = caption;
        instance._outputFolder = outputFolder;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.IsEncrypted())
                throw new ApplicationException(pdfPath + " is encrypted.");

            var pdfParser = new PdfReaderContentParser(pdfReader);

            while (instance._currentPage <= pdfReader.NumberOfPages)
            {
                pdfParser.ProcessContent(instance._currentPage, instance);

                instance._currentPage++;
            }
        }

        // Mirrors ExtractImageByIndex; assumes the elided extraction code maintains
        // _imageCount the same way — confirm when filling in the outline.
        return instance._imageCount;
    }
}