C# 这种方法要求:a) 图像标题必须在图像之前绘制,以及 b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF 格式允许以任意顺序绘制内容,文本行也可以逐段绘制。因此,这种方法可能适用于很多 PDF,但也有很多 PDF 并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像。
C# 这种方法要求:a) 图像标题必须在图像之前绘制,以及 b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF 格式允许以任意顺序绘制内容,文本行也可以逐段绘制。因此,这种方法可能适用于很多 PDF,但也有很多 PDF 并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像,c#,itext,C#,Itext,这种方法要求:a) 图像标题必须在图像之前绘制,以及 b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF 格式允许以任意顺序绘制内容,文本行也可以逐段绘制。因此,这种方法可能适用于很多 PDF,但也有很多 PDF 并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像。除此之外,我同意你的看法。我们处理的 PDF 都是标准格式的,所以这种方法对它们应该有效。我并不是说它适用于每一种 PDF。“图像标题不必紧挨在图像之前。”——好的,更准确地说:必须先绘制标题,再绘制图像;并且在标题和相关图像之间不得绘制其他图像。
这种方法要求:a) 图像标题必须在图像之前绘制,以及 b) 标题必须在单个指令中绘制。这两个条件都不是必然成立的:PDF 格式允许以任意顺序绘制内容,文本行也可以逐段绘制。因此,这种方法可能适用于很多 PDF,但也有很多 PDF 并不适用。图像标题不必紧挨在图像之前。代码提取的是找到标题之后遇到的第一个图像。除此之外,我同意你的看法。我们处理的 PDF 都是标准格式的,所以这种方法对它们应该有效。我并不是说它适用于每一种 PDF。“图像标题不必紧挨在图像之前。”——好的,更准确地说:必须先绘制标题,再绘制图像;并且在标题和相关图像之间不得绘制其他图像。(有些 PDF 生成器会在开头或结尾一次性绘制所有图像,因此即使这些条件也未必成立。)
/// <summary>
/// Render listener that writes the images drawn on a PDF page to disk: either every
/// image, or only the N-th image that passes an optional file-type filter.
/// </summary>
public class ImageExtractor : IRenderListener
{
    // Page currently being processed; also part of every output file name.
    int _currentPage = 1;
    // Number of images handled so far (incremented even when the file already existed).
    int _imageCount = 0;
    // 1-based position of the single image to extract; 0 means "extract all".
    readonly int _index;
    // Number of filter-matching images seen so far; only maintained when _index > 0.
    int _count = 0;
    readonly string _outputFilePrefix;
    readonly string _outputFolder;
    readonly bool _overwriteExistingFiles;
    // Allowed image file types (compared case-insensitively); null disables filtering.
    readonly string[] _fileTypes;

    public ImageExtractor(string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes, int index)
    {
        _outputFilePrefix = outputFilePrefix;
        _outputFolder = outputFolder;
        _overwriteExistingFiles = overwriteExistingFiles;
        _fileTypes = fileTypes;
        _index = index;
    }

    /// <summary>
    /// Extracts images from one page of a PDF file.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">Prefix of the generated file names; null falls back to the PDF file name without extension.</param>
    /// <param name="outputFolder">Target folder; null or empty falls back to the folder of <paramref name="pdfPath"/>.</param>
    /// <param name="overwriteExistingFiles">When false, files that already exist are left untouched.</param>
    /// <param name="pageNumber">1-based number of the page to process.</param>
    /// <param name="index">1-based position of the image to extract among the matching images; 0 extracts all of them.</param>
    /// <param name="fileTypes">Optional list of accepted image file types; null accepts every type.</param>
    /// <returns>The number of images handled; 0 when the document has no pages.</returns>
    /// <exception cref="ApplicationException">The PDF is encrypted.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="pageNumber"/> is not a page of the document.</exception>
    public static int ExtractImageByIndex(string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, int pageNumber, int index, string[] fileTypes = null)
    {
        // Handle setting of any default values
        outputFilePrefix = outputFilePrefix ?? System.IO.Path.GetFileNameWithoutExtension(pdfPath);
        outputFolder = String.IsNullOrEmpty(outputFolder) ? System.IO.Path.GetDirectoryName(pdfPath) : outputFolder;

        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, index);
        instance._currentPage = pageNumber;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.NumberOfPages == 0)
            {
                return 0;
            }
            if (pdfReader.IsEncrypted())
            {
                throw new ApplicationException(pdfPath + " is encrypted.");
            }
            // Fail with a meaningful error instead of letting the parser trip over a missing page.
            if (pageNumber < 1 || pageNumber > pdfReader.NumberOfPages)
            {
                throw new ArgumentOutOfRangeException("pageNumber", pageNumber, "Page number must be between 1 and " + pdfReader.NumberOfPages + ".");
            }

            var pdfParser = new PdfReaderContentParser(pdfReader);
            pdfParser.ProcessContent(instance._currentPage, instance);
        }

        return instance._imageCount;
    }

    public void BeginTextBlock() { }

    public void EndTextBlock() { }

    public void RenderText(TextRenderInfo renderInfo) { }

    /// <summary>
    /// Called by the content parser for every image drawn on the page; saves the image
    /// if it passes the file-type filter and (when an index was requested) is the N-th match.
    /// </summary>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // If _index is greater than 0, we're looking for a specific image. If _count is
        // equal to _index, we've already found it, so don't go any farther.
        if (_index > 0 && _count == _index)
        {
            return;
        }

        var imageObject = renderInfo.GetImage();
        if (imageObject == null)
        {
            // No image object available for this render event; nothing to extract.
            return;
        }

        var fileType = imageObject.GetFileType();
        if (!MatchesFileTypeFilter(fileType))
        {
            return;
        }

        // If _index is 0, multiple images may be extracted. If _index is greater than 0,
        // count every image that matches the file type and only extract the one whose
        // position equals _index.
        if (_index > 0)
        {
            _count++;
            if (_count != _index)
            {
                return;
            }
        }

        var imageFileName = String.Format("{0}_{1}_{2}.{3}", _outputFilePrefix, _currentPage, _imageCount, fileType);
        var imagePath = System.IO.Path.Combine(_outputFolder, imageFileName);
        if (_overwriteExistingFiles || !File.Exists(imagePath))
        {
            File.WriteAllBytes(imagePath, imageObject.GetImageAsBytes());
        }

        // Subtle: always increment even if the file is not written. This keeps the numbering
        // consistent when only some of a PDF file's images already exist on disk.
        _imageCount++;
    }

    // Returns true when no filter is configured or when fileType equals one of the configured types.
    private bool MatchesFileTypeFilter(string fileType)
    {
        if (_fileTypes == null)
        {
            return true;
        }
        if (fileType == null)
        {
            return false;
        }
        // Ordinal comparison: file types are identifiers, so culture-specific casing rules
        // (e.g. the Turkish dotless i) must not influence the match.
        return Array.Exists(_fileTypes, t => string.Equals(t, fileType, StringComparison.OrdinalIgnoreCase));
    }
}
/// <summary>
/// Text extraction strategy that additionally saves every image it encounters and inserts
/// a "[file name]" marker chunk into the extracted text at the image's position.
/// </summary>
public class SimpleMixedExtractionStrategy : LocationTextExtractionStrategy
{
    // Reflection handle for the base class' private chunk list. Looked up once for all
    // instances because a new strategy is typically created per page.
    static readonly FieldInfo LocationalResultField = typeof(LocationTextExtractionStrategy).GetField("locationalResult", BindingFlags.Instance | BindingFlags.NonPublic);
    // Unit segment from (0,0) to (1,0); transformed by the image matrix to locate the image.
    static readonly LineSegment UnitLine = new LineSegment(new Vector(0, 0, 1), new Vector(1, 0, 1));

    readonly String outputPath;
    readonly String name;
    // Running number of the images saved by this instance; part of the file name.
    int counter = 0;

    /// <param name="outputPath">Folder the image files are written to.</param>
    /// <param name="name">Base name of the image files; "-&lt;number&gt;.&lt;type&gt;" is appended.</param>
    public SimpleMixedExtractionStrategy(String outputPath, String name)
    {
        this.outputPath = outputPath;
        this.name = name;
    }

    /// <summary>
    /// Saves the rendered image to disk and registers a text chunk "[file name]" located
    /// where the image is drawn, so the marker shows up in the resulting text.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The base class has no private field named "locationalResult".
    /// </exception>
    public override void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null)
        {
            return;
        }

        // Check before writing anything so a failure leaves no stray files behind.
        if (LocationalResultField == null)
        {
            throw new InvalidOperationException("LocationTextExtractionStrategy has no private field 'locationalResult'.");
        }

        int number = counter++;
        String filename = name + "-" + number + "." + image.GetFileType();
        // Combine instead of concatenating so the folder works with or without a trailing separator.
        File.WriteAllBytes(System.IO.Path.Combine(outputPath, filename), image.GetImageAsBytes());

        LineSegment segment = UnitLine.TransformBy(renderInfo.GetImageCTM());
        TextChunk location = new TextChunk("[" + filename + "]", segment.GetStartPoint(), segment.GetEndPoint(), 0f);

        List<TextChunk> locationalResult = (List<TextChunk>)LocationalResultField.GetValue(this);
        locationalResult.Add(location);
    }
}
// Demo driver: runs SimpleMixedExtractionStrategy over every page of SOURCE.pdf,
// saving the images below "extract\" and printing each page's text to the console.
string pdfFile = @"SOURCE.pdf";
string imageFolder = @"extract\";
string imageNamePrefix = "SOURCE-";

Directory.CreateDirectory(imageFolder);

using (var reader = new PdfReader(pdfFile))
{
    var contentParser = new PdfReaderContentParser(reader);
    int pageNumber = 1;
    while (pageNumber <= reader.NumberOfPages)
    {
        // A fresh strategy per page, so each page gets its own text and image numbering.
        var strategy = new SimpleMixedExtractionStrategy(imageFolder, imageNamePrefix + pageNumber);
        contentParser.ProcessContent(pageNumber, strategy);
        Console.Write("Text of page {0}:\n---\n{1}\n\n", pageNumber, strategy.GetResultantText());
        pageNumber++;
    }
}
/// <summary>
/// Sketch of an ImageExtractor variant that extracts the first image drawn after a
/// given caption text. Only the caption-related members are shown.
/// </summary>
public class ImageExtractor : IRenderListener
{
    // Caption text to look for in the rendered text.
    private string _caption;
    // True once the caption has been seen and no image has been extracted for it yet.
    private bool _captionFound;
    private string _outputFolder;
    // ... remaining members are elided in this sketch. The code below relies on a
    // five-argument constructor and on the _currentPage and _imageCount fields;
    // presumably they match the index-based ImageExtractor - TODO confirm.

    public void BeginTextBlock() { }

    public void EndTextBlock() { }

    public void RenderText(TextRenderInfo renderInfo)
    {
        // If this piece of text contains the caption, remember that for the next image.
        if (renderInfo.GetText().Contains(_caption))
        {
            _captionFound = true;
        }
    }

    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // Skip the image if the caption has not been seen yet.
        if (!_captionFound)
        {
            return;
        }

        // _captionFound is true, so extract the image
        // Code to extract image

        // Reset the flag so that only the first image after the caption is extracted.
        _captionFound = false;
    }

    /// <summary>
    /// Walks all pages of a PDF and extracts the first image drawn after each occurrence
    /// of <paramref name="caption"/>.
    /// </summary>
    /// <param name="caption">Caption text to search for; must not be null.</param>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">Prefix of the generated file names; null falls back to the PDF file name without extension.</param>
    /// <param name="outputFolder">Target folder; null or empty falls back to the folder of <paramref name="pdfPath"/>.</param>
    /// <param name="overwriteExistingFiles">Passed through to the extractor's constructor.</param>
    /// <param name="fileTypes">Optional list of accepted image file types, passed through to the constructor.</param>
    /// <returns>The value of the extractor's _imageCount after all pages were processed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="caption"/> is null.</exception>
    /// <exception cref="ApplicationException">The PDF is encrypted.</exception>
    public static int ExtractImageByCaption(string caption, string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes = null)
    {
        if (caption == null)
        {
            throw new ArgumentNullException("caption");
        }

        // Same defaulting rules as ExtractImageByIndex.
        outputFilePrefix = outputFilePrefix ?? System.IO.Path.GetFileNameWithoutExtension(pdfPath);
        outputFolder = String.IsNullOrEmpty(outputFolder) ? System.IO.Path.GetDirectoryName(pdfPath) : outputFolder;

        // Index 0: no specific image position is requested.
        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, 0);
        instance._caption = caption;
        instance._outputFolder = outputFolder;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.IsEncrypted())
            {
                throw new ApplicationException(pdfPath + " is encrypted.");
            }

            var pdfParser = new PdfReaderContentParser(pdfReader);
            while (instance._currentPage <= pdfReader.NumberOfPages)
            {
                pdfParser.ProcessContent(instance._currentPage, instance);
                instance._currentPage++;
            }
        }

        // NOTE(review): assumes the elided extraction code increments _imageCount - confirm.
        return instance._imageCount;
    }
}