C# 使用itextsharp从pdf提取图像时system.drawing中出现异常_C#_Image_Pdf_Itextsharp

C# 使用itextsharp从pdf提取图像时system.drawing中出现异常

c# image pdf

C# 使用itextsharp从pdf提取图像时system.drawing中出现异常,c#,image,pdf,itextsharp,C#,Image,Pdf,Itextsharp,我正在尝试使用itextsharp从pdf文件中提取图像我正在使用的pdf示例我使用的代码是：- static void Main(string[] args) { try { WriteImageFile(); // write image file System.Console.WriteLine(AppDomain.CurrentDomain.BaseDirectory);

我正在尝试使用itextsharp从pdf文件中提取图像

我正在使用的pdf示例

我使用的代码是：-

/// <summary>
/// Console entry point: runs the image extraction, prints the application's base
/// directory and then waits for a line of input before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
    {
        try
        {
            // Extract the images and write them to disk.
            WriteImageFile();

            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            Console.WriteLine(baseDirectory);

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
        catch (Exception failure)
        {
            // Only the message is reported; the process then ends.
            Console.WriteLine(failure.Message);
        }
    }

    /// <summary>
    /// Walks every entry of the PDF's cross-reference table and tries to load each
    /// stream whose /Subtype is /Image as a <see cref="System.Drawing.Image"/>.
    /// </summary>
    /// <param name="PDFSourcePath">Path of the PDF file to read.</param>
    /// <returns>
    /// The images that could be loaded. Image streams that GDI+ cannot load are
    /// reported on the console and skipped.
    /// </returns>
    /// <remarks>
    /// The raw (still encoded) stream bytes are handed to
    /// <see cref="System.Drawing.Image.FromStream(System.IO.Stream)"/>, so a stream only loads
    /// when its encoded bytes are in a format GDI+ can decode. Other encodings
    /// (for example /JBIG2Decode) make FromStream throw and are skipped.
    /// </remarks>
    private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
    {
        List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();

        iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
        iTextSharp.text.pdf.PdfReader PDFReaderObj = null;

        try
        {
            RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
            PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);

            for (int i = 0; i < PDFReaderObj.XrefSize; i++)
            {
                // GetStreamBytesRaw needs a PRStream; anything else (null, non-stream
                // objects) is skipped instead of risking an InvalidCastException.
                iTextSharp.text.pdf.PRStream PDFStremObj =
                    PDFReaderObj.GetPdfObject(i) as iTextSharp.text.pdf.PRStream;
                if (PDFStremObj == null)
                {
                    continue;
                }

                // Constant-first Equals is null-safe when the stream has no /Subtype.
                iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
                if (!iTextSharp.text.pdf.PdfName.IMAGE.Equals(subtype))
                {
                    continue;
                }

                byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw(PDFStremObj);
                if (bytes == null)
                {
                    continue;
                }

                // On success the stream is deliberately NOT disposed: GDI+ requires the
                // source stream to stay open for the lifetime of the Image.
                System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes);
                try
                {
                    ImgList.Add(System.Drawing.Image.FromStream(MS));
                }
                catch (Exception e)
                {
                    MS.Dispose();

                    // Report the filter so an unsupported encoding is easy to identify.
                    iTextSharp.text.pdf.PdfObject filter = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.FILTER);
                    Console.WriteLine("Exception in extract (object " + i + ", filter "
                        + (filter == null ? "none" : filter.ToString()) + "): " + e);
                }
            }
        }
        catch
        {
            // The caller never receives the list on failure, so release what was loaded,
            // then rethrow preserving the original exception type and stack trace.
            foreach (System.Drawing.Image loaded in ImgList)
            {
                loaded.Dispose();
            }
            throw;
        }
        finally
        {
            // Always release the file, including when the scan fails part-way through.
            if (PDFReaderObj != null)
            {
                PDFReaderObj.Close();
            }
            else if (RAFObj != null)
            {
                // The reader was never constructed, so the file must be closed directly.
                RAFObj.Close();
            }
        }

        return ImgList;
    }


    /// <summary>
    /// Extracts the images from the hard-coded sample PDF and saves each one as
    /// "Image{index}.jpeg" next to it, reporting the outcome of every save on the console.
    /// </summary>
    /// <remarks>
    /// Saving is best-effort: a failure for one image is reported and the remaining
    /// images are still written. Exceptions from <c>ExtractImages</c> propagate unchanged.
    /// </remarks>
    private static void WriteImageFile()
    {
        const string pdfPath = @"C:\Users\pradyut.bhattacharya\Documents\CEVA PDF\more\CS_75.pdf";
        const string outputPrefix = @"C:\Users\pradyut.bhattacharya\Documents\CEVA PDF\more\Image";

        System.Console.WriteLine("Wait for extracting image from PDF file....");

        // Get a List of Image
        List<System.Drawing.Image> ListImage = ExtractImages(pdfPath);

        for (int i = 0; i < ListImage.Count; i++)
        {
            try
            {
                // Write Image File
                ListImage[i].Save(outputPrefix + i + ".jpeg", System.Drawing.Imaging.ImageFormat.Jpeg);
                System.Console.WriteLine("Image" + i + ".jpeg write sucessfully");
            }
            catch (Exception ex)
            {
                // Keep going with the other images, but do not hide the failure.
                System.Console.WriteLine("Image" + i + ".jpeg could not be written: " + ex.Message);
            }
            finally
            {
                // The image is not used again after this point; release its GDI+ handle.
                ListImage[i].Dispose();
            }
        }
    }

有人能帮忙吗？

谢谢

PDF中的图像可以以多种方式存储。您的代码适用于.NET Framework自带解码器的所有图像类型，但对于没有解码器的类型就会失败。具体来说，您的代码失败是因为该PDF中的图像编码为

JBIG2Decode

。您可以通过查看

PDFStremObj

/FILTER

属性来检查这一点

// Get() returns null when the stream has no /Filter entry, so call Equals on the
// constant: filterType.Equals(...) would throw a NullReferenceException in that case.
// NOTE(review): /Filter may also be an array of names; this only matches the single-name form.
PdfObject filterType = PDFStremObj.Get(PdfName.FILTER);
if(PdfName.JBIG2DECODE.Equals(filterType)){
    //...
}

对于框架不知道的类型，您需要一个库或编写自己的解码器

如果您想尝试自己实现解码器，可以参考有关JBIG2的资料（原文此处的链接已丢失）。其中还列出了一些可能同时支持解码的编码器，而解码正是您所需要的。

我知道，这是一个老问题，但实际上我已经找到了一个不错的解决方案。我也很难从带有JBig2编码的PDF中提取图像。iTextSharp的较新版本（4.1.6之后）实际上支持它，但这些版本现在处于AGPL许可之下

使用某个JBig2解码库（原文此处的链接已丢失；其版本2不是免费的），您可以将JBig2编码的图像转换为

System.Drawing.Bitmap

，并根据需要进行保存/修改。但是，该库仅对数据进行解码，无法将图像编码为JBig2格式

一个小小的注意事项是，这个库是用Java编写的。不过，这对于C#用户来说根本不是什么问题，这要归功于IKVM。如果您还不知道的话，IKVM提供了一个运行在.NET上的完整Java虚拟机，以及Java类库的原生.NET实现。它很容易安装，我在大约2小时前亲自测试过

从上面的链接下载IKVM和JBig2 jar后，可以执行此命令，让IKVM将jar转换为本机.NET dll

ikvmc -target:library [jbig2.jar的路径]

这将输出一个名为

jbig2.dll

的.NET dll到jar或ikvmc可执行文件所在的目录中（我不记得是哪个）。然后，在项目中引用

jbig2.dll

，

IKVM.OpenJDK.Core

，

IKVM.OpenJDK.Media

，

IKVM.OpenJDK.SwingAWT

和

IKVM.Runtime

。我使用了与以下类似的代码来提取图像：

// code to iterate over PDF objects and get bytes of a valid image elided
var imageBytes = GetRawImageBytesFromPdf();

// Constant-first Equals: filterType is null when the stream has no /Filter entry,
// and filterType.Equals(...) would then throw a NullReferenceException.
if (PdfName.JBIG2DECODE.Equals(filterType))
{
    var jbg2 = new JBIG2Decoder();

    // Some JBig2 will extract without setting the JBig2Globals
    var decodeParams = stream.GetAsDict(PdfName.DECODEPARMS);
    if (decodeParams != null)
    {
        var globalRef = decodeParams.GetAsIndirectObject(
                                        PdfName.JBIG2GLOBALS);
        if (globalRef != null)
        {
            // "as" yields null when the referenced object is not a stream; check it
            // before use, because GetStreamBytesRaw cannot take a null stream.
            var globalStream = PdfReader.GetPdfObject(globalRef) as PRStream;
            if (globalStream != null)
            {
                var globalBytes = PdfReader.GetStreamBytesRaw(globalStream);

                if (globalBytes != null)
                {
                    jbg2.setGlobalData(globalBytes);
                }
            }
        }
    }

    jbg2.decodeJBIG2(imageBytes);

    var pages = jbg2.getNumberOfPages();

    for (int p = 0; p < pages; p++)
    {
        java.awt.image.BufferedImage bufImg = jbg2.getPageAsBufferedImage(p);

        var bitmap = bufImg.getBitmap();
        // One file per page: a fixed name would be overwritten on every iteration,
        // leaving only the last page on disk.
        bitmap.Save(@"c:\path\to\file" + p + ".tif", ImageFormat.Tiff);
        // note: I am unsure about the need to free the memory of the internal
        //       bitmap used in the BufferedImage class.  The docs for IKVM and
        //       that class should probably be consulted to find out if that
        //       should be done.
    }
}
// handle other formats like CCITTFAXDECODE

// 迭代PDF对象并获取有效图像字节的代码（此处省略）
var imageBytes = GetRawImageBytesFromPdf();
if (filterType.Equals(PdfName.JBIG2DECODE))
{
var jbg2 = new JBIG2Decoder();
// 某些JBig2无需设置JBig2Globals即可提取
var decodeParams = stream.GetAsDict(PdfName.DECODEPARMS);
if (decodeParams != null)
{
var globalRef = decodeParams.GetAsIndirectObject(
PdfName.JBIG2GLOBALS);
if (globalRef != null)
{
var globals = PdfReader.GetPdfObject(globalRef);
var globalStream = globals as PRStream;
var globalBytes = PdfReader.GetStreamBytesRaw(globalStream);
if (globalBytes != null)
{
jbg2.setGlobalData(globalBytes);
}
}
}
jbg2.decodeJBIG2(imageBytes);
var pages = jbg2.getNumberOfPages();
for (int p = 0; p < pages; p++) // （原文在此处被截断，完整代码见上文）


虽然这个库不是最快的，但它做得很好（速度慢与它在IKVM中运行无关，开发人员自己也承认这个库的版本1效率不高）。我不喜欢编写/编辑Java代码，所以如果我想自己提高速度，我想我可能会直接将其移植到C#代码。不过，这段Java代码还有另一个分支，声称速度提高了2.5-4.5倍。您或许可以编译那个jar并用ikvmc进行转换
希望这能帮助那些仍在寻找解决方案的人
 谢谢分享这个想法
他的解决方案是我发现的使用免费版本的最优雅的
正如你所建议的，我引用了以下库：
jbig2dec.dll (generated from prompt > ikvmc jbig2dec.jar)
ICSharpCode.SharpZipLib
IKVM.Runtime
IKVM.OpenJDK.Core
IKVM.OpenJDK.Media
IKVM.OpenJDK.SwingAWT

每当我尝试调用“jbgI.getPageAsBufferedImage”时，我都会遇到一个异常：“java.awt.image.DataBuffer”的类型初始值设定项引发了异常。-->“sun.awt.image.SunWritableRaster”的类型初始值设定项引发了异常。-->“java.awt.image.WritableRaster”的类型初始值设定项引发了异常。-->“java.awt.image.Raster”的类型初始值设定项引发了异常。-->“java.awt.image.ColorModel”的类型初始值设定项引发了异常。-->java.lang.UnsatisfiedLinkError: java.library.path中没有awt。也许这些库不再工作了？

在 jbg2.decodeJBIG2(bytes); 这一行中，bytes 是什么？

@Ray 这些是PDF中图像的字节。我想应该是这样的：byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);（从用户的问题中复制）。
jbig2dec.dll (generated from prompt > ikvmc jbig2dec.jar)
ICSharpCode.SharpZipLib
IKVM.Runtime
IKVM.OpenJDK.Core
IKVM.OpenJDK.Media
IKVM.OpenJDK.SwingAWT