无法识别通过Flate解码从PDF中提取的PNG-C#
我参与开发的C#软件中有一个组件,用于从扫描的文档中读取条形码。PDF本身是使用PDFSharp打开的。不幸的是,当处理采用Flate解码(FlateDecode)的PDF时,我们遇到了一个问题。基本上,我们得到的只是一片模糊的噪点,这意味着没有条形码可供检查,文档也无法被识别。我们的代码(我们厚着脸皮从另一个Stack Overflow问题中“借用”来的!)如下:无法识别通过Flate解码从PDF中提取的PNG-C#,c#,pdf,png,barcode,decode,C#,Pdf,Png,Barcode,Decode,我参与开发的C#软件中有一个组件,用于从扫描的文档中读取条形码。PDF本身是使用PDFSharp打开的。不幸的是,当处理采用Flate解码(FlateDecode)的PDF时,我们遇到了一个问题。基本上,我们得到的只是一片模糊的噪点,这意味着没有条形码可供检查,文档也无法被识别。我们的代码(我们厚着脸皮从另一个Stack Overflow问题中“借用”来的!)如下: private FileInfo ExportAsPngImage(PdfDictionary image, string sourceFileName, ref int count) { //Th
/// <summary>
/// Inflates the stream of a PDF image dictionary, copies the raw samples into a
/// GDI+ bitmap and saves that bitmap as a PNG file.
/// </summary>
/// <param name="image">PDF image dictionary; Width, Height and BitsPerComponent are read from it.</param>
/// <param name="sourceFileName">Prefix used to build the output file name.</param>
/// <param name="count">Image counter embedded in the file name; incremented once per call.</param>
/// <returns>A <see cref="FileInfo"/> for the "{sourceFileName}_Image{count}.png" file that was written.</returns>
/// <exception cref="NotSupportedException">BitsPerComponent is not 1, 8 or 24.</exception>
private FileInfo ExportAsPngImage(PdfDictionary image, string sourceFileName, ref int count)
{
    //This code basically comes from http://forum.pdfsharp.net/viewtopic.php?f=2&t=2338#p6755
    //and http://stackoverflow.com/questions/10024908/how-to-extract-flatedecoded-images-from-pdf-with-pdfsharp
    string tempFile = string.Format("{0}_Image{1}.png", sourceFileName, count);
    int width = image.Elements.GetInteger(PdfImage.Keys.Width);
    int height = image.Elements.GetInteger(PdfImage.Keys.Height);
    int bitsPerComponent = image.Elements.GetInteger(PdfImage.Keys.BitsPerComponent);

    // NOTE(review): the pixel format is chosen from BitsPerComponent alone; the image's
    // ColorSpace is not consulted and no palette is assigned to the indexed formats.
    // Confirm this matches the documents being processed.
    PixelFormat pixelFormat;
    switch (bitsPerComponent)
    {
        case 1:
            pixelFormat = System.Drawing.Imaging.PixelFormat.Format1bppIndexed;
            break;
        case 8:
            pixelFormat = System.Drawing.Imaging.PixelFormat.Format8bppIndexed;
            break;
        case 24:
            pixelFormat = System.Drawing.Imaging.PixelFormat.Format24bppRgb;
            break;
        default:
            throw new NotSupportedException("Unknown pixel format " + bitsPerComponent);
    }

    // NOTE(review): this assumes the stream is Flate-compressed raw samples. If the stream
    // carries a further encoding (e.g. JPEG data wrapped in Flate) the result will be noise.
    var fd = new FlateDecode();
    byte[] decodedBytes = fd.Decode(image.Stream.Value);

    // PDF rows are packed on byte boundaries, GDI+ rows on DWORD boundaries.
    // Copying one row at a time into bitmapData.Stride handles that difference,
    // so the bitmap keeps the image's true width and no padded copy is needed.
    int rowLength = (int)Math.Ceiling(width * bitsPerComponent / 8.0);

    using (var bitmap = new Bitmap(width, height, pixelFormat))
    {
        BitmapData bitmapData = bitmap.LockBits(new Rectangle(0, 0, bitmap.Width, bitmap.Height), ImageLockMode.WriteOnly, bitmap.PixelFormat);
        try
        {
            for (int row = 0; row < height; row++)
            {
                int sourceOffset = row * rowLength;
                int available = decodedBytes.Length - sourceOffset;
                if (available <= 0)
                {
                    // Decoded data is shorter than Width x Height implies; stop instead of reading past the end.
                    break;
                }

                // IntPtr.Add keeps the pointer arithmetic valid in 64-bit processes.
                IntPtr destination = IntPtr.Add(bitmapData.Scan0, row * bitmapData.Stride);
                Marshal.Copy(decodedBytes, sourceOffset, destination, Math.Min(rowLength, available));
            }
        }
        finally
        {
            bitmap.UnlockBits(bitmapData);
        }

        // The file name already contains the current counter value; advance it for the next image.
        count++;

        //Now save the bitmap to disk
        using (var fs = new FileStream(tempFile, FileMode.Create, FileAccess.Write))
        {
            bitmap.Save(fs, ImageFormat.Png);
        }
    }

    return new FileInfo(tempFile);
}
private FileInfo ExportAsPngImage(PdfDictionary图像,字符串sourceFileName,ref int count)
{
//此代码基本上来自http://forum.pdfsharp.net/viewtopic.php?f=2&t=2338#p6755
//及http://stackoverflow.com/questions/10024908/how-to-extract-flatedecoded-images-from-pdf-with-pdfsharp
string tempFile=string.Format(“{0}\u Image{1}.png”,sourceFileName,count);
int width=image.Elements.GetInteger(PdfImage.Keys.width);
int height=image.Elements.GetInteger(PdfImage.Keys.height);
int bitsPerComponent=image.Elements.GetInteger(PdfImage.Keys.bitsPerComponent);
var pixelFormat=新的pixelFormat();
交换机(比特组件)
{
案例1:
pixelFormat=System.Drawing.Imaging.pixelFormat.Format1BPindexed;
打破
案例8:
pixelFormat=System.Drawing.Imaging.pixelFormat.Format8Bppined;
打破
案例24:
pixelFormat=System.Drawing.Imaging.pixelFormat.Format24bppRgb;
打破
违约:
抛出新异常(“未知像素格式”+bitsPerComponent);
}
var fd=新的FlateCode();
字节[]decodedBytes=fd.Decode(image.Stream.Value);
字节[]结果字节=null;
int newWidth=宽度;
int对齐=4;
如果(新宽度%对齐!=0)
//BMP文件中的图像数据总是从DWORD边界开始,而在PDF中则从字节边界开始。
//大多数图像的宽度是4的倍数,因此它们没有问题。
//必须逐行复制图像数据,并从DWORD边界开始每行。
{
while(newWidth%对齐!=0)
{
newWidth++;
}
var copy_dword_boundary=新字节[高度,新宽度];
对于(int y=0;y 如果(x
感谢大家的建议。我们的另一位开发人员最终解决了这个问题——正如Jongware所指出的,它其实是一个JPEG,只不过又被压缩了一层!解压之后,就可以正常处理和识别了。
我不清楚PDFSharp图像的bitsPerComponent具体取什么值,但PDF的bitsPerComponent只能是1、2、4、8(或16,但这不是你的情况)。你应该同时检查bitsPerComponent和ColorSpace,以覆盖所有可能的组合。
哈哈。目前还不清楚你的代码具体在哪一步失败,但解压你的“PNG”图像后发现,IDAT的内容实际上是原始JPEG字节。不幸的是,我原本希望这样能得到一个可显示的图像(结果并没有),所以显然还有别的问题。你确定输入图像确实是PNG吗?顺便说一下,“PNG”部分声称它是1656 x 2340像素,而这个(格式错误的)JPEG告诉我它是1654 x 2340像素。这些数字也许能帮你找到原因,也许不能。
有机会看到解决方案吗?谢谢