Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/csharp/323.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
在C#中使用IFilter并从数据库而不是文件系统中检索文件_C#_Ifilter - Fatal编程技术网

在C#中使用IFilter并从数据库而不是文件系统中检索文件

在C#中使用IFilter并从数据库而不是文件系统中检索文件,c#,ifilter,C#,Ifilter,对于C#web应用程序,我希望为数据库中存储的PDF、DOC等文件中的文本编制索引 我一直在试验,它对文件系统中的文件非常有效,但我的文件存储在MS-SQL数据库中 有人能帮我找到一个示例,从存储在数据库中的文件中提取文本,或者知道如何修改代码项目代码以使用数据库而不是文件系统吗?我过去曾致力于提供iFilter,用于提供对AutoCad dwg文件中文本内容的任何搜索/索引工具访问。你可以在这里阅读我的一些冒险经历: 您引用的代码很旧,但仍然有效。但是,现在除了GetTextFromFile之

对于C#web应用程序,我希望为数据库中存储的PDF、DOC等文件中的文本编制索引

我一直在试验,它对文件系统中的文件非常有效,但我的文件存储在MS-SQL数据库中


有人能帮我找到一个示例,从存储在数据库中的文件中提取文本,或者知道如何修改代码项目代码以使用数据库而不是文件系统吗?

我过去曾致力于提供iFilter,用于提供对AutoCad dwg文件中文本内容的任何搜索/索引工具访问。你可以在这里阅读我的一些冒险经历:

您引用的代码很旧,但仍然有效。但是,现在除了GetTextFromFile之外,还有更多的接口在使用。您将需要使用流阅读器,在我上面提到的链接中的IPersistStream中读取。如果我了解您想要做什么,您需要从数据库中以流的形式打开文件,并将此流呈现给您选择的搜索/索引器或iFilter

祝你好运,
Marco

我本希望做同样的事情,但最后我在数据库表中为TextContent添加了另一列。我将BinaryContent保存到一个临时文件中,使用CodeProject库Epocalisde.IFilter dll查找文本,并将其添加到TextContent列。

经过几个小时后,我终于想出了如何实现这一点!我需要对存储在数据库中的PDF内容运行IFilter,我希望避免将数据保存到临时文件中

首先,我尝试使用API为流中存储的内容创建IFilter,但它似乎不能正常工作(至少在这种情况下不能)。所以不要走那条路

相反,您需要为文件扩展名创建IFilter(或以其他方式访问它)。然后您可以访问COM接口并使用它将PDF内容加载到IFilter中。其余的工作原理与文件相同。但是,请注意,并非每个IFilter都可以实现IPersistStream API。但它适用于Adobe PDF IFilter

代码应该是这样的(我删除了一些返回代码检查以使代码更可读,但是,您应该检查所有可能的返回代码)

在我的代码中,从过滤器中提取的文本如下所示。在web上有很多这样的例子,可以根据您的需要以多种方式实现

private string ExtractTextFromIFilter(IFilter filter)
{
   var sb = new StringBuilder();

   while (true)
   {
      StatChunk chunk;
      result = filter.GetChunk(out chunk);

      if (result == FilterReturnCodes.S_OK)
      {
         if (chunk.flags == ChunkState.CHUNK_TEXT)
         {
            sb.Append(ExtractTextFromChunk(filter, chunk));
         }

         continue;
      }

      if (result == FilterReturnCodes.FILTER_E_END_OF_CHUNKS)
      {
         return sb.ToString();
      }

      Marshal.ThrowExceptionForHR((int)result);
   }
}

private virtual string ExtractTextFromChunk(IFilter filter, StatChunk chunk)
{
   var sb = new StringBuilder();

   var result = FilterReturnCodes.S_OK;
   while (result == FilterReturnCodes.S_OK)
   {
      int sizeBuffer = 16384;
      var buffer = new StringBuilder(sizeBuffer);
      result = filter.GetText(ref sizeBuffer, buffer);

      if ((result == FilterReturnCodes.S_OK) || (result == FilterReturnCodes.FILTER_S_LAST_TEXT))
      {
         if((sizeBuffer > 0) && (buffer.Length > 0))
         {
            sb.Append(buffer.ToString(0, sizeBuffer));
         }
      }

      if (result == FilterReturnCodes.FILTER_E_NO_TEXT)
      {
         return string.Empty;
      }

      if ((result == FilterReturnCodes.FILTER_S_LAST_TEXT) || (result == FilterReturnCodes.FILTER_E_NO_MORE_TEXT))
      {
         return sb.ToString();
      }
   }

   return sb.ToString();
}
下面是本机方法的定义及其使用的结构

internal static class NativeMethods
{
    [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
    public static extern FilterReturnCodes LoadIFilter(
        string pwcsPath,
        [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
        ref IFilter ppIUnk);

    [DllImport("ole32.dll")]
    public static extern int CreateStreamOnHGlobal(IntPtr hGlobal, bool fDeleteOnRelease, out IStream ppstm);
}

[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    [PreserveSig]
    FilterReturnCodes Init(FilterInit grfFlags, int cAttributes, IntPtr aAttributes, out FilterFlags pdwFlags);

    [PreserveSig]
    FilterReturnCodes GetChunk(out StatChunk pStat);

    [PreserveSig]
    FilterReturnCodes GetText(
        ref int pcwcBuffer,
        [Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);

    [PreserveSig]
    FilterReturnCodes GetValue(ref IntPtr propVal);

    [PreserveSig]
    FilterReturnCodes BindRegion(ref FilterRegion origPos, ref Guid riid, ref object ppunk);
}

[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("0000010c-0000-0000-C000-000000000046")]
public interface IPersist
{
    void GetClassID(out Guid pClassID);
}

[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000109-0000-0000-C000-000000000046")]
public interface IPersistStream : IPersist
{
    new void GetClassID(out Guid pClassID);

    [PreserveSig]
    int IsDirty();

    void Load([In] IStream pStm);

    void Save(
        [In] IStream pStm,
        [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

    void GetSizeMax(out long pcbSize);
}

public struct StatChunk
{
    public int idChunk;
    [MarshalAs(UnmanagedType.U4)]
    public ChunkBreaktype breakType;
    [MarshalAs(UnmanagedType.U4)]
    public ChunkState flags;
    public int locale;
    public FullPropSpec attribute;
    public int idChunkSource;
    public int cwcStartSource;
    public int cwcLenSource;
}

public enum ChunkBreaktype
{
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}

public enum ChunkState
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

[Flags]
public enum FilterFlags
{
    IFILTER_FLAGS_OLE_PROPERTIES = 1
}

[Flags]
public enum FilterInit
{
    IFILTER_INIT_CANON_PARAGRAPHS = 1,
    IFILTER_INIT_HARD_LINE_BREAKS = 2,
    IFILTER_INIT_CANON_HYPHENS = 4,
    IFILTER_INIT_CANON_SPACES = 8,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
    IFILTER_INIT_INDEXING_ONLY = 64,
    IFILTER_INIT_SEARCH_LINKS = 128,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
}

public struct FilterRegion
{
    public int idChunk;
    public int cwcStart;
    public int cwcExtent;
}

public enum FilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    E_FAIL = 0x80000008,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}

public struct FullPropSpec
{
    public Guid guidPropSet;
    public PropSpec psProperty;
}

[StructLayout(LayoutKind.Explicit)]
public struct PropSpec
{
    [FieldOffset(0)]
    public int ulKind;     

    [FieldOffset(4)]
    public int propid;

    [FieldOffset(4)]
    public IntPtr lpwstr;
}

在Mareks示例的基础上,我的示例使用接口的实现,而不是通过
Marshal.AllocHGlobal
分配内存来创建COM流

它适用于和大量格式,如
.doc
.docx
.html
.odt
.rtf
,列表如下

完整示例:

using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;

namespace TextExtraction
{
    class Program
    {
        static void Main(string[] args)
        {
            var file = new FileInfo(@"C:\Path\To\Some.doc");

            using (var stream = file.OpenRead())
            {
                var filter = Load(stream, file.Extension);
                if (filter != null)
                {
                    var text = GetText(filter);
                    Console.WriteLine(text);
                }
            }

            Console.WriteLine("Press your favorite key to exit");
            Console.ReadKey();
        }

        private static IFilter Load(Stream stream, string extension)
        {
            IFilter filter = null;

            if (NativeMethods.LoadIFilter(extension, null, ref filter) == HRESULT.S_OK)
            {
                if (filter is IPersistStream persistStream)
                {
                    persistStream.Load(new ManagedStream(stream));

                    if (filter.Init(IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, 0, IntPtr.Zero, out IFILTER_FLAGS filterFlags) == IFilterReturnCodes.S_OK)
                    {
                        return filter;
                    }
                }
            }

            return null;
        }

        private static string GetText(IFilter filter)
        {
            var text = new StringBuilder();

            while (filter.GetChunk(out var chunk) == IFilterReturnCodes.S_OK)
            {
                ReadChunk(filter, chunk, text);
            }

            return text.ToString();
        }

        private static void ReadChunk(IFilter filter, STAT_CHUNK chunk, StringBuilder text)
        {
            var textResult = IFilterReturnCodes.S_OK;
            while (textResult == IFilterReturnCodes.S_OK)
            {
                var bufferSize = 4096U;
                var buffer = new char[bufferSize];
                textResult = filter.GetText(ref bufferSize, buffer);

                if ((textResult == IFilterReturnCodes.S_OK || textResult == IFilterReturnCodes.FILTER_S_LAST_TEXT) && bufferSize > 0)
                {
                    if (chunk.breakType == CHUNK_BREAKTYPE.CHUNK_EOP)
                    {
                        text.Append('\n');
                    }

                    text.Append(buffer, 0, (int) bufferSize);
                }
            }
        }

        [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IFilter
        {
            [PreserveSig]
            IFilterReturnCodes Init(IFILTER_INIT grfFlags, int cAttributes, IntPtr aAttributes,
                out IFILTER_FLAGS pdwFlags);

            [PreserveSig]
            IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);

            [PreserveSig]
            IFilterReturnCodes GetText(ref uint pcwcBuffer, [Out, MarshalAs(UnmanagedType.LPArray)]
                char[] awcBuffer);

            [PreserveSig]
            IFilterReturnCodes GetValue(ref IntPtr propVal);

            [PreserveSig]
            IFilterReturnCodes BindRegion(ref FILTERREGION origPos, ref Guid riid, ref object ppunk);
        }

        [Guid("0000010C-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersist
        {
            void GetClassID(out Guid pClassID);
        }

        [Guid("00000109-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersistStream : IPersist
        {
            new void GetClassID(out Guid pClassID);

            [PreserveSig]
            int IsDirty();

            void Load([In] IStream pStm);

            void Save([In] IStream pStm, [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

            void GetSizeMax(out long pcbSize);
        }

        [Guid("0000000C-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IStream
        {
            [PreserveSig]
            HRESULT Read([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] [Out]
                byte[] pv, int cb, IntPtr pcbRead);

            [PreserveSig]
            HRESULT Write([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)]
                byte[] pv, int cb, IntPtr pcbWritten);

            [PreserveSig]
            HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition);

            [PreserveSig]
            HRESULT SetSize(long libNewSize);

            HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten);

            [PreserveSig]
            HRESULT Commit(int grfCommitFlags);

            [PreserveSig]
            HRESULT Revert();

            [PreserveSig]
            HRESULT LockRegion(long libOffset, long cb, int dwLockType);

            [PreserveSig]
            HRESULT UnlockRegion(long libOffset, long cb, int dwLockType);

            [PreserveSig]
            HRESULT Stat(out STATSTG pstatstg, int grfStatFlag);

            [PreserveSig]
            HRESULT Clone(out IStream ppstm);
        }

        public class ManagedStream : IStream
        {
            private readonly Stream _stream;

            public ManagedStream(Stream stream)
            {
                _stream = stream ?? throw new ArgumentNullException(nameof(stream));
            }

            public HRESULT Clone(out IStream ppstm)
            {
                ppstm = null;
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Commit(int grfCommitFlags)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT LockRegion(long libOffset, long cb, int dwLockType)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Read(byte[] pv, int cb, IntPtr pcbRead)
            {
                var bytesRead = _stream.Read(pv, 0, cb);
                if (pcbRead != IntPtr.Zero)
                {
                    Marshal.WriteInt32(pcbRead, bytesRead);
                }

                return HRESULT.S_OK;
            }

            public HRESULT Revert()
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
            {
                SeekOrigin seekOrigin;

                switch (dwOrigin)
                {
                    case (int) STREAM_SEEK.STREAM_SEEK_SET:
                        seekOrigin = SeekOrigin.Begin;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_CUR:
                        seekOrigin = SeekOrigin.Current;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_END:
                        seekOrigin = SeekOrigin.End;
                        break;
                    default:
                        return HRESULT.E_FAIL;
                }

                var position = _stream.Seek(dlibMove, seekOrigin);

                if (plibNewPosition != IntPtr.Zero)
                {
                    Marshal.WriteInt64(plibNewPosition, position);
                }

                return HRESULT.S_OK;
            }

            public HRESULT SetSize(long libNewSize)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Stat(out STATSTG pstatstg, int grfStatFlag)
            {
                pstatstg = new STATSTG
                {
                    type = (int) STGTY.STGTY_STREAM,
                    cbSize = _stream.Length,
                    grfMode = (int) STGM.STGM_READ
                };

                if (_stream.CanRead && _stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READWRITE;
                }
                else if (_stream.CanRead)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READ;
                }
                else if (_stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_WRITE;
                }
                else
                {
                    return HRESULT.E_ACCESSDENIED;
                }

                return HRESULT.S_OK;
            }

            public HRESULT UnlockRegion(long libOffset, long cb, int dwLockType)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Write(byte[] pv, int cb, IntPtr pcbWritten)
            {
                return HRESULT.E_NOTIMPL;
            }
        }

        public class NativeMethods
        {
            [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
            public static extern HRESULT LoadIFilter(string pwcsPath, [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref IFilter ppIUnk);
        }

        public struct FILETIME
        {
            public uint DateTimeLow;
            public uint DateTimeHigh;
        }

        [StructLayout(LayoutKind.Sequential)]
        public struct FILTERREGION
        {
            public ulong idChunk;
            public ulong cwcStart;
            public ulong cwcExtent;
        }

        [StructLayout(LayoutKind.Sequential)]
        public struct FULLPROPSPEC
        {
            public Guid guidPropSet;
            public PROPSPEC psProperty;
        }

        [StructLayout(LayoutKind.Explicit)]
        public struct PROPSPEC
        {
            [FieldOffset(0)]
            public PROPSPECKIND ulKind;

            [FieldOffset(4)]
            public uint propid;

            [FieldOffset(4)]
            public IntPtr lpwstr;
        }

        public struct STAT_CHUNK
        {
            public int idChunk;

            [MarshalAs(UnmanagedType.U4)]
            public CHUNK_BREAKTYPE breakType;

            [MarshalAs(UnmanagedType.U4)]
            public CHUNKSTATE flags;

            public int locale;

            public FULLPROPSPEC attribute;

            public int idChunkSource;

            public int cwcStartSource;

            public int cwcLenSource;
        }

        public struct STATSTG
        {
            [MarshalAs(UnmanagedType.LPTStr)]
            public string pwcsName;
            public int type;
            public long cbSize;
            public FILETIME mtime;
            public FILETIME ctime;
            public FILETIME atime;
            public int grfMode;
            public int grfLocksSupported;
            public Guid clsid;
            public int grfStateBits;
            public int reserved;
        }

        [Flags]
        public enum IFilterReturnCodes : uint
        {
            S_OK = 0,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_INVALIDARG = 0x80070057,
            E_OUTOFMEMORY = 0x8007000E,
            E_NOTIMPL = 0x80004001,
            E_FAIL = 0x80000008,

            FILTER_E_PASSWORD = 0x8004170B,
            FILTER_E_UNKNOWNFORMAT = 0x8004170C,
            FILTER_E_NO_TEXT = 0x80041705,
            FILTER_E_NO_VALUES = 0x80041706,
            FILTER_E_END_OF_CHUNKS = 0x80041700,
            FILTER_E_NO_MORE_TEXT = 0x80041701,
            FILTER_E_NO_MORE_VALUES = 0x80041702,
            FILTER_E_ACCESS = 0x80041703,
            FILTER_W_MONIKER_CLIPPED = 0x00041704,
            FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
            FILTER_E_LINK_UNAVAILABLE = 0x80041708,
            FILTER_S_LAST_TEXT = 0x00041709,
            FILTER_S_LAST_VALUES = 0x0004170A
        }

        [Flags]
        public enum CHUNK_BREAKTYPE : uint
        {
            CHUNK_NO_BREAK = 0,
            CHUNK_EOW = 1,
            CHUNK_EOS = 2,
            CHUNK_EOP = 3,
            CHUNK_EOC = 4
        }

        [Flags]
        public enum CHUNKSTATE : uint
        {
            CHUNK_TEXT = 0x1,
            CHUNK_VALUE = 0x2,
            CHUNK_FILTER_OWNED_VALUE = 0x4
        }

        [Flags]
        public enum HRESULT : uint
        {
            S_OK = 0x00000000,
            E_NOTIMPL = 0x80004001,
            E_NOINTERFACE = 0x80004002,
            E_POINTER = 0x80004003,
            E_ABORT = 0x80004004,
            E_FAIL = 0x80004005,
            E_UNEXPECTED = 0x8000FFFF,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_OUTOFMEMORY = 0x8007000E,
            E_INVALIDARG = 0x80070057
        }

        [Flags]
        public enum IFILTER_FLAGS
        {
            IFILTER_FLAGS_OLE_PROPERTIES = 1
        }

        [Flags]
        public enum IFILTER_INIT
        {
            IFILTER_INIT_CANON_PARAGRAPHS = 1,
            IFILTER_INIT_HARD_LINE_BREAKS = 2,
            IFILTER_INIT_CANON_HYPHENS = 4,
            IFILTER_INIT_CANON_SPACES = 8,
            IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
            IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
            IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
            IFILTER_INIT_INDEXING_ONLY = 64,
            IFILTER_INIT_SEARCH_LINKS = 128,
            IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512,
            IFILTER_INIT_FILTER_AGGRESSIVE_BREAK = 1024,
            IFILTER_INIT_DISABLED_EMBEDDED = 2048,
            IFILTER_INIT_EMIT_FORMATTING = 4096
        }

        [Flags]
        public enum PROPSPECKIND : ulong
        {
            PRSPEC_LPWSTR = 0,
            PRSPEC_PROPID = 1
        }

        [Flags]
        public enum STGM : ulong
        {
            STGM_READ = 0x00000000L,
            STGM_WRITE = 0x00000001L,
            STGM_READWRITE = 0x00000002L,
            STGM_SHARE_DENY_NONE = 0x00000040L,
            STGM_SHARE_DENY_READ = 0x00000030L,
            STGM_SHARE_DENY_WRITE = 0x00000020L,
            STGM_SHARE_EXCLUSIVE = 0x00000010L,
            STGM_PRIORITY = 0x00040000L,
            STGM_CREATE = 0x00001000L,
            STGM_CONVERT = 0x00020000L,
            STGM_FAILIFTHERE = 0x00000000L,
            STGM_DIRECT = 0x00000000L,
            STGM_TRANSACTED = 0x00010000L,
            STGM_NOSCRATCH = 0x00100000L,
            STGM_NOSNAPSHOT = 0x00200000L,
            STGM_SIMPLE = 0x08000000L,
            STGM_DIRECT_SWMR = 0x00400000L,
            STGM_DELETEONRELEASE = 0x04000000L
        }

        [Flags]
        public enum STGTY : int
        {
            STGTY_STORAGE = 1,
            STGTY_STREAM = 2,
            STGTY_LOCKBYTES = 3,
            STGTY_PROPERTY = 4
        }

        [Flags]
        public enum STREAM_SEEK : int
        {
            STREAM_SEEK_SET = 0,
            STREAM_SEEK_CUR = 1,
            STREAM_SEEK_END = 2
        }
    }
}

这似乎是正确的途径,但我还没有找到任何工作的例子,也没有得到任何工作自己。我还读到AdobePDF IFilter只适用于文件而不适用于流-不确定这是否属实-但我想知道这是否真的可能。codeplex上不是有2005年的示例代码吗?无论如何,缺少活样本对我的项目来说是一个很大的挫折。查看流接口的Microsoft定义,然后从那里开始。有一个缓冲区内存和一个标志,表示您需要重复调用以获得另一个缓冲区填充。对于PDF ifilter,您可能需要寻找其他供应商,因为之前adobe并不是第一个提供PDF ifilter的供应商,例如64位版本。试试Foxit网站:哇,这就是我所说的详细答案!
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;

namespace TextExtraction
{
    class Program
    {
        static void Main(string[] args)
        {
            var file = new FileInfo(@"C:\Path\To\Some.doc");

            using (var stream = file.OpenRead())
            {
                var filter = Load(stream, file.Extension);
                if (filter != null)
                {
                    var text = GetText(filter);
                    Console.WriteLine(text);
                }
            }

            Console.WriteLine("Press your favorite key to exit");
            Console.ReadKey();
        }

        private static IFilter Load(Stream stream, string extension)
        {
            IFilter filter = null;

            if (NativeMethods.LoadIFilter(extension, null, ref filter) == HRESULT.S_OK)
            {
                if (filter is IPersistStream persistStream)
                {
                    persistStream.Load(new ManagedStream(stream));

                    if (filter.Init(IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, 0, IntPtr.Zero, out IFILTER_FLAGS filterFlags) == IFilterReturnCodes.S_OK)
                    {
                        return filter;
                    }
                }
            }

            return null;
        }

        private static string GetText(IFilter filter)
        {
            var text = new StringBuilder();

            while (filter.GetChunk(out var chunk) == IFilterReturnCodes.S_OK)
            {
                ReadChunk(filter, chunk, text);
            }

            return text.ToString();
        }

        private static void ReadChunk(IFilter filter, STAT_CHUNK chunk, StringBuilder text)
        {
            var textResult = IFilterReturnCodes.S_OK;
            while (textResult == IFilterReturnCodes.S_OK)
            {
                var bufferSize = 4096U;
                var buffer = new char[bufferSize];
                textResult = filter.GetText(ref bufferSize, buffer);

                if ((textResult == IFilterReturnCodes.S_OK || textResult == IFilterReturnCodes.FILTER_S_LAST_TEXT) && bufferSize > 0)
                {
                    if (chunk.breakType == CHUNK_BREAKTYPE.CHUNK_EOP)
                    {
                        text.Append('\n');
                    }

                    text.Append(buffer, 0, (int) bufferSize);
                }
            }
        }

        [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IFilter
        {
            [PreserveSig]
            IFilterReturnCodes Init(IFILTER_INIT grfFlags, int cAttributes, IntPtr aAttributes,
                out IFILTER_FLAGS pdwFlags);

            [PreserveSig]
            IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);

            [PreserveSig]
            IFilterReturnCodes GetText(ref uint pcwcBuffer, [Out, MarshalAs(UnmanagedType.LPArray)]
                char[] awcBuffer);

            [PreserveSig]
            IFilterReturnCodes GetValue(ref IntPtr propVal);

            [PreserveSig]
            IFilterReturnCodes BindRegion(ref FILTERREGION origPos, ref Guid riid, ref object ppunk);
        }

        [Guid("0000010C-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersist
        {
            void GetClassID(out Guid pClassID);
        }

        [Guid("00000109-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersistStream : IPersist
        {
            new void GetClassID(out Guid pClassID);

            [PreserveSig]
            int IsDirty();

            void Load([In] IStream pStm);

            void Save([In] IStream pStm, [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

            void GetSizeMax(out long pcbSize);
        }

        [Guid("0000000C-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IStream
        {
            [PreserveSig]
            HRESULT Read([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] [Out]
                byte[] pv, int cb, IntPtr pcbRead);

            [PreserveSig]
            HRESULT Write([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)]
                byte[] pv, int cb, IntPtr pcbWritten);

            [PreserveSig]
            HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition);

            [PreserveSig]
            HRESULT SetSize(long libNewSize);

            HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten);

            [PreserveSig]
            HRESULT Commit(int grfCommitFlags);

            [PreserveSig]
            HRESULT Revert();

            [PreserveSig]
            HRESULT LockRegion(long libOffset, long cb, int dwLockType);

            [PreserveSig]
            HRESULT UnlockRegion(long libOffset, long cb, int dwLockType);

            [PreserveSig]
            HRESULT Stat(out STATSTG pstatstg, int grfStatFlag);

            [PreserveSig]
            HRESULT Clone(out IStream ppstm);
        }

        public class ManagedStream : IStream
        {
            private readonly Stream _stream;

            public ManagedStream(Stream stream)
            {
                _stream = stream ?? throw new ArgumentNullException(nameof(stream));
            }

            public HRESULT Clone(out IStream ppstm)
            {
                ppstm = null;
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Commit(int grfCommitFlags)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT LockRegion(long libOffset, long cb, int dwLockType)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Read(byte[] pv, int cb, IntPtr pcbRead)
            {
                var bytesRead = _stream.Read(pv, 0, cb);
                if (pcbRead != IntPtr.Zero)
                {
                    Marshal.WriteInt32(pcbRead, bytesRead);
                }

                return HRESULT.S_OK;
            }

            public HRESULT Revert()
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
            {
                SeekOrigin seekOrigin;

                switch (dwOrigin)
                {
                    case (int) STREAM_SEEK.STREAM_SEEK_SET:
                        seekOrigin = SeekOrigin.Begin;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_CUR:
                        seekOrigin = SeekOrigin.Current;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_END:
                        seekOrigin = SeekOrigin.End;
                        break;
                    default:
                        return HRESULT.E_FAIL;
                }

                var position = _stream.Seek(dlibMove, seekOrigin);

                if (plibNewPosition != IntPtr.Zero)
                {
                    Marshal.WriteInt64(plibNewPosition, position);
                }

                return HRESULT.S_OK;
            }

            public HRESULT SetSize(long libNewSize)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Stat(out STATSTG pstatstg, int grfStatFlag)
            {
                pstatstg = new STATSTG
                {
                    type = (int) STGTY.STGTY_STREAM,
                    cbSize = _stream.Length,
                    grfMode = (int) STGM.STGM_READ
                };

                if (_stream.CanRead && _stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READWRITE;
                }
                else if (_stream.CanRead)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READ;
                }
                else if (_stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_WRITE;
                }
                else
                {
                    return HRESULT.E_ACCESSDENIED;
                }

                return HRESULT.S_OK;
            }

            public HRESULT UnlockRegion(long libOffset, long cb, int dwLockType)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Write(byte[] pv, int cb, IntPtr pcbWritten)
            {
                return HRESULT.E_NOTIMPL;
            }
        }

        public class NativeMethods
        {
            [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
            public static extern HRESULT LoadIFilter(string pwcsPath, [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref IFilter ppIUnk);
        }

        public struct FILETIME
        {
            public uint DateTimeLow;
            public uint DateTimeHigh;
        }

        [StructLayout(LayoutKind.Sequential)]
        public struct FILTERREGION
        {
            public ulong idChunk;
            public ulong cwcStart;
            public ulong cwcExtent;
        }

        [StructLayout(LayoutKind.Sequential)]
        public struct FULLPROPSPEC
        {
            public Guid guidPropSet;
            public PROPSPEC psProperty;
        }

        [StructLayout(LayoutKind.Explicit)]
        public struct PROPSPEC
        {
            [FieldOffset(0)]
            public PROPSPECKIND ulKind;

            [FieldOffset(4)]
            public uint propid;

            [FieldOffset(4)]
            public IntPtr lpwstr;
        }

        public struct STAT_CHUNK
        {
            public int idChunk;

            [MarshalAs(UnmanagedType.U4)]
            public CHUNK_BREAKTYPE breakType;

            [MarshalAs(UnmanagedType.U4)]
            public CHUNKSTATE flags;

            public int locale;

            public FULLPROPSPEC attribute;

            public int idChunkSource;

            public int cwcStartSource;

            public int cwcLenSource;
        }

        public struct STATSTG
        {
            [MarshalAs(UnmanagedType.LPTStr)]
            public string pwcsName;
            public int type;
            public long cbSize;
            public FILETIME mtime;
            public FILETIME ctime;
            public FILETIME atime;
            public int grfMode;
            public int grfLocksSupported;
            public Guid clsid;
            public int grfStateBits;
            public int reserved;
        }

        [Flags]
        public enum IFilterReturnCodes : uint
        {
            S_OK = 0,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_INVALIDARG = 0x80070057,
            E_OUTOFMEMORY = 0x8007000E,
            E_NOTIMPL = 0x80004001,
            E_FAIL = 0x80000008,

            FILTER_E_PASSWORD = 0x8004170B,
            FILTER_E_UNKNOWNFORMAT = 0x8004170C,
            FILTER_E_NO_TEXT = 0x80041705,
            FILTER_E_NO_VALUES = 0x80041706,
            FILTER_E_END_OF_CHUNKS = 0x80041700,
            FILTER_E_NO_MORE_TEXT = 0x80041701,
            FILTER_E_NO_MORE_VALUES = 0x80041702,
            FILTER_E_ACCESS = 0x80041703,
            FILTER_W_MONIKER_CLIPPED = 0x00041704,
            FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
            FILTER_E_LINK_UNAVAILABLE = 0x80041708,
            FILTER_S_LAST_TEXT = 0x00041709,
            FILTER_S_LAST_VALUES = 0x0004170A
        }

        [Flags]
        public enum CHUNK_BREAKTYPE : uint
        {
            CHUNK_NO_BREAK = 0,
            CHUNK_EOW = 1,
            CHUNK_EOS = 2,
            CHUNK_EOP = 3,
            CHUNK_EOC = 4
        }

        [Flags]
        public enum CHUNKSTATE : uint
        {
            CHUNK_TEXT = 0x1,
            CHUNK_VALUE = 0x2,
            CHUNK_FILTER_OWNED_VALUE = 0x4
        }

        [Flags]
        public enum HRESULT : uint
        {
            S_OK = 0x00000000,
            E_NOTIMPL = 0x80004001,
            E_NOINTERFACE = 0x80004002,
            E_POINTER = 0x80004003,
            E_ABORT = 0x80004004,
            E_FAIL = 0x80004005,
            E_UNEXPECTED = 0x8000FFFF,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_OUTOFMEMORY = 0x8007000E,
            E_INVALIDARG = 0x80070057
        }

        [Flags]
        public enum IFILTER_FLAGS
        {
            IFILTER_FLAGS_OLE_PROPERTIES = 1
        }

        [Flags]
        public enum IFILTER_INIT
        {
            IFILTER_INIT_CANON_PARAGRAPHS = 1,
            IFILTER_INIT_HARD_LINE_BREAKS = 2,
            IFILTER_INIT_CANON_HYPHENS = 4,
            IFILTER_INIT_CANON_SPACES = 8,
            IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
            IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
            IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
            IFILTER_INIT_INDEXING_ONLY = 64,
            IFILTER_INIT_SEARCH_LINKS = 128,
            IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512,
            IFILTER_INIT_FILTER_AGGRESSIVE_BREAK = 1024,
            IFILTER_INIT_DISABLED_EMBEDDED = 2048,
            IFILTER_INIT_EMIT_FORMATTING = 4096
        }

        [Flags]
        public enum PROPSPECKIND : ulong
        {
            PRSPEC_LPWSTR = 0,
            PRSPEC_PROPID = 1
        }

        [Flags]
        public enum STGM : ulong
        {
            STGM_READ = 0x00000000L,
            STGM_WRITE = 0x00000001L,
            STGM_READWRITE = 0x00000002L,
            STGM_SHARE_DENY_NONE = 0x00000040L,
            STGM_SHARE_DENY_READ = 0x00000030L,
            STGM_SHARE_DENY_WRITE = 0x00000020L,
            STGM_SHARE_EXCLUSIVE = 0x00000010L,
            STGM_PRIORITY = 0x00040000L,
            STGM_CREATE = 0x00001000L,
            STGM_CONVERT = 0x00020000L,
            STGM_FAILIFTHERE = 0x00000000L,
            STGM_DIRECT = 0x00000000L,
            STGM_TRANSACTED = 0x00010000L,
            STGM_NOSCRATCH = 0x00100000L,
            STGM_NOSNAPSHOT = 0x00200000L,
            STGM_SIMPLE = 0x08000000L,
            STGM_DIRECT_SWMR = 0x00400000L,
            STGM_DELETEONRELEASE = 0x04000000L
        }

        [Flags]
        public enum STGTY : int
        {
            STGTY_STORAGE = 1,
            STGTY_STREAM = 2,
            STGTY_LOCKBYTES = 3,
            STGTY_PROPERTY = 4
        }

        [Flags]
        public enum STREAM_SEEK : int
        {
            STREAM_SEEK_SET = 0,
            STREAM_SEEK_CUR = 1,
            STREAM_SEEK_END = 2
        }
    }
}