C# 比较两个不同长度的Word文档_C#_Ms Word_Diff_Openxml_Openxml Sdk

C# 比较两个不同长度的Word文档

c# ms-word

C# 比较两个不同长度的Word文档,c#,ms-word,diff,openxml,openxml-sdk,C#,Ms Word,Diff,Openxml,Openxml Sdk,我正在比较两个word文档。我的目标是得到一个列表，其中包含两个文件中的每一行，以及它们是否匹配。我让那部分工作。我的问题是，如果一个文件比另一个文件大，则较大文件中的文本不会添加到上述列表中。有些情况下，一份文件可能比另一份文件大。这是一个文件修订系统。当前版本可能比上一版本包含更多文本，反之亦然到目前为止，我有这个代码。我修改了找到的示例下面是我一直在使用的两个示例文件（word文档） Test1.docx： Test This is a test document. It was c

我正在比较两个word文档。我的目标是得到一个列表，其中包含两个文件中的每一行，以及它们是否匹配。我让那部分工作。我的问题是，如果一个文件比另一个文件大，则较大文件中的文本不会添加到上述列表中。有些情况下，一份文件可能比另一份文件大。这是一个文件修订系统。当前版本可能比上一版本包含更多文本，反之亦然

到目前为止，我有这个代码。我修改了找到的示例

下面是我一直在使用的两个示例文件（word文档）
Test1.docx：

Test

This is a test document. It was created May 31.
The contents of this document are:
Unknown

Test2.docx：

Test

This is a test document. It was created Apr 1.
The contents of this document are:
Test Item 1
Test Item 2

这是我的Program.cs文件，这是我编辑的地方。在比较文档方法中：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;

namespace DocxDiff
{
    public class Program
    {
        private static List<DocumentCompare> _differences = new List<DocumentCompare>();

        public static string GetParagraphText(XElement p)
        {
            return p.Descendants(W.r)
                .Where(e => e.Parent.Name != W.del && e.Parent.Name != W.moveFrom)
                .Descendants(W.t)
                .Select(t => (string) t)
                .StringConcatenate();
        }

        public static List<DocumentCompare> CompareDocuments(WordprocessingDocument doc1, WordprocessingDocument doc2)
        {
            XDocument xDoc1 = doc1.MainDocumentPart.GetXDocument();
            XDocument xDoc2 = doc2.MainDocumentPart.GetXDocument();

            var doc1Elements = xDoc1
                .Descendants()
                .Where(e => e.Name != W.commentRangeStart
                            && e.Name != W.commentRangeEnd
                            && e.Name != W.proofErr
                            && !e.Ancestors(W.p).Any());
            var doc2Elements = xDoc2
                .Descendants()
                .Where(e => e.Name != W.commentRangeStart
                            && e.Name != W.commentRangeEnd
                            && e.Name != W.proofErr
                            && !e.Ancestors(W.p).Any());

            List<DocumentCompare> differences = new List<DocumentCompare>();

            IEnumerable<bool> correspondingElementEquivalency = doc1Elements.Zip(doc2Elements, (e1, e2) =>
            {
                // if the lines are different, set to true
                bool difference = false;
                if (e1.Name != e2.Name)
                {
                    return false;
                }

                if (e1.Name == W.p && e2.Name == W.p)
                {
                    if ((GetParagraphText(e1) != GetParagraphText(e2)))
                    {
                        // there is a difference between the documents
                        difference = true;
                    }

                    // record lines
                    differences.Add(new DocumentCompare() { Document1Text = e1.Value, Document2Text = e2.Value, Difference = difference });
                }

                // this is from the code in the link above
                // this method does not return a bool, it returns the list of differences
                return true;
            });

            // determine if the documents are equivalent
            // this has to be here to run the code above
            bool test = correspondingElementEquivalency.Any(e => e != true);

            return differences;
        }

        public static void Main(string[] args)
        {
            var doc1Path = @"C:\Diff\Test1.docx";
            var doc2Path = @"C:\Diff\Test2.docx";

            using(WordprocessingDocument doc1 = WordprocessingDocument.Open(doc1Path, false))
            using(WordprocessingDocument doc2 = WordprocessingDocument.Open(doc2Path, false))
            {
                _differences = CompareDocuments(doc1, doc2);

                foreach (var t in _differences)
                {
                    Console.WriteLine("Difference: {0}\nDoc 1: {1}\nDoc 2: {2}", t.Difference, t.Document1Text, t.Document2Text);
                }
            }

            Console.Read();
        }
    }
}

这是我的Extensions.cs文件（来自教程，未修改）：

编辑：我非常确定我需要修改Zip方法，以便在较大文件中添加行，并为另一个文件添加空字符串。我尝试（未成功）将其修改为工作状态（从）：

好的，这就是我想到的。（这可能不是最有效的，但我相信这是一个很好的解决方案）。以下是代码（解释如下）：

公共静态列表比较文档（WordProcessingDocumentDoc1、WordProcessingDocumentDoc2）
{
XDocument xDoc1=doc1.MainDocumentPart.GetXDocument（）；
XDocument xDoc2=doc2.MainDocumentPart.GetXDocument（）；
//这些查询返回word文档中包含文本的元素
var doc1元素=xDoc1
.后代（）
.其中（e=>e.Name！=W.commentRangeStart
&&e.名称！=W.commentRangeEnd
&&e.名称！=W.证明错误
&&！e.p.Any（））；
var doc2Elements=xDoc2
.后代（）
.其中（e=>e.Name！=W.commentRangeStart
&&e.名称！=W.commentRangeEnd
&&e.名称！=W.证明错误
&&！e.p.Any（））；
列表差异=新列表（）；
IEnumerable correspondingElementequality=doc1Elements.Zip（doc2Elements，（e1，e2）=>
{
布尔差=假；
if（e1.Name！=e2.Name）
{
返回false；
}
如果（e1.Name==W.p&&e2.Name==W.p）
{
//e1.名称==W.p&&
如果（（GetParagraphText（e1）！=GetParagraphText（e2）））
{
//文件之间存在差异
差异=真；
}
//记录行
添加（新的DocumentCompare（）{Document1Text=e1.Value，Document2Text=e2.Value，Difference=Difference}）；
}
返回true；
});
//确定文件是否等效
布尔测试=对应的元素等效性。任意（e=>e！=真）；
var doc1Values=（来自doc1Elements中的ie1，其中ie1.Name==W.p选择ie1.Value）；
var doc2Values=（来自doc2Elements中的ie2，其中ie2.Name==W.p选择ie2.Value）；
//确定较大的文档以将该文档的其余部分添加到列表中
var largerDoc=doc1Values.Count（）>doc2Values.Count（）&&doc1Values.Count（）！=doc2Values.Count（）？doc1Values:doc2Values；
var smallerDocCount=doc1Values.Count（）doc2Values.Count（）&&doc1Values.Count（）！=doc2Values.Count（）？true:false；
var doc1Arr=doc1Values.ToArray（）；
var doc2Arr=doc2Values.ToArray（）；
//为较大的文档添加剩余文本
对于（var i=smallerDocCount；i


好的，IEnumerable结束到最小文档结束的地方。然后，我得到较大文档的其余部分（从较小文档的末尾开始），并将其附加到列表的末尾。我仍然需要收紧一些代码，但它是有效的
public class DocumentCompare
{
    public string Document1Text { get; set; }
    public string Document2Text { get; set; }
    public bool Difference { get; set; }
}

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;

namespace DocxDiff
{
    public static class Extensions
    {
        public static XDocument GetXDocument(this OpenXmlPart part)
        {
            XDocument xdoc = part.Annotation<XDocument>();

            if (xdoc != null)
                return xdoc;

            using (StreamReader streamReader = new StreamReader(part.GetStream()))
                xdoc = XDocument.Load(XmlReader.Create(streamReader));

            part.AddAnnotation(xdoc);
            return xdoc;
        }

        public static string StringConcatenate(this IEnumerable<string> source)
        {
            StringBuilder sb = new StringBuilder();
            foreach (var s in source)
                sb.Append(s);
            return sb.ToString();
        }

        public static IEnumerable<TResult> Zip<TFirst, TSecond, TResult>(this IEnumerable<TFirst> first, IEnumerable<TSecond> second, Func<TFirst, TSecond, TResult> func)
        {
            var ie1 = first.GetEnumerator();
            var ie2 = second.GetEnumerator();

            while (ie1.MoveNext() && ie2.MoveNext())
            {
                yield return func(ie1.Current, ie2.Current);
            }
        }
    }
}

public static class W
{
    public static XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    public static XName p = w + "p";
    public static XName r = w + "r";
    public static XName t = w + "t";
    public static XName commentRangeStart = w + "commentRangeStart";
    public static XName commentRangeEnd = w + "commentRangeEnd";
    public static XName proofErr = w + "proofErr";
    public static XName del = w + "del";
    public static XName moveFrom = w + "moveFrom";
}

static void Main() {
    var a = new List<int> { 1, 2, 3 };
    var b = new List<int> { 1, 2, 3, 4, 5 };
    foreach (var c in a.Merge(b, (x, y) => x + y)) {
        Console.WriteLine(c);
    }
}
static IEnumerable<T> Merge<T>(this IEnumerable<T> first,
        IEnumerable<T> second, Func<T, T, T> operation) {
    using (var iter1 = first.GetEnumerator())
    using (var iter2 = second.GetEnumerator()) {
        while (iter1.MoveNext()) {
            if (iter2.MoveNext()) {
                yield return operation(iter1.Current, iter2.Current);
            } else {
                yield return iter1.Current;
            }
        }
        while (iter2.MoveNext()) {
            yield return iter2.Current;
        }
    }
}

// get document sizes
        var largerDoc = doc1Elements.Count() > doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? doc1Elements : doc2Elements;
        var smallerDocCount = doc1Elements.Count() < doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? doc1Elements.Count() : doc2Elements.Count();
        var doc1Larger = doc1Elements.Count() > doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? true : false;
        var doc1Arr = doc1Elements.ToArray();
        var doc2Arr = doc2Elements.ToArray();

        // add in the remaining text for the larger document
        for (var i = smallerDocCount; i < largerDoc.Count(); i++)
        {
            // if doc1 is larger, add doc 1 and null for doc 2
            if (doc1Larger)
            {
                Console.WriteLine("doc1 Text: {0}", doc1Arr[i].Value);
                differences.Add(new DocumentComparison() { Document1Text = doc1Arr[i].Value, Document2Text = "", Difference = true });
            }
            else if(!doc1Larger) {
                Console.WriteLine("doc2 Text: {0}", doc2Arr[i].Value);
                differences.Add(new DocumentComparison() { Document1Text = "", Document2Text = doc2Arr[i].Value, Difference = true });
            }
        }

public static List<DocumentCompare> CompareDocuments(WordprocessingDocument doc1, WordprocessingDocument doc2)
    {
        XDocument xDoc1 = doc1.MainDocumentPart.GetXDocument();
        XDocument xDoc2 = doc2.MainDocumentPart.GetXDocument();

        // these queries return the elements that contain text in the word documents
        var doc1Elements = xDoc1
            .Descendants()
            .Where(e => e.Name != W.commentRangeStart
                        && e.Name != W.commentRangeEnd
                        && e.Name != W.proofErr
                        && !e.Ancestors(W.p).Any());
        var doc2Elements = xDoc2
            .Descendants()
            .Where(e => e.Name != W.commentRangeStart
                        && e.Name != W.commentRangeEnd
                        && e.Name != W.proofErr
                        && !e.Ancestors(W.p).Any());

        List<DocumentCompare> differences = new List<DocumentCompare>();

        IEnumerable<bool> correspondingElementEquivalency = doc1Elements.Zip(doc2Elements, (e1, e2) =>
        {
            bool difference = false;
            if (e1.Name != e2.Name)
            {
                return false;
            }

            if (e1.Name == W.p && e2.Name == W.p)
            {
                // e1.Name == W.p && 
                if ((GetParagraphText(e1) != GetParagraphText(e2)))
                {
                    // there is a difference between the documents
                    difference = true;
                }

                // record lines
                differences.Add(new DocumentCompare() { Document1Text = e1.Value, Document2Text = e2.Value, Difference = difference });
            }
            return true;
        });

        // determine if the documents are equivalent
        bool test = correspondingElementEquivalency.Any(e => e != true);

        var doc1Values = (from ie1 in doc1Elements where ie1.Name == W.p select ie1.Value).ToList();
        var doc2Values = (from ie2 in doc2Elements where ie2.Name == W.p select ie2.Value).ToList();

        // determine the larger document to add the remainder of that document to the list
        var largerDoc = doc1Values.Count() > doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? doc1Values : doc2Values;
        var smallerDocCount = doc1Values.Count() < doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? doc1Values.Count() : doc2Values.Count();
        var doc1Larger = doc1Values.Count() > doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? true : false;
        var doc1Arr = doc1Values.ToArray();
        var doc2Arr = doc2Values.ToArray();

        // add in the remaining text for the larger document
        for (var i = smallerDocCount; i < largerDoc.Count(); i++)
        {
            // if doc1 is larger, add doc 1 and null for doc 2
            if (doc1Larger)
            {
                Console.WriteLine("doc1 Text: {0}", doc1Arr[i]);
                differences.Add(new DocumentCompare() { Document1Text = doc1Arr[i], Document2Text = "", Difference = true });
            }
            else if(!doc1Larger) {
                Console.WriteLine("doc2 Text: {0}", doc2Arr[i]);
                differences.Add(new DocumentCompare() { Document1Text = "", Document2Text = doc2Arr[i], Difference = true });
            }
        }

        return differences;
    }