C# 按顺序搜索匹配单词的字符串列表_C#_Arrays_Regex_List

C# 按顺序搜索匹配单词的字符串列表
c# arrays regex list
C# 按顺序搜索匹配单词的字符串列表,c#,arrays,regex,list,C#,Arrays,Regex,List,我有一个来自总是变化的外部源的字符串的列表我想搜索每个字符串，在所有字符串之间按顺序找到匹配的单词然后从每个字符串中删除这组单词，只留下书名例子《指环王》是一本经典之作。《战争与和平》是一本经典之作。名为《三个火枪手》的书是一部经典之作名为的书将被删除。是经典。将被删除。名为序列的书不会被删除，因为战争与和平不是以序列开始的序列必须出现在所有字符串之间才能删除指环王战争与和平三个火枪手这是一个示例列表。我想在字符串上使用它，而不是书名例如：我去了家得宝。我去
我有一个来自总是变化的外部源的
字符串的列表

我想搜索每个字符串，在所有字符串之间按顺序找到匹配的单词
然后从每个字符串中删除这组单词，只留下书名

例子
名为《指环王》的书是一部经典之作。

名为《战争与和平》的书是一部经典之作。

名为《三个火枪手》的书是一部经典之作。
前缀序列“名为……的书”（The book named）将被删除。

后缀序列“是一部经典之作。”（is a classic.）将被删除。

序列“The”不会被删除，因为“War and Peace”（战争与和平）不是以“The”开头的。
一个序列必须出现在所有字符串中，才会被删除。
指环王

战争与和平

三个火枪手

这是一个示例列表。我想在字符串上使用它，而不是书名
例如：
我去了家得宝。

我去了沃尔格林。

我去了百思买
前缀“我去了”（I went to）被删除。
篮球队洛杉矶湖人队是我的最爱。

篮球队纽约尼克斯队是我的最爱。

篮球队芝加哥公牛队是我的最爱。
前缀“篮球队”（The basketball team）被删除。

后缀“是我的最爱。”（are my favorite.）被删除。

解决方案
我的想法是：从字符串的开头开始逐词比较，把匹配的单词归为一组，直到遇到第一个不匹配的单词，这样就得到了公共前缀。
然后从字符串的末尾向前执行相同的操作，得到公共后缀。
这样，夹在前缀和后缀中间的部分就是标题。
但我不知道该如何实现。
C#
// Sample input: every sentence wraps a book title in the same leading and
// trailing words.
List<string> sentences = new List<string>()
{
    "The book named The Lord of the Rings is a classic.",
    "The book named War and Peace is a classic.",
    "The book named The Three Musketeers is a classic.",
};

// Receives one extracted title per sentence, in input order.
List<string> titles = new List<string>();

foreach (string sentence in sentences)
{
    // Add Titles to their own List
    titles.Add(FindTitle(sentence));
}


// Placeholder for the title extraction: the argument is not used yet and the
// result is always the empty string.
String FindTitle(string sentence)
{
    // Planned steps, none of which are implemented yet:
    //   1. compare all strings in the list
    //   2. collect the word sequence they share at the start (The book named)
    //   3. collect the word sequence they share at the end (is a classic.)
    //   4. strip both sequences from each string in the list
    return string.Empty;
}

List语句=新列表（）
{ 
《指环王》是一部经典之作，
《战争与和平》是一本经典之作，
《三剑客》是一部经典之作，
};
列表标题=新列表（）
for（int i=0；i<句子数；i++）
{
//将标题添加到自己的列表中
//
标题。添加（FindTitle（第[i]句））；
}
字符串FindTitle（字符串语句）
{
string title=string.Empty；
//比较列表中的所有字符串
//组通用字串前缀（书名为）
//组通用单词序列后缀（是一个经典。）
//从列表中的每个字符串中删除这些单词序列
返回标题；
}
以下是我的方法。我选择了注重性能的实现方式——我认为它仍有优化的空间。
编辑：使用 Regex.Escape 来处理包含特殊字符的情况。
我用 Stopwatch 对我的方案和 Rufus L 的方案进行了计时对比。

使用 Rufus L 的测试句子作为输入：
/// <summary>
/// Builds the sample data: four groups of sentences in which each group
/// repeats the same wording around a differing middle part.
/// </summary>
/// <returns>A newly created list holding the four sentence groups.</returns>
private static List<List<string>> GetTestSentences()
{
    // Titles wrapped in "The book named ... is a classic."
    var books = new List<string>
    {
        "The book named The Lord of the Rings is a classic.",
        "The book named War and Peace is a classic.",
        "The book named The Three Musketeers is a classic.",
    };

    // Store names after "I went to".
    var stores = new List<string>
    {
        "I went to The Home Depot.",
        "I went to Walgreens.",
        "I went to Best Buy.",
    };

    // Team names wrapped in "The basketball team ... are my favorite."
    var teams = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite.",
        "The basketball team New York Knicks are my favorite.",
        "The basketball team Chicago Bulls are my favorite.",
    };

    // Same book pattern, but the trailing text contains parentheses.
    var booksWithParentheses = new List<string>
    {
        "The book named Lord of the Flies is a classic (500 This is a test)",
        "The book named Wuthering Heights is a classic (500 This is a test)",
        "The book named Great Expectations is a classic (500 This is a test)",
        "The book named The Lord of the Rings is a classic (500 This is a test)",
        "The book named War and Peace is a classic (500 This is a test)",
    };

    var groups = new List<List<string>>();
    groups.Add(books);
    groups.Add(stores);
    groups.Add(teams);
    groups.Add(booksWithParentheses);
    return groups;
}

下面是神奇的方法：
/// <summary>
/// Finds the run of whole words that two strings share at their start or at
/// their end, comparing character by character.
/// </summary>
/// <param name="sample1">First string to compare.</param>
/// <param name="sample2">Second string to compare.</param>
/// <param name="forwardDirection">
/// <c>true</c> to compare from the first character forwards; <c>false</c> to
/// compare from the last character backwards.
/// </param>
/// <returns>
/// Forwards: the shared leading words, each followed by a space. Backwards:
/// the shared trailing words, each preceded by a space. A word is only
/// included once the space that terminates it has matched in both strings;
/// matching characters of an unterminated word are dropped. Returns the empty
/// string when no complete word is shared.
/// </returns>
private static string FindMatchingPattern(string sample1, string sample2, bool forwardDirection)
{
    // Only the overlap can match, so never scan past the shorter sample.
    var limit = sample1.Length <= sample2.Length ? sample1.Length : sample2.Length;

    var matchingPattern = new StringBuilder();
    var wordHolder = new StringBuilder();

    if (forwardDirection)
    {
        for (var idx = 0; idx < limit; idx++)
        {
            var current = sample1[idx];
            if (current != sample2[idx])
            {
                break;
            }

            if (current == ' ')
            {
                // A matching space confirms the word collected so far.
                matchingPattern.Append(wordHolder.ToString()).Append(' ');
                wordHolder.Clear();
            }
            else
            {
                wordHolder.Append(current);
            }
        }
    }
    else
    {
        // Walk backwards by offset from the end of each string instead of
        // repeatedly trimming the strings, which copied them on every step.
        for (var offset = 1; offset <= limit; offset++)
        {
            var current = sample1[sample1.Length - offset];
            if (current != sample2[sample2.Length - offset])
            {
                break;
            }

            if (current == ' ')
            {
                // Prepend " word" so the result reads left to right.
                matchingPattern.Insert(0, wordHolder.ToString()).Insert(0, ' ');
                wordHolder.Clear();
            }
            else
            {
                wordHolder.Insert(0, current);
            }
        }
    }

    return matchingPattern.ToString();
}

私有静态字符串FindMatchingPattern（字符串样本1、字符串样本2、布尔前进方向）
{
string shorter=string.Empty；
string longer=string.Empty；
if（sample1.Length 0&&shorter[shorter.Length-1]==longer[longer.Length-1]）
{
如果（较短[较短的长度-1]=''）
{
匹配模式。插入（0，“+字夹）；
wordHolder.Clear（）；
}
其他的
wordHolder.Insert（0，更短[shorter.Length-1]）；
更短=更短。删除（更短。长度-1，1）；
更长=更长。删除（更长。长度-1，1）；
}
其他的
{
打破
}
}
}
返回matchingPattern.ToString（）；
}
更新：我修改了示例数据以包含不同类型的测试，并修改了 RemoveCommonPrefixAndSuffix 以处理这些新测试。

我发现，如果前两本书（或者无论主题是什么）以相同的单词开头和/或结尾，那么仅比较前两个字符串来确定公共前缀和后缀就可能出错。
例如：
new List<string>()
{
    "The book named Lord of the Rings 2 is a classic.",
    "The book named Lord of the Flies 2 is a classic.",
    "The book named This is pretty is a classic.",                
    "The book named War and Peace is a classic.",
    "The book named The Three Musketeers is a classic.",                
},

以下是获取测试数据的方法：
/// <summary>
/// Builds the sample data: eight groups of sentences, each exercising a
/// different combination of shared leading and trailing words.
/// </summary>
/// <returns>A newly created list holding the eight sentence groups.</returns>
private static List<List<string>> GetTestSentences()
{
    // Shared words at the start only.
    var prefixOnly = new List<string>
    {
        "I went to The Home Depot",
        "I went to Walgreens",
        "I went to Best Buy",
    };

    // Shared words at the end only.
    var suffixOnly = new List<string>
    {
        "Game of Thrones is a good TV series",
        "Breaking Bad is a good TV series",
        "The Office is a good TV series",
    };

    // Shared words at both ends.
    var prefixAndSuffix = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite",
        "The basketball team New York Knicks are my favorite",
        "The basketball team Chicago Bulls are my favorite",
    };

    // Nothing shared: every sentence is different.
    var nothingShared = new List<string>
    {
        "I went to The Home Depot",
        "Game of Thrones is a good TV series",
        "The basketball team Los Angeles Lakers are my favorite",
    };

    // Identical sentences: no "topic" is left between the shared parts.
    var allIdentical = new List<string>
    {
        "These sentences are all the same",
        "These sentences are all the same",
        "These sentences are all the same",
    };

    // Some sentences have nothing between the shared start and end.
    var sometimesEmptyTopic = new List<string>
    {
        "This sentence has no topic",
        "This sentence [topic here] has no topic",
        "This sentence has no topic",
        "This sentence [another one] has no topic",
    };

    // The first two topics begin with the same words.
    var firstTwoShareBeginning = new List<string>
    {
        "The book named Lord of the Rings is a classic",
        "The book named Lord of the Flies is a classic",
        "The book named This is pretty is a classic",
        "The book named War and Peace is a classic",
        "The book named The Three Musketeers is a classic",
    };

    // The first two topics end with the same word.
    var firstTwoShareEnding = new List<string>
    {
        "The movie named Matrix 2 is very good",
        "The movie named Avatar 2 is very good",
        "The movie named The Sound of Music is very good",
        "The movie named Terminator 2 is very good",
    };

    var groups = new List<List<string>>();
    groups.Add(prefixOnly);
    groups.Add(suffixOnly);
    groups.Add(prefixAndSuffix);
    groups.Add(nothingShared);
    groups.Add(allIdentical);
    groups.Add(sometimesEmptyTopic);
    groups.Add(firstTwoShareBeginning);
    groups.Add(firstTwoShareEnding);
    return groups;
}

private static List gettest句子（）
{
返回新列表
{
//仅前缀测试
新名单
{
“我去了家得宝”，
“我去了沃尔格林”，
“我去了百思买”，
},
//仅后缀测试
新名单
{
《权力的游戏》是一部很好的电视剧，
“打破坏习惯是一部好电视剧”，
“办公室是一部很好的电视连续剧”，
},
//前缀/后缀测试
新名单
{
“洛杉矶湖人队是我的最爱”，
“纽约尼克斯篮球队是我的最爱”，
“芝加哥公牛队是我的最爱”，
},
//没有前缀或后缀-所有句子都不同
新名单
{
“我去了家得宝”，
《权力的游戏》是一部很好的电视剧，
“洛杉矶湖人队是我的最爱”，
},
//所有句子都是相同的-前缀和后缀之间没有“主题”
新名单（）
{
“这些句子都一样”，
“这些句子都一样”，
“这些句子都一样”，
},
//有些句子在前缀和后缀之间没有内容
新名单（）
{
“这句话没有主题”，
“这句话[这里的主题]没有主题”，
“这句话没有主题”，
“这句话[另一句]没有主题”，
},
//前两个主题有共同的开端
新名单（）
{
《指环王》是一部经典之作，
“名为《蝇王》的书是一部经典之作”，
“名为《这是美丽的》的书是一部经典之作”，
“名为《战争与和平》的书是一部经典之作”，
/// <summary>
/// Strips the words that every sentence shares at its start and at its end,
/// leaving only the part that differs between the sentences.
/// </summary>
/// <param name="sentences">Sentences to trim. The list is not modified.</param>
/// <param name="minSeqenceLength">
/// Minimum number of consecutive shared words a prefix or suffix must contain
/// before it is removed; shorter shared runs are left in place.
/// </param>
/// <returns>
/// <c>null</c> when <paramref name="sentences"/> is <c>null</c>; otherwise a
/// new list with one entry per input sentence. The sentences are returned
/// unchanged when there are fewer than two of them or when any sentence has
/// fewer words than <paramref name="minSeqenceLength"/>. Every entry is empty
/// when all sentences are equal, and a single entry is empty when nothing
/// remains of it once the shared words are removed.
/// </returns>
public static List<string> RemoveCommonPrefixAndSuffix(List<string> sentences,
    int minSeqenceLength = 2)
{
    if (sentences == null) return null;

    // A sentence with k spaces has k + 1 words, hence the "- 1".
    if (sentences.Count < 2 ||
        sentences.Any(s => s.Count(c => c == ' ') < minSeqenceLength - 1))
    {
        return sentences.ToList();
    }

    if (sentences.All(s => s == sentences[0]))
    {
        return sentences.Select(s => string.Empty).ToList();
    }

    var sentenceWords = sentences.Select(s => s.Split()).ToList();
    var firstSentence = sentenceWords[0];

    // No shared run can be longer than the shortest sentence.
    var length = sentenceWords.Min(s => s.Length);

    // Count the leading words on which every sentence agrees.
    var prefixWords = 0;
    while (prefixWords < length &&
           sentenceWords.All(s => s[prefixWords] == firstSentence[prefixWords]))
    {
        prefixWords++;
    }

    // Count the trailing words on which every sentence agrees.
    var suffixWords = 0;
    while (suffixWords < length &&
           sentenceWords.All(s => s[s.Length - suffixWords - 1] ==
                                  firstSentence[firstSentence.Length - suffixWords - 1]))
    {
        suffixWords++;
    }

    // Compare word counts directly against the minimum. Counting the spaces
    // of a "word word " style string and comparing with (minimum - 1) let a
    // single shared word through when the minimum was two.
    // Each removed word takes one separating space with it, hence "+ 1".
    var prefixLength = prefixWords >= minSeqenceLength
        ? firstSentence.Take(prefixWords).Sum(w => w.Length + 1)
        : 0;

    var suffixLength = suffixWords >= minSeqenceLength
        ? firstSentence.Skip(firstSentence.Length - suffixWords).Sum(w => w.Length + 1)
        : 0;

    var commonLength = prefixLength + suffixLength;

    // A sentence no longer than the shared parts has no topic left.
    return sentences
        .Select(s => s.Length > commonLength
            ? s.Substring(prefixLength, s.Length - commonLength)
            : string.Empty)
        .ToList();
}

/// <summary>
/// Builds the sample data: eight groups of sentences, each exercising a
/// different combination of shared leading and trailing words.
/// </summary>
/// <returns>A newly created list holding the eight sentence groups.</returns>
private static List<List<string>> GetTestSentences()
{
    // Shared words at the start only.
    var prefixOnly = new List<string>
    {
        "I went to The Home Depot",
        "I went to Walgreens",
        "I went to Best Buy",
    };

    // Shared words at the end only.
    var suffixOnly = new List<string>
    {
        "Game of Thrones is a good TV series",
        "Breaking Bad is a good TV series",
        "The Office is a good TV series",
    };

    // Shared words at both ends.
    var prefixAndSuffix = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite",
        "The basketball team New York Knicks are my favorite",
        "The basketball team Chicago Bulls are my favorite",
    };

    // Nothing shared: every sentence is different.
    var nothingShared = new List<string>
    {
        "I went to The Home Depot",
        "Game of Thrones is a good TV series",
        "The basketball team Los Angeles Lakers are my favorite",
    };

    // Identical sentences: no "topic" is left between the shared parts.
    var allIdentical = new List<string>
    {
        "These sentences are all the same",
        "These sentences are all the same",
        "These sentences are all the same",
    };

    // Some sentences have nothing between the shared start and end.
    var sometimesEmptyTopic = new List<string>
    {
        "This sentence has no topic",
        "This sentence [topic here] has no topic",
        "This sentence has no topic",
        "This sentence [another one] has no topic",
    };

    // The first two topics begin with the same words.
    var firstTwoShareBeginning = new List<string>
    {
        "The book named Lord of the Rings is a classic",
        "The book named Lord of the Flies is a classic",
        "The book named This is pretty is a classic",
        "The book named War and Peace is a classic",
        "The book named The Three Musketeers is a classic",
    };

    // The first two topics end with the same word.
    var firstTwoShareEnding = new List<string>
    {
        "The movie named Matrix 2 is very good",
        "The movie named Avatar 2 is very good",
        "The movie named The Sound of Music is very good",
        "The movie named Terminator 2 is very good",
    };

    var groups = new List<List<string>>();
    groups.Add(prefixOnly);
    groups.Add(suffixOnly);
    groups.Add(prefixAndSuffix);
    groups.Add(nothingShared);
    groups.Add(allIdentical);
    groups.Add(sometimesEmptyTopic);
    groups.Add(firstTwoShareBeginning);
    groups.Add(firstTwoShareEnding);
    return groups;
}

/// <summary>
/// Console driver: runs both trimming approaches over every test list, prints
/// each sentence next to its trimmed form, and reports how long each approach
/// took.
/// </summary>
private static void Main()
{
    var sentenceLists = GetTestSentences();

    // Pad to the longest sentence plus two so the results line up in a column.
    var padLength = sentenceLists.Max(t => t.Max(s => s.Length)) + 2;
    Console.WriteLine("\nComparison Results\n------------------\n");

    // Rufus' solution
    var sw = Stopwatch.StartNew();
    foreach (var sentenceList in sentenceLists)
    {
        var trimmedSentences = RemoveCommonPrefixAndSuffix(sentenceList);

        for (var j = 0; j < trimmedSentences.Count; j++)
        {
            Console.WriteLine("{0} {1}", sentenceList[j].PadRight(padLength, '.'),
                trimmedSentences[j]);
        }

        Console.WriteLine();
    }
    sw.Stop();

    Console.WriteLine($"Rufus' solution took {sw.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    // Prateek's solution
    sw.Restart();
    foreach (var sentenceList in sentenceLists)
    {
        var prefixPattern = string.Empty;
        var suffixPattern = string.Empty;

        // The shared words are derived from the first two sentences; a list
        // with fewer than two has no pair to compare, so nothing is stripped.
        if (sentenceList.Count > 1)
        {
            var prefix = FindMatchingPattern(sentenceList[0], sentenceList[1], true);
            var suffix = FindMatchingPattern(sentenceList[0], sentenceList[1], false);

            // Escape regex metacharacters and anchor each pattern so the text
            // is only removed from the very start / very end of a sentence,
            // not from wherever else the same words happen to appear.
            if (prefix.Length > 0) prefixPattern = "^" + Regex.Escape(prefix);
            if (suffix.Length > 0) suffixPattern = Regex.Escape(suffix) + "$";
        }

        foreach (var item in sentenceList)
        {
            var result = item;

            if (prefixPattern.Length > 0)
            {
                result = Regex.Replace(result, prefixPattern, string.Empty);
            }

            if (suffixPattern.Length > 0)
            {
                result = Regex.Replace(result, suffixPattern, string.Empty);
            }

            Console.WriteLine($"{item.PadRight(padLength, '.')} {result}");
        }

        Console.WriteLine();
    }
    sw.Stop();

    Console.WriteLine($"Prateek's solution took {sw.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    GetKeyFromUser("\nDone!! Press any key to exit...");
}