C# 按顺序搜索匹配单词的字符串列表

C# 按顺序搜索匹配单词的字符串列表,c#,arrays,regex,list,C#,Arrays,Regex,List,我有一个来自总是变化的外部源的字符串的列表 我想搜索每个字符串,在所有字符串之间按顺序找到匹配的单词 然后从每个字符串中删除这组单词,只留下书名 例子 《指环王》是一本经典之作。 《战争与和平》是一本经典之作。 名为《三个火枪手》的书是一部经典之作 名为的书将被删除。 是经典。将被删除。 名为序列的书不会被删除,因为战争与和平不是以序列开始的 序列必须出现在所有字符串之间才能删除 指环王 战争与和平 三个火枪手 这是一个示例列表。我想在字符串上使用它,而不是书名 例如: 我去了家得宝。 我去

我有一个字符串(string)的列表(List),其中的数据来自一个不断变化的外部源。

我想搜索每个字符串,在所有字符串之间按顺序找到匹配的单词

然后从每个字符串中删除这组单词,只留下书名


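例子:

The book named The Lord of the Rings is a classic.
The book named War and Peace is a classic.
The book named The Three Musketeers is a classic.

前缀 “The book named” 将被删除。
后缀 “is a classic.” 将被删除。
单词 “The” 不会被删除,因为 “War and Peace” 并不是以 “The” 开头的。

一个单词序列必须出现在所有字符串中,才会被删除。

结果:

The Lord of the Rings
War and Peace
The Three Musketeers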


这是一个示例列表。我想在字符串上使用它,而不是书名

例如:

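I went to The Home Depot.
I went to Walgreens.
I went to Best Buy.

“I went to” 被删除。

The basketball team Los Angeles Lakers are my favorite.
The basketball team New York Knicks are my favorite.
The basketball team Chicago Bulls are my favorite.

“The basketball team” 被删除。
“are my favorite.” 被删除。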


解决方案

我的想法是从字符串的开头开始搜索,把匹配的单词归为一组,直到遇到第一个不匹配的单词,这样就得到了前缀。

然后从字符串的末尾向前执行相同的操作,以得到后缀。

这样剩下的中间部分就是标题。

但我不知道该怎么做

C#

// Sample input: each sentence wraps a title in the same leading and trailing words.
List<string> sentences = new List<string>()
{
    "The book named The Lord of the Rings is a classic.",
    "The book named War and Peace is a classic.",
    "The book named The Three Musketeers is a classic.",
};

// Receives one extracted title per input sentence, in input order.
List<string> titles = new List<string>();

for (int i = 0; i < sentences.Count; i++)
{
    // Add each extracted title to its own list.
    titles.Add(FindTitle(sentences[i]));
}


String FindTitle(string sentence) 
{
    // Placeholder: the extraction logic has not been written yet. The intended steps are:
    //   1. Compare all strings in the list.
    //   2. Group the common leading word sequence (The book named).
    //   3. Group the common trailing word sequence (is a classic.).
    //   4. Remove those word sequences from each string in the list.
    // Until that exists, every input yields an empty title.
    return string.Empty;
}
List语句=新列表()
{ 
《指环王》是一部经典之作,
《战争与和平》是一本经典之作,
《三剑客》是一部经典之作,
};
列表标题=新列表()
for(int i=0;i<句子数;i++)
{
//将标题添加到自己的列表中
//
标题。添加(FindTitle(第[i]句));
}
字符串FindTitle(字符串语句)
{
string title=string.Empty;
//比较列表中的所有字符串
//组通用字串前缀(书名为)
//组通用单词序列后缀(是一个经典。)
//从列表中的每个字符串中删除这些单词序列
返回标题;
}

以下是我的方法。我选择了性能路线-我想仍然可以优化

编辑:使用regex.Escape帮助解决特殊字符的情况

我用秒表(Stopwatch)对我的方案和 Rufus L 的方案进行了计时比较。

使用 Rufus L 的测试句子作为输入:

/// <summary>
/// Builds the sample input: four groups of sentences, each group sharing some leading
/// and/or trailing words around a varying middle part.
/// </summary>
/// <returns>A new list of sentence groups on every call.</returns>
private static List<List<string>> GetTestSentences()
{
    // Shared prefix and shared suffix around a book title.
    var bookSentences = new List<string>
    {
        "The book named The Lord of the Rings is a classic.",
        "The book named War and Peace is a classic.",
        "The book named The Three Musketeers is a classic.",
    };

    // Shared prefix; only the final period is shared at the end.
    var storeSentences = new List<string>
    {
        "I went to The Home Depot.",
        "I went to Walgreens.",
        "I went to Best Buy.",
    };

    // Shared prefix and shared suffix around a team name.
    var teamSentences = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite.",
        "The basketball team New York Knicks are my favorite.",
        "The basketball team Chicago Bulls are my favorite.",
    };

    // Suffix containing characters that are special in regular expressions.
    var parenthesisedSentences = new List<string>
    {
        "The book named Lord of the Flies is a classic (500 This is a test)",
        "The book named Wuthering Heights is a classic (500 This is a test)",
        "The book named Great Expectations is a classic (500 This is a test)",
        "The book named The Lord of the Rings is a classic (500 This is a test)",
        "The book named War and Peace is a classic (500 This is a test)",
    };

    return new List<List<string>>
    {
        bookSentences,
        storeSentences,
        teamSentences,
        parenthesisedSentences,
    };
}
下面是神奇的方法:

/// <summary>
/// Finds the run of whole words that two strings share at their start or at their end.
/// </summary>
/// <remarks>
/// Characters are compared one by one and the scan stops at the first difference or when
/// the shorter string is exhausted. Only words delimited by a matched space are kept:
/// scanning forward, a word counts once the space after it has matched; scanning backward,
/// once the space before it has matched. Matched characters that are not closed off by a
/// space (for example the last word when one input is a prefix of the other) are dropped.
/// </remarks>
/// <param name="sample1">First string to compare.</param>
/// <param name="sample2">Second string to compare.</param>
/// <param name="forwardDirection">
/// <c>true</c> to look for a common prefix, <c>false</c> to look for a common suffix.
/// </param>
/// <returns>
/// The common prefix including its trailing space, or the common suffix including its
/// leading space; an empty string when no whole word is shared.
/// </returns>
/// <exception cref="ArgumentNullException">Either sample is <c>null</c>.</exception>
private static string FindMatchingPattern(string sample1, string sample2, bool forwardDirection)
{
    if (sample1 == null) throw new ArgumentNullException(nameof(sample1));
    if (sample2 == null) throw new ArgumentNullException(nameof(sample2));

    // Only the overlapping length can match, so the shorter input bounds the scan.
    int limit = Math.Min(sample1.Length, sample2.Length);

    if (forwardDirection)
    {
        // Index of the last matched space; everything up to and including it is the prefix.
        int lastSpace = -1;

        for (int idx = 0; idx < limit; idx++)
        {
            char current = sample1[idx];
            if (current != sample2[idx])
            {
                break;
            }

            if (current == ' ')
            {
                lastSpace = idx;
            }
        }

        return sample1.Substring(0, lastSpace + 1);
    }

    // Distance from the end of the leftmost matched space; everything from it onward is the
    // suffix. Walking by offset avoids re-allocating both strings on every character.
    int spaceOffset = 0;

    for (int offset = 1; offset <= limit; offset++)
    {
        char current = sample1[sample1.Length - offset];
        if (current != sample2[sample2.Length - offset])
        {
            break;
        }

        if (current == ' ')
        {
            spaceOffset = offset;
        }
    }

    return sample1.Substring(sample1.Length - spaceOffset);
}
私有静态字符串FindMatchingPattern(字符串样本1、字符串样本2、布尔前进方向)
{
string shorter=string.Empty;
string longer=string.Empty;
if(sample1.Length 0&&shorter[shorter.Length-1]==longer[longer.Length-1])
{
如果(较短[较短的长度-1]='')
{
匹配模式。插入(0,“+字夹);
wordHolder.Clear();
}
其他的
wordHolder.Insert(0,更短[shorter.Length-1]);
更短=更短。删除(更短。长度-1,1);
更长=更长。删除(更长。长度-1,1);
}
其他的
{
打破
}
}
}
返回matchingPattern.ToString();
}

更新:我修改了示例数据以包含更多类型的测试,并修改了 RemoveCommonPrefixAndSuffix 以处理这些新测试。


我发现,如果前两本书(或无论主题是什么)以相同的单词开头和/或结尾,那么仅仅比较前两个字符串的共同前缀和后缀可能是一个错误

例如:

new List<string>()
{
    "The book named Lord of the Rings 2 is a classic.",
    "The book named Lord of the Flies 2 is a classic.",
    "The book named This is pretty is a classic.",                
    "The book named War and Peace is a classic.",
    "The book named The Three Musketeers is a classic.",                
},
以下是获取测试数据的方法:

/// <summary>
/// Builds the sample input: eight groups of sentences, each covering one scenario
/// (named by the comment above each group).
/// </summary>
/// <returns>A new list of sentence groups on every call.</returns>
private static List<List<string>> GetTestSentences()
{
    // Shared prefix only.
    var prefixOnly = new List<string>
    {
        "I went to The Home Depot",
        "I went to Walgreens",
        "I went to Best Buy",
    };

    // Shared suffix only.
    var suffixOnly = new List<string>
    {
        "Game of Thrones is a good TV series",
        "Breaking Bad is a good TV series",
        "The Office is a good TV series",
    };

    // Shared prefix and shared suffix.
    var prefixAndSuffix = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite",
        "The basketball team New York Knicks are my favorite",
        "The basketball team Chicago Bulls are my favorite",
    };

    // Nothing shared: every sentence is different.
    var nothingShared = new List<string>
    {
        "I went to The Home Depot",
        "Game of Thrones is a good TV series",
        "The basketball team Los Angeles Lakers are my favorite",
    };

    // Every sentence identical, so there is no "topic" between prefix and suffix.
    var allIdentical = new List<string>
    {
        "These sentences are all the same",
        "These sentences are all the same",
        "These sentences are all the same",
    };

    // Some sentences have nothing between prefix and suffix.
    var someWithoutTopic = new List<string>
    {
        "This sentence has no topic",
        "This sentence [topic here] has no topic",
        "This sentence has no topic",
        "This sentence [another one] has no topic",
    };

    // The first two topics begin with the same words.
    var topicsShareBeginning = new List<string>
    {
        "The book named Lord of the Rings is a classic",
        "The book named Lord of the Flies is a classic",
        "The book named This is pretty is a classic",
        "The book named War and Peace is a classic",
        "The book named The Three Musketeers is a classic",
    };

    // The first two topics end with the same word.
    var topicsShareEnding = new List<string>
    {
        "The movie named Matrix 2 is very good",
        "The movie named Avatar 2 is very good",
        "The movie named The Sound of Music is very good",
        "The movie named Terminator 2 is very good",
    };

    return new List<List<string>>
    {
        prefixOnly,
        suffixOnly,
        prefixAndSuffix,
        nothingShared,
        allIdentical,
        someWithoutTopic,
        topicsShareBeginning,
        topicsShareEnding,
    };
}
private static List gettest句子()
{
返回新列表
{
//仅前缀测试
新名单
{
“我去了家得宝”,
“我去了沃尔格林”,
“我去了百思买”,
},
//仅后缀测试
新名单
{
《权力的游戏》是一部很好的电视剧,
“打破坏习惯是一部好电视剧”,
“办公室是一部很好的电视连续剧”,
},
//前缀/后缀测试
新名单
{
“洛杉矶湖人队是我的最爱”,
“纽约尼克斯篮球队是我的最爱”,
“芝加哥公牛队是我的最爱”,
},
//没有前缀或后缀-所有句子都不同
新名单
{
“我去了家得宝”,
《权力的游戏》是一部很好的电视剧,
“洛杉矶湖人队是我的最爱”,
},
//所有句子都是相同的-前缀和后缀之间没有“主题”
新名单()
{
“这些句子都一样”,
“这些句子都一样”,
“这些句子都一样”,
},
//有些句子在前缀和后缀之间没有内容
新名单()
{
“这句话没有主题”,
“这句话[这里的主题]没有主题”,
“这句话没有主题”,
“这句话[另一句]没有主题”,
},
//前两个主题有共同的开端
新名单()
{
《指环王》是一部经典之作,
“名为《蝇王》的书是一部经典之作”,
“名为《这是美丽的》的书是一部经典之作”,
“名为《战争与和平》的书是一部经典之作”,
/// <summary>
/// Strips the leading and trailing word sequences that every sentence shares, leaving the
/// part of each sentence that lies between them.
/// </summary>
/// <param name="sentences">Sentences to trim. The list itself is not modified.</param>
/// <param name="minSeqenceLength">
/// Minimum number of words a shared prefix or suffix must contain before it is removed.
/// </param>
/// <returns>
/// <c>null</c> when <paramref name="sentences"/> is <c>null</c>. A copy of the input when it
/// holds fewer than two sentences or any sentence has fewer than
/// <paramref name="minSeqenceLength"/> words. A list of empty strings when all sentences are
/// identical. Otherwise a new list, in input order, with the shared prefix and suffix removed;
/// a sentence with nothing left between them becomes an empty string.
/// </returns>
public static List<string> RemoveCommonPrefixAndSuffix(List<string> sentences,
    int minSeqenceLength = 2)
{
    if (sentences == null) return null;

    // A sentence of n words contains n - 1 spaces, hence the "- 1" here.
    if (sentences.Count < 2 ||
        sentences.Any(s => s.Count(c => c == ' ') < minSeqenceLength - 1))
    {
        return sentences.ToList();
    }

    if (sentences.All(s => s == sentences[0]))
    {
        return sentences.Select(s => string.Empty).ToList();
    }

    var sentenceWords = sentences.Select(s => s.Split()).ToList();
    var firstSentence = sentenceWords[0];

    // No shared sequence can be longer than the shortest sentence.
    var maxWords = sentenceWords.Min(words => words.Length);

    // Count shared leading words; each contributes its length plus one separator character.
    var prefixWordCount = 0;
    var prefixLength = 0;
    for (var i = 0; i < maxWords; i++)
    {
        var index = i;
        if (!sentenceWords.All(words => words[index] == firstSentence[index])) break;

        prefixLength += firstSentence[index].Length + 1;
        prefixWordCount++;
    }

    // Same from the end of each sentence.
    var suffixWordCount = 0;
    var suffixLength = 0;
    for (var i = 0; i < maxWords; i++)
    {
        var fromEnd = i + 1;
        var candidate = firstSentence[firstSentence.Length - fromEnd];
        if (!sentenceWords.All(words => words[words.Length - fromEnd] == candidate)) break;

        suffixLength += candidate.Length + 1;
        suffixWordCount++;
    }

    // A sequence shorter than the requested number of words is left in place. The word
    // count is compared directly: counting spaces in the assembled text would be one too
    // generous, because that text carries one separator per word.
    if (prefixWordCount < minSeqenceLength) prefixLength = 0;
    if (suffixWordCount < minSeqenceLength) suffixLength = 0;

    var commonLength = prefixLength + suffixLength;

    // When prefix and suffix overlap in a short sentence there is nothing between them.
    return sentences
        .Select(s => s.Length > commonLength
            ? s.Substring(prefixLength, s.Length - commonLength)
            : string.Empty)
        .ToList();
}
/// <summary>
/// Builds the sample input: eight groups of sentences, each covering one scenario
/// (named by the comment above each group).
/// </summary>
/// <returns>A new list of sentence groups on every call.</returns>
private static List<List<string>> GetTestSentences()
{
    // Shared prefix only.
    var prefixOnly = new List<string>
    {
        "I went to The Home Depot",
        "I went to Walgreens",
        "I went to Best Buy",
    };

    // Shared suffix only.
    var suffixOnly = new List<string>
    {
        "Game of Thrones is a good TV series",
        "Breaking Bad is a good TV series",
        "The Office is a good TV series",
    };

    // Shared prefix and shared suffix.
    var prefixAndSuffix = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite",
        "The basketball team New York Knicks are my favorite",
        "The basketball team Chicago Bulls are my favorite",
    };

    // Nothing shared: every sentence is different.
    var nothingShared = new List<string>
    {
        "I went to The Home Depot",
        "Game of Thrones is a good TV series",
        "The basketball team Los Angeles Lakers are my favorite",
    };

    // Every sentence identical, so there is no "topic" between prefix and suffix.
    var allIdentical = new List<string>
    {
        "These sentences are all the same",
        "These sentences are all the same",
        "These sentences are all the same",
    };

    // Some sentences have nothing between prefix and suffix.
    var someWithoutTopic = new List<string>
    {
        "This sentence has no topic",
        "This sentence [topic here] has no topic",
        "This sentence has no topic",
        "This sentence [another one] has no topic",
    };

    // The first two topics begin with the same words.
    var topicsShareBeginning = new List<string>
    {
        "The book named Lord of the Rings is a classic",
        "The book named Lord of the Flies is a classic",
        "The book named This is pretty is a classic",
        "The book named War and Peace is a classic",
        "The book named The Three Musketeers is a classic",
    };

    // The first two topics end with the same word.
    var topicsShareEnding = new List<string>
    {
        "The movie named Matrix 2 is very good",
        "The movie named Avatar 2 is very good",
        "The movie named The Sound of Music is very good",
        "The movie named Terminator 2 is very good",
    };

    return new List<List<string>>
    {
        prefixOnly,
        suffixOnly,
        prefixAndSuffix,
        nothingShared,
        allIdentical,
        someWithoutTopic,
        topicsShareBeginning,
        topicsShareEnding,
    };
}
/// <summary>
/// Runs both solutions over every test group, prints each sentence next to its trimmed
/// form, and reports the elapsed milliseconds for each solution.
/// </summary>
private static void Main()
{
    var testGroups = GetTestSentences();

    // Pad to the longest sentence plus two so the results line up in one column.
    var padLength = testGroups.Max(group => group.Max(text => text.Length)) + 2;
    Console.WriteLine("\nComparison Results\n------------------\n");

    // Rufus' solution
    var sw = Stopwatch.StartNew();
    foreach (var group in testGroups)
    {
        var trimmed = RemoveCommonPrefixAndSuffix(group);

        var row = 0;
        while (row < trimmed.Count)
        {
            var padded = group[row].PadRight(padLength, '.');
            Console.WriteLine("{0} {1}", padded, trimmed[row]);
            row++;
        }

        Console.WriteLine();
    }
    sw.Stop();

    Console.WriteLine($"Rufus' solution took {sw.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    // Prateek's solution
    sw.Restart();
    foreach (var group in testGroups)
    {
        // The pattern is derived from the first two sentences only.
        var first = group[0];
        var second = group[1];

        var prefix = FindMatchingPattern(first, second, true);
        var suffix = FindMatchingPattern(first, second, false);

        // Escape so the literal text can be used as a regular expression pattern.
        prefix = prefix.Length > 0 ? Regex.Escape(prefix) : prefix;
        suffix = suffix.Length > 0 ? Regex.Escape(suffix) : suffix;

        foreach (var item in group)
        {
            var withoutPrefix = Regex.Replace(item, prefix, string.Empty);
            var result = Regex.Replace(withoutPrefix, suffix, string.Empty);
            Console.WriteLine($"{item.PadRight(padLength, '.')} {result}");
        }

        Console.WriteLine();
    }
    sw.Stop();

    Console.WriteLine($"Prateek's solution took {sw.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    GetKeyFromUser("\nDone!! Press any key to exit...");
}