C# 按顺序搜索匹配单词的字符串列表
我有一个来自总是变化的外部源的C# 按顺序搜索匹配单词的字符串列表,c#,arrays,regex,list,C#,Arrays,Regex,List,我有一个来自总是变化的外部源的字符串的列表 我想搜索每个字符串,在所有字符串之间按顺序找到匹配的单词 然后从每个字符串中删除这组单词,只留下书名 例子 《指环王》是一本经典之作。 《战争与和平》是一本经典之作。 名为《三个火枪手》的书是一部经典之作 名为的书将被删除。 是经典。将被删除。 名为序列的书不会被删除,因为战争与和平不是以序列开始的 序列必须出现在所有字符串之间才能删除 指环王 战争与和平 三个火枪手 这是一个示例列表。我想在字符串上使用它,而不是书名 例如: 我去了家得宝。 我去
字符串的列表
我想搜索每个字符串,在所有字符串之间按顺序找到匹配的单词
然后从每个字符串中删除这组单词,只留下书名
例子
名为《指环王》的书是一部经典之作。
名为《战争与和平》的书是一部经典之作。
名为《三个火枪手》的书是一部经典之作
“名为……的书”(The book named)将被删除。
“是一部经典之作。”(is a classic.)将被删除。
“The book named The”这一序列不会被删除,因为《战争与和平》(War and Peace)不是以“The”开头的。
一个序列必须出现在所有字符串中才会被删除。
指环王
战争与和平
三个火枪手
这是一个示例列表。我想在字符串上使用它,而不是书名
例如:
我去了家得宝。
我去了沃尔格林。
我去了百思买
“我去了”(I went to)将被删除。
篮球队洛杉矶湖人队是我的最爱。
篮球队纽约尼克斯队是我的最爱。
篮球队芝加哥公牛队是我的最爱。
“篮球队”(The basketball team)将被删除。
“是我的最爱。”(are my favorite.)将被删除。
解决方案
我的想法是从一开始就搜索字符串,将匹配的单词分组,直到找到一个不匹配的单词,然后找到前缀
然后从字符串的末尾向后执行相同的操作,以查找后缀
它会揭示标题在中间。
但我不知道该怎么做
C#
// Sample input: every sentence wraps a title in the same leading and trailing words.
List<string> sentences = new List<string>()
{
    "The book named The Lord of the Rings is a classic.",
    "The book named War and Peace is a classic.",
    "The book named The Three Musketeers is a classic.",
};

// Receives one extracted title per input sentence.
List<string> titles = new List<string>();

foreach (string sentence in sentences)
{
    // Add Titles to their own List
    titles.Add(FindTitle(sentence));
}

// Placeholder from the question: the extraction logic is only outlined in the
// comments below, so this currently returns an empty string for every sentence.
string FindTitle(string sentence)
{
    string title = string.Empty;

    // compare all strings in List
    // group common word sequences prefix (The book named)
    // group common word sequences suffix (is a classic.)
    // remove those word sequences from each string in List

    return title;
}
List语句=新列表()
{
《指环王》是一部经典之作,
《战争与和平》是一本经典之作,
《三剑客》是一部经典之作,
};
列表标题=新列表()
for(int i=0;i<句子数;i++)
{
//将标题添加到自己的列表中
//
标题。添加(FindTitle(第[i]句));
}
字符串FindTitle(字符串语句)
{
string title=string.Empty;
//比较列表中的所有字符串
//组通用字串前缀(书名为)
//组通用单词序列后缀(是一个经典。)
//从列表中的每个字符串中删除这些单词序列
返回标题;
}
以下是我的方法。我选择了性能路线-我想仍然可以优化
编辑:使用regex.Escape帮助解决特殊字符的情况
用 Stopwatch 对我的方案和 Rufus L 的方案进行计时对比
使用 Rufus L 的测试句子作为输入:
/// <summary>
/// Builds the sample data: four groups of sentences, where the sentences inside a
/// group share leading and/or trailing words around a differing middle part.
/// </summary>
/// <returns>A new list containing the four sentence groups, in a fixed order.</returns>
private static List<List<string>> GetTestSentences()
{
    var bookSentences = new List<string>
    {
        "The book named The Lord of the Rings is a classic.",
        "The book named War and Peace is a classic.",
        "The book named The Three Musketeers is a classic.",
    };

    var storeSentences = new List<string>
    {
        "I went to The Home Depot.",
        "I went to Walgreens.",
        "I went to Best Buy."
    };

    var teamSentences = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite.",
        "The basketball team New York Knicks are my favorite.",
        "The basketball team Chicago Bulls are my favorite."
    };

    // The trailing text here contains parentheses and digits.
    var bookSentencesWithParentheses = new List<string>
    {
        "The book named Lord of the Flies is a classic (500 This is a test)",
        "The book named Wuthering Heights is a classic (500 This is a test)",
        "The book named Great Expectations is a classic (500 This is a test)",
        "The book named The Lord of the Rings is a classic (500 This is a test)",
        "The book named War and Peace is a classic (500 This is a test)"
    };

    var groups = new List<List<string>>();
    groups.Add(bookSentences);
    groups.Add(storeSentences);
    groups.Add(teamSentences);
    groups.Add(bookSentencesWithParentheses);
    return groups;
}
下面是神奇的方法:
/// <summary>
/// Finds the run of whole words that two strings have in common at their start
/// (forward) or at their end (backward).
/// </summary>
/// <param name="sample1">First string to compare.</param>
/// <param name="sample2">Second string to compare.</param>
/// <param name="forwardDirection">
/// <c>true</c> to compare from the first character onward; <c>false</c> to compare
/// from the last character backward.
/// </param>
/// <returns>
/// Forward: the shared leading words, each followed by a space (e.g. "The book named ").
/// Backward: the shared trailing words, each preceded by a space (e.g. " is a classic.").
/// A word is only included once the space that delimits it has also matched, so a
/// partially matching word, or a word cut off by the end of the shorter string, is
/// left out. Returns an empty string when nothing qualifies.
/// </returns>
private static string FindMatchingPattern(string sample1, string sample2, bool forwardDirection)
{
    // Only the overlapping region can match, so the shorter length bounds the scan.
    var limit = sample1.Length <= sample2.Length ? sample1.Length : sample2.Length;

    var matchingPattern = new StringBuilder();

    // Collects the characters of the word currently being matched; it is only
    // committed to matchingPattern when the delimiting space is reached.
    var wordHolder = new StringBuilder();

    if (forwardDirection)
    {
        for (var idx = 0; idx < limit; idx++)
        {
            var current = sample1[idx];
            if (current != sample2[idx])
            {
                break;
            }

            if (current == ' ')
            {
                matchingPattern.Append(wordHolder + " ");
                wordHolder.Clear();
            }
            else
            {
                wordHolder.Append(current);
            }
        }
    }
    else
    {
        // Walk both strings from their last character using an offset instead of
        // repeatedly trimming the strings, which would copy them on every step.
        for (var offset = 1; offset <= limit; offset++)
        {
            var current = sample1[sample1.Length - offset];
            if (current != sample2[sample2.Length - offset])
            {
                break;
            }

            if (current == ' ')
            {
                matchingPattern.Insert(0, " " + wordHolder);
                wordHolder.Clear();
            }
            else
            {
                wordHolder.Insert(0, current);
            }
        }
    }

    return matchingPattern.ToString();
}
私有静态字符串FindMatchingPattern(字符串样本1、字符串样本2、布尔前进方向)
{
string shorter=string.Empty;
string longer=string.Empty;
if(sample1.Length 0&&shorter[shorter.Length-1]==longer[longer.Length-1])
{
如果(较短[较短的长度-1]='')
{
匹配模式。插入(0,“+字夹);
wordHolder.Clear();
}
其他的
wordHolder.Insert(0,更短[shorter.Length-1]);
更短=更短。删除(更短。长度-1,1);
更长=更长。删除(更长。长度-1,1);
}
其他的
{
打破
}
}
}
返回matchingPattern.ToString();
}
更新我修改了示例数据以包含不同类型的测试,并修改了RemoveCommonPrefixAndSuffix
以处理这些新测试
我发现,如果前两本书(或无论主题是什么)以相同的单词开头和/或结尾,那么仅仅比较前两个字符串的共同前缀和后缀可能是一个错误
例如:
// Counter-example for comparing only the first two strings: the first two titles
// both begin with "Lord of the" and end with "2", so a pairwise comparison would
// treat those words as part of the common prefix/suffix.
new List<string>()
{
"The book named Lord of the Rings 2 is a classic.",
"The book named Lord of the Flies 2 is a classic.",
"The book named This is pretty is a classic.",
"The book named War and Peace is a classic.",
"The book named The Three Musketeers is a classic.",
},
以下是获取测试数据的方法:
/// <summary>
/// Builds the sample data: eight groups of sentences, each group set up to exercise
/// one scenario for stripping a shared prefix and/or suffix.
/// </summary>
/// <returns>A new list containing the eight sentence groups, in a fixed order.</returns>
private static List<List<string>> GetTestSentences()
{
    // Prefix-only test
    var prefixOnly = new List<string>
    {
        "I went to The Home Depot",
        "I went to Walgreens",
        "I went to Best Buy",
    };

    // Suffix-only test
    var suffixOnly = new List<string>
    {
        "Game of Thrones is a good TV series",
        "Breaking Bad is a good TV series",
        "The Office is a good TV series",
    };

    // Prefix / Suffix test
    var prefixAndSuffix = new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite",
        "The basketball team New York Knicks are my favorite",
        "The basketball team Chicago Bulls are my favorite",
    };

    // No prefix or suffix - all sentences are different
    var nothingInCommon = new List<string>
    {
        "I went to The Home Depot",
        "Game of Thrones is a good TV series",
        "The basketball team Los Angeles Lakers are my favorite",
    };

    // All sentences are the same - no "topic" between prefix and suffix
    var allIdentical = new List<string>
    {
        "These sentences are all the same",
        "These sentences are all the same",
        "These sentences are all the same",
    };

    // Some sentences have no content between prefix and suffix
    var someWithoutTopic = new List<string>
    {
        "This sentence has no topic",
        "This sentence [topic here] has no topic",
        "This sentence has no topic",
        "This sentence [another one] has no topic",
    };

    // First two topics have common beginnings
    var sharedTopicBeginnings = new List<string>
    {
        "The book named Lord of the Rings is a classic",
        "The book named Lord of the Flies is a classic",
        "The book named This is pretty is a classic",
        "The book named War and Peace is a classic",
        "The book named The Three Musketeers is a classic",
    };

    // The first two topics have a common ending
    var sharedTopicEndings = new List<string>
    {
        "The movie named Matrix 2 is very good",
        "The movie named Avatar 2 is very good",
        "The movie named The Sound of Music is very good",
        "The movie named Terminator 2 is very good",
    };

    var groups = new List<List<string>>();
    groups.Add(prefixOnly);
    groups.Add(suffixOnly);
    groups.Add(prefixAndSuffix);
    groups.Add(nothingInCommon);
    groups.Add(allIdentical);
    groups.Add(someWithoutTopic);
    groups.Add(sharedTopicBeginnings);
    groups.Add(sharedTopicEndings);
    return groups;
}
private static List gettest句子()
{
返回新列表
{
//仅前缀测试
新名单
{
“我去了家得宝”,
“我去了沃尔格林”,
“我去了百思买”,
},
//仅后缀测试
新名单
{
《权力的游戏》是一部很好的电视剧,
“打破坏习惯是一部好电视剧”,
“办公室是一部很好的电视连续剧”,
},
//前缀/后缀测试
新名单
{
“洛杉矶湖人队是我的最爱”,
“纽约尼克斯篮球队是我的最爱”,
“芝加哥公牛队是我的最爱”,
},
//没有前缀或后缀-所有句子都不同
新名单
{
“我去了家得宝”,
《权力的游戏》是一部很好的电视剧,
“洛杉矶湖人队是我的最爱”,
},
//所有句子都是相同的-前缀和后缀之间没有“主题”
新名单()
{
“这些句子都一样”,
“这些句子都一样”,
“这些句子都一样”,
},
//有些句子在前缀和后缀之间没有内容
新名单()
{
“这句话没有主题”,
“这句话[这里的主题]没有主题”,
“这句话没有主题”,
“这句话[另一句]没有主题”,
},
//前两个主题有共同的开端
新名单()
{
《指环王》是一部经典之作,
“名为《蝇王》的书是一部经典之作”,
“名为《这是美丽的》的书是一部经典之作”,
“名为《战争与和平》的书是一部经典之作”,
/// <summary>
/// Removes the sequence of words that every sentence shares at its start and the
/// sequence that every sentence shares at its end, leaving the part in between.
/// </summary>
/// <param name="sentences">Sentences to trim; the list itself is not modified.</param>
/// <param name="minSeqenceLength">
/// Minimum number of consecutive shared words a prefix or suffix must contain
/// before it is removed. Shorter shared runs are left in place.
/// </param>
/// <returns>
/// <c>null</c> when <paramref name="sentences"/> is <c>null</c>; a copy of the input
/// when there are fewer than two sentences or any sentence has fewer than
/// <paramref name="minSeqenceLength"/> space-separated words; empty strings when all
/// sentences are identical; otherwise one trimmed string per sentence, which is empty
/// when the sentence is no longer than the shared prefix plus suffix.
/// </returns>
public static List<string> RemoveCommonPrefixAndSuffix(List<string> sentences,
    int minSeqenceLength = 2)
{
    if (sentences == null) return null;

    if (sentences.Count < 2 ||
        sentences.Any(s => s.Count(c => c == ' ') < minSeqenceLength - 1))
    {
        return sentences.ToList();
    }

    if (sentences.All(s => s == sentences[0]))
    {
        return sentences.Select(s => string.Empty).ToList();
    }

    var sentenceWords = sentences.Select(s => s.Split()).ToList();
    var firstSentence = sentenceWords[0];

    // No shared run can be longer than the sentence with the fewest words.
    var maxWords = sentenceWords.Min(words => words.Length);

    // Count leading words that are identical across every sentence.
    var prefixWords = 0;
    while (prefixWords < maxWords &&
           sentenceWords.All(words => words[prefixWords] == firstSentence[prefixWords]))
    {
        prefixWords++;
    }

    // Count trailing words that are identical across every sentence.
    var suffixWords = 0;
    while (suffixWords < maxWords &&
           sentenceWords.All(words => words[words.Length - suffixWords - 1] ==
                                      firstSentence[firstSentence.Length - suffixWords - 1]))
    {
        suffixWords++;
    }

    // Compare the number of shared WORDS with the minimum. Counting the spaces of a
    // "word + space" string and comparing with (minimum - 1) would let a run that is
    // one word too short slip through.
    // Each word is measured together with one separator: the space after it for the
    // prefix, the space before it for the suffix.
    var prefixLength = prefixWords >= minSeqenceLength
        ? firstSentence.Take(prefixWords).Sum(word => word.Length + 1)
        : 0;
    var suffixLength = suffixWords >= minSeqenceLength
        ? firstSentence.Skip(firstSentence.Length - suffixWords).Sum(word => word.Length + 1)
        : 0;

    var commonLength = prefixLength + suffixLength;

    return sentences
        .Select(s => s.Length > commonLength
            ? s.Substring(prefixLength, s.Length - commonLength)
            : string.Empty)
        .ToList();
}
/// <summary>
/// Supplies the test data for the comparison run: eight groups of sentences, one
/// group per prefix/suffix scenario.
/// </summary>
/// <returns>A newly built list of the eight sentence groups, in a fixed order.</returns>
private static List<List<string>> GetTestSentences()
{
    var scenarios = new List<List<string>>();

    // Prefix-only test
    scenarios.Add(new List<string>
    {
        "I went to The Home Depot",
        "I went to Walgreens",
        "I went to Best Buy",
    });

    // Suffix-only test
    scenarios.Add(new List<string>
    {
        "Game of Thrones is a good TV series",
        "Breaking Bad is a good TV series",
        "The Office is a good TV series",
    });

    // Prefix / Suffix test
    scenarios.Add(new List<string>
    {
        "The basketball team Los Angeles Lakers are my favorite",
        "The basketball team New York Knicks are my favorite",
        "The basketball team Chicago Bulls are my favorite",
    });

    // No prefix or suffix - all sentences are different
    scenarios.Add(new List<string>
    {
        "I went to The Home Depot",
        "Game of Thrones is a good TV series",
        "The basketball team Los Angeles Lakers are my favorite",
    });

    // All sentences are the same - no "topic" between prefix and suffix
    scenarios.Add(new List<string>
    {
        "These sentences are all the same",
        "These sentences are all the same",
        "These sentences are all the same",
    });

    // Some sentences have no content between prefix and suffix
    scenarios.Add(new List<string>
    {
        "This sentence has no topic",
        "This sentence [topic here] has no topic",
        "This sentence has no topic",
        "This sentence [another one] has no topic",
    });

    // First two topics have common beginnings
    scenarios.Add(new List<string>
    {
        "The book named Lord of the Rings is a classic",
        "The book named Lord of the Flies is a classic",
        "The book named This is pretty is a classic",
        "The book named War and Peace is a classic",
        "The book named The Three Musketeers is a classic",
    });

    // The first two topics have a common ending
    scenarios.Add(new List<string>
    {
        "The movie named Matrix 2 is very good",
        "The movie named Avatar 2 is very good",
        "The movie named The Sound of Music is very good",
        "The movie named Terminator 2 is very good",
    });

    return scenarios;
}
/// <summary>
/// Console driver: runs both trimming approaches over every test group, prints each
/// original sentence beside its trimmed result, and reports the elapsed milliseconds
/// measured for each approach.
/// </summary>
private static void Main()
{
    var testGroups = GetTestSentences();

    // Longest sentence across all groups plus two; used as the dot-padded column width.
    var columnWidth = testGroups.Max(group => group.Max(text => text.Length)) + 2;

    Console.WriteLine("\nComparison Results\n------------------\n");

    // Rufus' solution
    var timer = Stopwatch.StartNew();

    foreach (var group in testGroups)
    {
        var stripped = RemoveCommonPrefixAndSuffix(group);

        var row = 0;
        while (row < stripped.Count)
        {
            var paddedOriginal = group[row].PadRight(columnWidth, '.');
            Console.WriteLine("{0} {1}", paddedOriginal, stripped[row]);
            row++;
        }

        Console.WriteLine();
    }

    timer.Stop();
    Console.WriteLine($"Rufus' solution took {timer.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    // Prateek's solution
    timer.Restart();

    foreach (var group in testGroups)
    {
        // The patterns are derived from the first two sentences of the group only.
        var leading = FindMatchingPattern(group[0], group[1], true);
        var trailing = FindMatchingPattern(group[0], group[1], false);

        // Non-empty patterns are escaped so regex metacharacters are matched literally.
        var leadingPattern = leading.Length > 0 ? Regex.Escape(leading) : leading;
        var trailingPattern = trailing.Length > 0 ? Regex.Escape(trailing) : trailing;

        foreach (var text in group)
        {
            var withoutLeading = Regex.Replace(text, leadingPattern, string.Empty);
            var trimmed = Regex.Replace(withoutLeading, trailingPattern, string.Empty);
            Console.WriteLine($"{text.PadRight(columnWidth, '.')} {trimmed}");
        }

        Console.WriteLine();
    }

    timer.Stop();
    Console.WriteLine($"Prateek's solution took {timer.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    GetKeyFromUser("\nDone!! Press any key to exit...");
}