C# 从字符串进行英语词典单词匹配_C#_.net_Linq

C# 从字符串进行英语词典单词匹配

c# .net linq

C# 从字符串进行英语词典单词匹配,c#,.net,linq,C#,.net,Linq,我正试图解决一个问题，即从字典文件中找出与给定字符串最匹配的英语单词例如（“行”是字典中的单词列表）：显然，“day/cake”最适合这个字符串，但是如果我在字符串中引入第三个单词，例如“cakedaynow”，它就不太好用了我知道这个例子很原始，它更像是一个概念证明，我想知道是否有人有过这种类型的字符串分析的经验谢谢您需要研究适合您所要做的事情的算法类。从维基百科开始另外，这里有一个C#中的Levenshtein编辑距离实现，让您开始： using System; namespa

我正试图解决一个问题，即从字典文件中找出与给定字符串最匹配的英语单词

例如（“行”是字典中的单词列表）：

显然，“day/cake”最适合这个字符串，但是如果我在字符串中引入第三个单词，例如“cakedaynow”，它就不太好用了

我知道这个例子很原始，它更像是一个概念证明，我想知道是否有人有过这种类型的字符串分析的经验

谢谢

您需要研究适合您所要做的事情的算法类。从维基百科开始

另外，这里有一个C#中的Levenshtein编辑距离实现，让您开始：

using System;

namespace StringMatching
{
    /// <summary>
    /// Extension methods that measure how far apart two strings are.
    /// </summary>
    public static class LevenshteinDistanceStringExtension
    {
        /// <summary>
        /// Computes the Levenshtein edit distance between two strings: the minimum number of
        /// single-character insertions, deletions and substitutions that turn one into the other.
        /// A null string is treated like an empty one.
        /// </summary>
        /// <param name="strA">The string the method is invoked on.</param>
        /// <param name="strB">The string to compare against.</param>
        /// <returns>The edit distance; 0 when both strings are null or empty.</returns>
        public static int GetLevenshteinDistance(this string strA, string strB)
        {
            bool firstIsBlank = string.IsNullOrEmpty(strA);
            bool secondIsBlank = string.IsNullOrEmpty(strB);

            // Against a blank string the distance is simply the other string's length.
            if (firstIsBlank)
            {
                return secondIsBlank ? 0 : strB.Length;
            }

            if (secondIsBlank)
            {
                return strA.Length;
            }

            int rowCount = strA.Length;
            int columnCount = strB.Length;

            // table[r, c] holds the distance between the first r chars of strA
            // and the first c chars of strB.
            var table = new int[rowCount + 1, columnCount + 1];

            // Reaching an empty prefix costs one deletion/insertion per character.
            for (int row = 0; row <= rowCount; row++)
            {
                table[row, 0] = row;
            }

            for (int column = 0; column <= columnCount; column++)
            {
                table[0, column] = column;
            }

            for (int row = 1; row <= rowCount; row++)
            {
                char fromA = strA[row - 1];

                for (int column = 1; column <= columnCount; column++)
                {
                    int substitutionCost = fromA == strB[column - 1] ? 0 : 1;

                    int viaDeletion = table[row - 1, column] + 1;
                    int viaInsertion = table[row, column - 1] + 1;
                    int viaSubstitution = table[row - 1, column - 1] + substitutionCost;

                    table[row, column] = Math.Min(viaDeletion, Math.Min(viaInsertion, viaSubstitution));
                }
            }

            // The bottom-right cell covers both full strings.
            return table[rowCount, columnCount];
        }
    }
}

使用系统；
命名空间字符串匹配
{
/// 
///使用获取Levenshtein编辑距离的方法扩展字符串类型的类。
/// 
公共静态类LevenshteinDistanceStringExtension
{
/// 
///获取Levenshtein编辑距离。
/// 
///当前字符串。
///用于确定距离的字符串。
///Levenshtein编辑距离。
public static int getLevenshteInstance（此字符串为strA，字符串为strB）
{
if（string.IsNullOrEmpty（strA）和&string.IsNullOrEmpty（strB））
返回0；
if（string.IsNullOrEmpty（strA））
返回strB.Length；
if（string.IsNullOrEmpty（strB））
返回段长度；
int[，]增量；//矩阵
内伦萨；
国际长度b；
int indexA；
int indexB；
Charchara；
炭炭；
int cost；//成本
//第一步
lengthA=直线长度；
长度b=标准参考长度；
增量=新整数[lengthA+1，lengthB+1]；
//步骤2
对于（indexA=0；indexA……（机器翻译的代码副本在此处被截断）
为什么不：
从当前搜索位置开始，检查待搜索字符串中所有可能长度的子串，并提取所有在词典中找到的单词。例如：
// Collects every dictionary word that occurs anywhere inside testStr, each word once,
// in the order it is first encountered (start index ascending, longest candidate first).
var list = new List<string>{"the", "me", "cat", "at", "theme"};
const string testStr = "themecat";

// Hash sets give constant-time membership tests; the original scanned both lists
// linearly for every candidate substring.
var dictionary = new HashSet<string>(list);
var alreadyFound = new HashSet<string>();

var words = new List<string>();// kept as a list so the discovery order is preserved
var len = testStr.Length;
for (int x = 0; x < len; x++)
{
    // i is the inclusive end index of the candidate; because the loop stops at i > x,
    // single-character substrings are never tested.
    for (int i = len - 1; i > x; i--)
    {
        string test = testStr.Substring(x, i - x + 1);

        // HashSet.Add returns false for a word that was already recorded.
        if (dictionary.Contains(test) && alreadyFound.Add(test))
        {
            words.Add(test);
        }
    }
}

words.ForEach(n=> Console.WriteLine("{0}, ",n));//spit out current values

var list=新列表{“the”、“me”、“cat”、“at”、“theme”}；
常量字符串testStr=“themecat”；
var words=新列表（）；
var len=测试长度；
对于（int x=0；xx；i--）
{
字符串test=testStr.Substring（x，i-x+1）；
if（list.Contains（测试）和&！words.Contains（测试））
{
添加（测试）；
}
}
}
words.ForEach（n=>Console.WriteLine（“{0}，”，n））；//吐出当前值

输出：
// Extracts dictionary words from testStr, keeping only the longest word that starts at
// each index and dropping any word that is merely the tail of a word already stored
// (e.g. "me" inside "theme"). A word that has already been stored is not stored again,
// even if it reappears at a later position.
var list = new List<string>{"the", "me", "cat", "at", "theme", "crying", "them"};
const string testStr = "themecatcryingthem";
const int maxWordLength = 28;//assuming 28 is the maximum length of an english word

var dictionary = new HashSet<string>(list);// constant-time lookups instead of a list scan
var words = new Dictionary<int, string>();// start index -> word found at that index
var len = testStr.Length;
for (int x = 0; x < len; x++)
{
    // Cap the candidate LENGTH at maxWordLength. The original capped the absolute end
    // index at 27, so nothing beyond index 27 was ever scanned in longer inputs.
    int lastEnd = Math.Min(len - 1, x + maxWordLength - 1);

    // Longest candidate first; the loop stops at i > x, so single characters are not tested.
    for (int i = lastEnd; i > x; i--)
    {
        string test = testStr.Substring(x, i - x + 1);
        if (!dictionary.Contains(test) || words.ContainsValue(test))
        {
            continue;
        }

        // The candidate was cut out at x, so x is its start index.
        int key = x;

        // True when the candidate is the tail of a stored word, i.e. it ends exactly
        // where that word ends.
        bool found = false;
        foreach (var w in words)
        {
            if (w.Key != key && w.Value.Contains(test) && key == (w.Key + w.Value.Length - test.Length))
            {
                found = true;
                break;
            }
        }

        // An occupied key means a longer word starting at x was already stored.
        if (!found && !words.ContainsKey(key))
        {
            words.Add(key, test);
        }
    }
}

// Dictionary enumeration order is unspecified, so print by position explicitly.
foreach (var entry in words.OrderBy(w => w.Key))
{
    Console.WriteLine("{0}, ", entry.Value);//spit out current values
}

theme，the，me，cat，at
编辑
现场场景1:
例如，假设你希望在一个连写在一起的句子中总是选出最长的单词：可以从头开始读取，每次先尝试最长的子串，再逐步缩短，直到处理完整个字符串为止。使用字典（Dictionary）会更容易：通过保存已发现单词的起始索引，我们可以在重新判断之前快速检查是否已经保存了包含该单词的更长单词。
示例：
// Extracts dictionary words from testStr, keeping only the longest word that starts at
// each index and dropping any word that is merely the tail of a word already stored
// (e.g. "me" inside "theme"). A word that has already been stored is not stored again,
// even if it reappears at a later position.
var list = new List<string>{"the", "me", "cat", "at", "theme", "crying", "them"};
const string testStr = "themecatcryingthem";
const int maxWordLength = 28;//assuming 28 is the maximum length of an english word

var dictionary = new HashSet<string>(list);// constant-time lookups instead of a list scan
var words = new Dictionary<int, string>();// start index -> word found at that index
var len = testStr.Length;
for (int x = 0; x < len; x++)
{
    // Cap the candidate LENGTH at maxWordLength. The original capped the absolute end
    // index at 27, so nothing beyond index 27 was ever scanned in longer inputs.
    int lastEnd = Math.Min(len - 1, x + maxWordLength - 1);

    // Longest candidate first; the loop stops at i > x, so single characters are not tested.
    for (int i = lastEnd; i > x; i--)
    {
        string test = testStr.Substring(x, i - x + 1);
        if (!dictionary.Contains(test) || words.ContainsValue(test))
        {
            continue;
        }

        // The candidate was cut out at x, so x is its start index.
        int key = x;

        // True when the candidate is the tail of a stored word, i.e. it ends exactly
        // where that word ends.
        bool found = false;
        foreach (var w in words)
        {
            if (w.Key != key && w.Value.Contains(test) && key == (w.Key + w.Value.Length - test.Length))
            {
                found = true;
                break;
            }
        }

        // An occupied key means a longer word starting at x was already stored.
        if (!found && !words.ContainsKey(key))
        {
            words.Add(key, test);
        }
    }
}

// Dictionary enumeration order is unspecified, so print by position explicitly.
foreach (var entry in words.OrderBy(w => w.Key))
{
    Console.WriteLine("{0}, ", entry.Value);//spit out current values
}

var list=新列表{“the”、“me”、“cat”、“at”、“theme”、“哭泣”、“theme”}；
const string testStr=“themecacryingthem”；
var words=新字典（）；
var len=测试长度；
对于（int x=0；x28？28:len；//假设28是英语单词的最大长度
对于（inti=（n-1）；i>x；i--）
{
字符串test=testStr.Substring（x，i-x+1）；
if（列表包含（测试））
{
如果（！words.ContainsValue（测试））
{
bool found=false；//检查是否有从同一索引开始的较短项
var key=testStr.IndexOf（test，x，len-x）；
foreach（大写的var w）
{
if（w.Value.Contains（test）&&w.Key！=Key&&Key==（w.Key+w.Value.Length-test.Length））
{
发现=真；
}
}
如果（！found&&！words.ContainsKey（key））words.Add（key，test）；
}
}
}
}
words.Values.ToList（）.ForEach（n=>Console.WriteLine（“{0}，”，n））；//吐出当前值

输出：
// Extracts dictionary words from testStr, keeping only the longest word that starts at
// each index and dropping any word that is merely the tail of a word already stored
// (e.g. "me" inside "theme"). A word that has already been stored is not stored again,
// even if it reappears at a later position.
var list = new List<string>{"the", "me", "cat", "at", "theme", "crying", "them"};
const string testStr = "themecatcryingthem";
const int maxWordLength = 28;//assuming 28 is the maximum length of an english word

var dictionary = new HashSet<string>(list);// constant-time lookups instead of a list scan
var words = new Dictionary<int, string>();// start index -> word found at that index
var len = testStr.Length;
for (int x = 0; x < len; x++)
{
    // Cap the candidate LENGTH at maxWordLength. The original capped the absolute end
    // index at 27, so nothing beyond index 27 was ever scanned in longer inputs.
    int lastEnd = Math.Min(len - 1, x + maxWordLength - 1);

    // Longest candidate first; the loop stops at i > x, so single characters are not tested.
    for (int i = lastEnd; i > x; i--)
    {
        string test = testStr.Substring(x, i - x + 1);
        if (!dictionary.Contains(test) || words.ContainsValue(test))
        {
            continue;
        }

        // The candidate was cut out at x, so x is its start index.
        int key = x;

        // True when the candidate is the tail of a stored word, i.e. it ends exactly
        // where that word ends.
        bool found = false;
        foreach (var w in words)
        {
            if (w.Key != key && w.Value.Contains(test) && key == (w.Key + w.Value.Length - test.Length))
            {
                found = true;
                break;
            }
        }

        // An occupied key means a longer word starting at x was already stored.
        if (!found && !words.ContainsKey(key))
        {
            words.Add(key, test);
        }
    }
}

// Dictionary enumeration order is unspecified, so print by position explicitly.
foreach (var entry in words.OrderBy(w => w.Key))
{
    Console.WriteLine("{0}, ", entry.Value);//spit out current values
}

theme，cat，crying，them
“cakeday”不是，也永远不会是英文词典里的单词。接招吧，Reddit！ 我猜 `counter` 指的就是 `x`？ 显然 @KonradRudolph 认为 cakeday 是一个词。 是的，对不起，应该是 x。 试了你的代码，但不幸的是它没有给出我希望的结果。我应该补充说明：整个字符串必须全部被匹配，并且使用尽可能少的单词。因此，例如“themecat”应该输出“theme cat”，而不是“the me cat”，因为前者是最短的路径。 事实上，我想我已经在另一个 SO 问题中找到了解决方案，现在正把它转换为 C# :) @chrr：我代码中的第二种方法解决了这个问题。我做过很多关于人工智能和字符串操作的项目，所以这类事情对我来说一点也不新鲜或复杂。通俗地说，使用上述方法，通过迭代列表中的每个单词并删除列表中包含的任何其他字符串，就可以从“themecat”中正确地提取出“theme cat”。如果您愿意，我可以据此更新代码。 已完成，但写得有点匆忙。如果您将其用于大量数据，我希望看到性能结果。您可以查看一些字典集合（原文此处疑似被截断）。