C#：从字符串列表中标记子字符串列表的算法_C#_Arrays_Linq_Optimization_Substring

C#：从字符串列表中标记子字符串列表的算法

c# arrays linq optimization

C# 在C语言中，从字符串列表中标记子字符串列表的算法#,c#,arrays,linq,optimization,substring,C#,Arrays,Linq,Optimization,Substring,我有一个语句（字符串）/语料库列表，如下所示 List<string> allUtterances = new List<string> { "c2's are above the hierarchy than c1's", "c2's are better than c1's", "get me a group of 10 c1's", "he is a c2", "he was a c two", "hey i

我有一个语句（字符串）/语料库列表，如下所示

// Sample corpus: the utterances that have to be tagged.
List<string> allUtterances = new List<string>
{
    "c2's are above the hierarchy than c1's",
    "c2's are better than c1's",
    "get me a group of 10 c1's",
    "he is a c2",
    "he was a c two",
    "hey i am a c1",
    "jdsaxkjhasx",
    "khndsmcsdfcs",
    "my competency is c2",
    "none intent",
    "she is still a c 1",
    "this is a none intent, please ignore",
    "we are hiring fresh c1's",
};

我想获得上述话语的以下输出：

{
      "entity": "c2",
      "type": "Competency",
      "startIndex": 0,
      "endIndex": 1,
      "resolution": {
        "values": [
          "C2"
        ]
      }
},
{
      "entity": "c1",
      "type": "Competency",
      "startIndex": 21,
      "endIndex": 22,
      "resolution": {
        "values": [
          "C1"
        ]
      }
}

我想符合的规则如下：

对于 allUtterances 列表中的每一条话语，如果话语文本包含 Sublist 类的 list 属性中的某个值，我想提取其开始和结束位置，并用相应的键（在本例中是 canonicalForm）标记它们，同时用 ListEntity 类的 name 属性更新我的 JSON 负载中的 type 键。

我尝试过以下方法：

using System;
using System.Linq;
using System.IO;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;

namespace ListEntityProblem
{
    /// <summary>
    /// Sample program that tags every utterance with the list-entity synonyms it contains.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the sample utterances and list entities, tags each utterance and groups
        /// the resulting matches by the utterance they were found in.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<string> allUtterances = new List<string>
            {
                "c2's are above the hierarchy than c1's",
                "c2's are better than c1's",
                "get me a group of 10 c1's",
                "he is a c2",
                "he was a c two",
                "hey i am a c1",
                "jdsaxkjhasx",
                "khndsmcsdfcs",
                "my competency is c2",
                "none intent",
                "she is still a c 1",
                "this is a none intent, please ignore",
                "we are hiring fresh c1's"
            };

            List<ListEntity> listEntities = new List<ListEntity>
            {
                new ListEntity
                {
                    name = "Competency",
                    subLists = new List<Sublist>
                    {
                        new Sublist
                        {
                            canonicalForm = "C1",
                            list = new List<string>
                            {
                                "c1",
                                "c one",
                                "c 1",
                                "C 1",
                                "C1",
                                "C one",
                                "C ONE"
                            }
                        },
                        new Sublist
                        {
                            canonicalForm = "C2",
                            list = new List<string>
                            {
                                "c2",
                                "c two",
                                "c 2",
                                "C 2",
                                "C2",
                                "C two",
                                "C TWO"
                            }
                        }
                    }
                }
            };

            List<Tuple<string, string, List<string>>> listEntityLookup = BuildLookup(listEntities);
            List<JObject> parsedEntities = TagUtterances(allUtterances, listEntityLookup);

            // Group by query
            var groupedParsedEntities = parsedEntities.GroupBy(x => x["Query"]).ToList();
        }

        /// <summary>
        /// Flattens the list entities into (entity name, canonical form, synonyms) tuples,
        /// one tuple per sub-list, in declaration order.
        /// </summary>
        /// <param name="listEntities">The list entities to flatten.</param>
        /// <returns>One lookup tuple per sub-list of every list entity.</returns>
        private static List<Tuple<string, string, List<string>>> BuildLookup(IEnumerable<ListEntity> listEntities)
        {
            var lookup = new List<Tuple<string, string, List<string>>>();

            foreach (var listEntity in listEntities)
            {
                foreach (var subList in listEntity.subLists)
                {
                    lookup.Add(Tuple.Create<string, string, List<string>>(
                        listEntity.name, subList.canonicalForm, subList.list));
                }
            }

            return lookup;
        }

        /// <summary>
        /// Finds, for every utterance, the first occurrence of each synonym and records it
        /// as a JSON object with the keys Start, End, Query, CanonicalForm and ListEntity.
        /// </summary>
        /// <param name="utterances">The texts to search.</param>
        /// <param name="lookup">Tuples of (entity name, canonical form, synonyms).</param>
        /// <returns>One object per (utterance, synonym) pair that matched.</returns>
        private static List<JObject> TagUtterances(
            IEnumerable<string> utterances,
            List<Tuple<string, string, List<string>>> lookup)
        {
            var parsed = new List<JObject>();

            foreach (var utterance in utterances)
            {
                foreach (var entry in lookup)
                {
                    foreach (var synonym in entry.Item3)
                    {
                        // A single ordinal scan replaces the former Contains + IndexOf pair.
                        // Contains(string) is ordinal while IndexOf(string) is culture-sensitive,
                        // so the two could disagree and leave a start index of -1.
                        int start = utterance.IndexOf(synonym, StringComparison.Ordinal);
                        if (start < 0)
                        {
                            continue;
                        }

                        parsed.Add(new JObject
                        {
                            new JProperty("Start", start),
                            // End is exclusive: the index just past the last matched character.
                            new JProperty("End", start + synonym.Length),
                            new JProperty("Query", utterance),
                            new JProperty("CanonicalForm", entry.Item2),
                            new JProperty("ListEntity", entry.Item1)
                        });
                    }
                }
            }

            return parsed;
        }
    }
}

但是这种方法对于大量的话语来说似乎很慢，而且扩展性也不是很好。因为主循环运行n^3次。我们的服务器每秒必须进行太多的计算

我一直在思考是否应该使用Regex，它是否能为我提供一些性能优势

请帮我优化这个算法

非常感谢您的帮助。

您是否尝试过使用 LINQ 查询而不是循环？

这并不能完全满足您的需要，但我相信它确实提取了相关数据：

  // Tags every utterance in parallel: for each synonym of each sub-list, emits one record
  // holding the matched text, its first position and the owning canonical form / entity name.
  allUtterances
    .AsParallel()
    .SelectMany(utterance => listEntities.SelectMany(l => l.subLists
                                .SelectMany(sl => sl.list
                                    // One ordinal scan per synonym; -1 means "not present".
                                    // This replaces the Any/Contains pre-filter plus two
                                    // culture-sensitive IndexOf calls.
                                    .Select(sle => new { synonym = sle, start = utterance.IndexOf(sle, StringComparison.Ordinal) })
                                    .Where(m => m.start >= 0)
                                    .Select(m => new {
                                        // not sure if 'entity' and 'resolutionValue' are swapped around
                                        utterance = utterance,
                                        entity = m.synonym,
                                        type = l.name,
                                        startIndex = m.start,
                                        // inclusive index of the last matched character
                                        endIndex = m.start + m.synonym.Length - 1,
                                        resolutionValue = sl.canonicalForm,
                                    })
                                )
                            /*
                            or append a Select to create the JObjects:
                            .Select(jo => new JObject {
                                new JProperty("Start", jo.startIndex),
                                new JProperty("End", jo.endIndex),
                                new JProperty("Query", jo.utterance),
                                new JProperty("CanonicalForm", jo.resolutionValue),
                                new JProperty("ListEntity", jo.type)
                            })
                            */
                )).ToList();

或者，您可以尝试并行化循环：

// Parallel variant of the per-utterance loop; "...." is a placeholder for the loop body.
// NOTE(review): if that body appends to a shared List<T>, the Add calls need a lock or a
// concurrent collection, because List<T> is not safe for concurrent writes. Confirm.
allUtterances.AsParallel().ForAll(ut => {  .... });

对于绝对不使用正则表达式的用户来说，这可能也是一个好问题。字符串方法比正则表达式要有效得多。@jdweng我主要关心的是循环运行n^3次，它的负载会很重，我怀疑

.IndexOf()

是否健壮/不会出错？正则表达式的性能会差10倍。

using System;
using System.Linq;
using System.IO;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;

namespace ListEntityProblem
{
    /// <summary>
    /// Sample program that tags every utterance with the list-entity synonyms it contains.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the sample utterances and list entities, tags each utterance and groups
        /// the resulting matches by the utterance they were found in.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<string> allUtterances = new List<string>
            {
                "c2's are above the hierarchy than c1's",
                "c2's are better than c1's",
                "get me a group of 10 c1's",
                "he is a c2",
                "he was a c two",
                "hey i am a c1",
                "jdsaxkjhasx",
                "khndsmcsdfcs",
                "my competency is c2",
                "none intent",
                "she is still a c 1",
                "this is a none intent, please ignore",
                "we are hiring fresh c1's"
            };

            List<ListEntity> listEntities = new List<ListEntity>
            {
                new ListEntity
                {
                    name = "Competency",
                    subLists = new List<Sublist>
                    {
                        new Sublist
                        {
                            canonicalForm = "C1",
                            list = new List<string>
                            {
                                "c1",
                                "c one",
                                "c 1",
                                "C 1",
                                "C1",
                                "C one",
                                "C ONE"
                            }
                        },
                        new Sublist
                        {
                            canonicalForm = "C2",
                            list = new List<string>
                            {
                                "c2",
                                "c two",
                                "c 2",
                                "C 2",
                                "C2",
                                "C two",
                                "C TWO"
                            }
                        }
                    }
                }
            };

            List<Tuple<string, string, List<string>>> listEntityLookup = BuildLookup(listEntities);
            List<JObject> parsedEntities = TagUtterances(allUtterances, listEntityLookup);

            // Group by query
            var groupedParsedEntities = parsedEntities.GroupBy(x => x["Query"]).ToList();
        }

        /// <summary>
        /// Flattens the list entities into (entity name, canonical form, synonyms) tuples,
        /// one tuple per sub-list, in declaration order.
        /// </summary>
        /// <param name="listEntities">The list entities to flatten.</param>
        /// <returns>One lookup tuple per sub-list of every list entity.</returns>
        private static List<Tuple<string, string, List<string>>> BuildLookup(IEnumerable<ListEntity> listEntities)
        {
            var lookup = new List<Tuple<string, string, List<string>>>();

            foreach (var listEntity in listEntities)
            {
                foreach (var subList in listEntity.subLists)
                {
                    lookup.Add(Tuple.Create<string, string, List<string>>(
                        listEntity.name, subList.canonicalForm, subList.list));
                }
            }

            return lookup;
        }

        /// <summary>
        /// Finds, for every utterance, the first occurrence of each synonym and records it
        /// as a JSON object with the keys Start, End, Query, CanonicalForm and ListEntity.
        /// </summary>
        /// <param name="utterances">The texts to search.</param>
        /// <param name="lookup">Tuples of (entity name, canonical form, synonyms).</param>
        /// <returns>One object per (utterance, synonym) pair that matched.</returns>
        private static List<JObject> TagUtterances(
            IEnumerable<string> utterances,
            List<Tuple<string, string, List<string>>> lookup)
        {
            var parsed = new List<JObject>();

            foreach (var utterance in utterances)
            {
                foreach (var entry in lookup)
                {
                    foreach (var synonym in entry.Item3)
                    {
                        // A single ordinal scan replaces the former Contains + IndexOf pair.
                        // Contains(string) is ordinal while IndexOf(string) is culture-sensitive,
                        // so the two could disagree and leave a start index of -1.
                        int start = utterance.IndexOf(synonym, StringComparison.Ordinal);
                        if (start < 0)
                        {
                            continue;
                        }

                        parsed.Add(new JObject
                        {
                            new JProperty("Start", start),
                            // End is exclusive: the index just past the last matched character.
                            new JProperty("End", start + synonym.Length),
                            new JProperty("Query", utterance),
                            new JProperty("CanonicalForm", entry.Item2),
                            new JProperty("ListEntity", entry.Item1)
                        });
                    }
                }
            }

            return parsed;
        }
    }
}

            // For every utterance, probe each synonym of each sub-list once and record the
            // first occurrence together with its canonical form and list-entity name.
            foreach (var item in allUtterances)
            {
                foreach (var listEntity in listEntities)
                {
                    foreach (var subList in listEntity.subLists)
                    {
                        foreach (var synonym in subList.list)
                        {
                            // Ordinal search: IndexOf(string) alone is culture-sensitive, which
                            // is slower and can disagree with the ordinal Contains it replaces.
                            int start = item.IndexOf(synonym, StringComparison.Ordinal);
                            if (start != -1)
                            {
                                parsedEntities.Add(new JObject
                                {
                                    new JProperty("Start", start),
                                    // End is exclusive: the index just past the last matched character.
                                    new JProperty("End", start + synonym.Length),
                                    new JProperty("Query", item),
                                    new JProperty("CanonicalForm", subList.canonicalForm),
                                    new JProperty("ListEntity", listEntity.name)
                                });
                            }
                        }
                    }
                }
            }

  // Tags every utterance in parallel: for each synonym of each sub-list, emits one record
  // holding the matched text, its first position and the owning canonical form / entity name.
  allUtterances
    .AsParallel()
    .SelectMany(utterance => listEntities.SelectMany(l => l.subLists
                                .SelectMany(sl => sl.list
                                    // One ordinal scan per synonym; -1 means "not present".
                                    // This replaces the Any/Contains pre-filter plus two
                                    // culture-sensitive IndexOf calls.
                                    .Select(sle => new { synonym = sle, start = utterance.IndexOf(sle, StringComparison.Ordinal) })
                                    .Where(m => m.start >= 0)
                                    .Select(m => new {
                                        // not sure if 'entity' and 'resolutionValue' are swapped around
                                        utterance = utterance,
                                        entity = m.synonym,
                                        type = l.name,
                                        startIndex = m.start,
                                        // inclusive index of the last matched character
                                        endIndex = m.start + m.synonym.Length - 1,
                                        resolutionValue = sl.canonicalForm,
                                    })
                                )
                            /*
                            or append a Select to create the JObjects:
                            .Select(jo => new JObject {
                                new JProperty("Start", jo.startIndex),
                                new JProperty("End", jo.endIndex),
                                new JProperty("Query", jo.utterance),
                                new JProperty("CanonicalForm", jo.resolutionValue),
                                new JProperty("ListEntity", jo.type)
                            })
                            */
                )).ToList();

// Parallel variant of the per-utterance loop; "...." is a placeholder for the loop body.
// NOTE(review): if that body appends to a shared List<T>, the Add calls need a lock or a
// concurrent collection, because List<T> is not safe for concurrent writes. Confirm.
allUtterances.AsParallel().ForAll(ut => {  .... });