C#:从字符串列表中标记子字符串列表的算法
我有一个话语(字符串)/语料库列表,如下所示(标签:c#, arrays, linq, optimization, substring):
// Sample corpus: the utterances that are scanned for list-entity synonyms.
// Some entries contain more than one entity (e.g. both "c2" and "c1"); others contain none.
List<string> allUtterances = new List<string>
{
"c2's are above the hierarchy than c1's",
"c2's are better than c1's",
"get me a group of 10 c1's",
"he is a c2",
"he was a c two",
"hey i am a c1",
"jdsaxkjhasx",
"khndsmcsdfcs",
"my competency is c2",
"none intent",
"she is still a c 1",
"this is a none intent, please ignore",
"we are hiring fresh c1's"
};
我想获得上述话语的以下输出:
{
"entity": "c2",
"type": "Competency",
"startIndex": 0,
"endIndex": 1,
"resolution": {
"values": [
"C2"
]
}
},
{
"entity": "c1",
"type": "Competency",
"startIndex": 21,
"endIndex": 22,
"resolution": {
"values": [
"C1"
]
}
}
我想符合的规则如下:
对于 allUtterances 列表中的每一句话语,如果话语文本包含 Sublist 类的 list 属性中的某个值,
我想提取该值的开始和结束位置,并用相应的键标记它们(在本例中即 canonicalForm,规范形式),
同时用 ListEntity 类的 name 属性更新 JSON 负载中的 type 键。
我尝试过以下方法:
using System;
using System.Linq;
using System.IO;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
namespace ListEntityProblem
{
    /// <summary>
    /// Demo program that tags list-entity synonyms found inside a set of utterances.
    /// </summary>
    /// <remarks>
    /// NOTE(review): <c>ListEntity</c> and <c>Sublist</c> are not declared in this snippet. They are
    /// assumed to expose <c>name</c>/<c>subLists</c> and <c>canonicalForm</c>/<c>list</c> members,
    /// exactly as they are used in the initializers below.
    /// </remarks>
    class Program
    {
        /// <summary>
        /// Builds the sample data, finds every synonym occurrence in every utterance and groups
        /// the resulting matches by the utterance they were found in.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<string> allUtterances = new List<string>
            {
                "c2's are above the hierarchy than c1's",
                "c2's are better than c1's",
                "get me a group of 10 c1's",
                "he is a c2",
                "he was a c two",
                "hey i am a c1",
                "jdsaxkjhasx",
                "khndsmcsdfcs",
                "my competency is c2",
                "none intent",
                "she is still a c 1",
                "this is a none intent, please ignore",
                "we are hiring fresh c1's"
            };

            List<ListEntity> listEntities = new List<ListEntity>
            {
                new ListEntity
                {
                    name = "Competency",
                    subLists = new List<Sublist>
                    {
                        new Sublist
                        {
                            canonicalForm = "C1",
                            list = new List<string>
                            {
                                "c1",
                                "c one",
                                "c 1",
                                "C 1",
                                "C1",
                                "C one",
                                "C ONE"
                            }
                        },
                        new Sublist
                        {
                            canonicalForm = "C2",
                            list = new List<string>
                            {
                                "c2",
                                "c two",
                                "c 2",
                                "C 2",
                                "C2",
                                "C two",
                                "C TWO"
                            }
                        }
                    }
                }
            };

            // Flatten entity -> sublist -> synonym into (entity name, canonical form, synonym)
            // triples once, so the per-utterance scan below is a plain two-level loop.
            var synonymLookup = new List<Tuple<string, string, string>>();
            foreach (var listEntity in listEntities)
            {
                foreach (var subList in listEntity.subLists)
                {
                    foreach (var synonym in subList.list)
                    {
                        // An empty synonym "matches" at every position and would never advance
                        // the search below, so it is skipped up front.
                        if (string.IsNullOrEmpty(synonym))
                        {
                            continue;
                        }

                        synonymLookup.Add(Tuple.Create(listEntity.name, subList.canonicalForm, synonym));
                    }
                }
            }

            var parsedEntities = new List<JObject>();
            foreach (var utterance in allUtterances)
            {
                foreach (var entry in synonymLookup)
                {
                    string synonym = entry.Item3;

                    // One ordinal IndexOf replaces the former Contains + culture-sensitive IndexOf
                    // pair; the loop reports every non-overlapping occurrence, not only the first.
                    int start = utterance.IndexOf(synonym, StringComparison.Ordinal);
                    while (start >= 0)
                    {
                        parsedEntities.Add(new JObject
                        {
                            new JProperty("Start", start),
                            // Inclusive end index, matching the desired output ("c2" at 0 ends at 1).
                            new JProperty("End", start + synonym.Length - 1),
                            new JProperty("Query", utterance),
                            new JProperty("CanonicalForm", entry.Item2),
                            new JProperty("ListEntity", entry.Item1)
                        });

                        start = utterance.IndexOf(synonym, start + synonym.Length, StringComparison.Ordinal);
                    }
                }
            }

            // Group by the utterance text itself rather than by the JToken wrapping it.
            var groupedParsedEntities = parsedEntities.GroupBy(x => (string)x["Query"]).ToList();
        }
    }
}
但是这种方法对于大量的话语来说似乎很慢,而且扩展性也不是很好。因为主循环运行n^3次。我们的服务器每秒必须进行太多的计算 我一直在思考是否应该使用Regex,它是否能为我提供一些性能优势 请帮我优化这个算法
非常感谢您的帮助。
您是否尝试过使用 LINQ 查询而不是循环?这并不能完全满足您的需要,但我相信它确实提取了相关数据:
// For every utterance, reports the first occurrence of each synonym it contains, together with
// the sublist's canonical form and the owning entity's name. Each synonym is searched exactly
// once per utterance with an ordinal comparison; endIndex is inclusive.
// AsParallel() is used without AsOrdered(), so the order of the results is not guaranteed.
allUtterances
    .AsParallel()
    .SelectMany(utterance => listEntities.SelectMany(l => l.subLists
        .SelectMany(sl => sl.list
            // Search once and keep the position, instead of Contains followed by two IndexOf calls.
            .Select(sle => new {
                matchedValue = sle,
                startindex = utterance.IndexOf(sle, StringComparison.Ordinal)
            })
            .Where(hit => hit.startindex >= 0)
            .Select(hit => new {
                canonicalForm = sl.canonicalForm,
                matchedValue = hit.matchedValue,
                startindex = hit.startindex,
                endindex = hit.startindex + hit.matchedValue.Length - 1
            })
        )
        .Select(o => new {
            // 'entity' is the text that matched; 'resolutionValue' is its canonical form.
            utterance = utterance,
            entity = o.matchedValue,
            type = l.name,
            startIndex = o.startindex,
            endIndex = o.endindex,
            resolutionValue = o.canonicalForm,
        }
        )
        /*
        Alternatively, append this Select to emit JObjects shaped like the question's payload:
        .Select(jo => new JObject {
            new JProperty("Start", jo.startIndex),
            new JProperty("End", jo.endIndex),
            new JProperty("Query", jo.utterance),
            new JProperty("CanonicalForm", jo.resolutionValue),
            new JProperty("ListEntity", jo.type)
        })
        */
    )).ToList();
或者,您可以尝试并行化循环:
// Sketch only: the `....` below is a placeholder for the per-utterance work and must be replaced
// before this compiles. ForAll invokes the delegate once per utterance via PLINQ.
// NOTE(review): if the body adds to a shared collection (e.g. a List<JObject>), that collection
// presumably needs synchronization or a concurrent type -- confirm before using.
allUtterances.AsParallel().ForAll(ut => { .... });
这可能也是一个好问题。绝对不要使用正则表达式:字符串方法比正则表达式高效得多。@jdweng 我主要担心的是循环要运行 n^3 次,负载会很重;另外我也怀疑 .IndexOf() 是否足够健壮、不会出错?正则表达式的性能会差 10 倍。
using System;
using System.Linq;
using System.IO;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
namespace ListEntityProblem
{
    /// <summary>
    /// Demo program that tags list-entity synonyms found inside a set of utterances.
    /// </summary>
    /// <remarks>
    /// NOTE(review): <c>ListEntity</c> and <c>Sublist</c> are not declared in this snippet. They are
    /// assumed to expose <c>name</c>/<c>subLists</c> and <c>canonicalForm</c>/<c>list</c> members,
    /// exactly as they are used in the initializers below.
    /// </remarks>
    class Program
    {
        /// <summary>
        /// Builds the sample data, finds every synonym occurrence in every utterance and groups
        /// the resulting matches by the utterance they were found in.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<string> allUtterances = new List<string>
            {
                "c2's are above the hierarchy than c1's",
                "c2's are better than c1's",
                "get me a group of 10 c1's",
                "he is a c2",
                "he was a c two",
                "hey i am a c1",
                "jdsaxkjhasx",
                "khndsmcsdfcs",
                "my competency is c2",
                "none intent",
                "she is still a c 1",
                "this is a none intent, please ignore",
                "we are hiring fresh c1's"
            };

            List<ListEntity> listEntities = new List<ListEntity>
            {
                new ListEntity
                {
                    name = "Competency",
                    subLists = new List<Sublist>
                    {
                        new Sublist
                        {
                            canonicalForm = "C1",
                            list = new List<string>
                            {
                                "c1",
                                "c one",
                                "c 1",
                                "C 1",
                                "C1",
                                "C one",
                                "C ONE"
                            }
                        },
                        new Sublist
                        {
                            canonicalForm = "C2",
                            list = new List<string>
                            {
                                "c2",
                                "c two",
                                "c 2",
                                "C 2",
                                "C2",
                                "C two",
                                "C TWO"
                            }
                        }
                    }
                }
            };

            // Flatten entity -> sublist -> synonym into (entity name, canonical form, synonym)
            // triples once, so the per-utterance scan below is a plain two-level loop.
            var synonymLookup = new List<Tuple<string, string, string>>();
            foreach (var listEntity in listEntities)
            {
                foreach (var subList in listEntity.subLists)
                {
                    foreach (var synonym in subList.list)
                    {
                        // An empty synonym "matches" at every position and would never advance
                        // the search below, so it is skipped up front.
                        if (string.IsNullOrEmpty(synonym))
                        {
                            continue;
                        }

                        synonymLookup.Add(Tuple.Create(listEntity.name, subList.canonicalForm, synonym));
                    }
                }
            }

            var parsedEntities = new List<JObject>();
            foreach (var utterance in allUtterances)
            {
                foreach (var entry in synonymLookup)
                {
                    string synonym = entry.Item3;

                    // One ordinal IndexOf replaces the former Contains + culture-sensitive IndexOf
                    // pair; the loop reports every non-overlapping occurrence, not only the first.
                    int start = utterance.IndexOf(synonym, StringComparison.Ordinal);
                    while (start >= 0)
                    {
                        parsedEntities.Add(new JObject
                        {
                            new JProperty("Start", start),
                            // Inclusive end index, matching the desired output ("c2" at 0 ends at 1).
                            new JProperty("End", start + synonym.Length - 1),
                            new JProperty("Query", utterance),
                            new JProperty("CanonicalForm", entry.Item2),
                            new JProperty("ListEntity", entry.Item1)
                        });

                        start = utterance.IndexOf(synonym, start + synonym.Length, StringComparison.Ordinal);
                    }
                }
            }

            // Group by the utterance text itself rather than by the JToken wrapping it.
            var groupedParsedEntities = parsedEntities.GroupBy(x => (string)x["Query"]).ToList();
        }
    }
}
// Tags every occurrence of every synonym in every utterance and appends one JObject per match
// to parsedEntities. allUtterances, listEntities and parsedEntities come from the enclosing scope.
foreach (var item in allUtterances)
{
    foreach (var listEntity in listEntities)
    {
        foreach (var canonicalForm in listEntity.subLists)
        {
            foreach (var synonym in canonicalForm.list)
            {
                // An empty synonym "matches" everywhere and would never advance the search below.
                if (string.IsNullOrEmpty(synonym))
                {
                    continue;
                }

                // Ordinal search: the synonyms are literal tokens, not culture-dependent text.
                int start = item.IndexOf(synonym, StringComparison.Ordinal);
                while (start != -1)
                {
                    parsedEntities.Add(new JObject
                    {
                        new JProperty("Start", start),
                        // Inclusive end index, matching the desired output ("c2" at 0 ends at 1).
                        new JProperty("End", start + synonym.Length - 1),
                        new JProperty("Query", item),
                        new JProperty("CanonicalForm", canonicalForm.canonicalForm),
                        new JProperty("ListEntity", listEntity.name)
                    });

                    // Resume after the current match so repeated occurrences are reported too.
                    start = item.IndexOf(synonym, start + synonym.Length, StringComparison.Ordinal);
                }
            }
        }
    }
}
// For every utterance, reports the first occurrence of each synonym it contains, together with
// the sublist's canonical form and the owning entity's name. Each synonym is searched exactly
// once per utterance with an ordinal comparison; endIndex is inclusive.
// AsParallel() is used without AsOrdered(), so the order of the results is not guaranteed.
allUtterances
    .AsParallel()
    .SelectMany(utterance => listEntities.SelectMany(l => l.subLists
        .SelectMany(sl => sl.list
            // Search once and keep the position, instead of Contains followed by two IndexOf calls.
            .Select(sle => new {
                matchedValue = sle,
                startindex = utterance.IndexOf(sle, StringComparison.Ordinal)
            })
            .Where(hit => hit.startindex >= 0)
            .Select(hit => new {
                canonicalForm = sl.canonicalForm,
                matchedValue = hit.matchedValue,
                startindex = hit.startindex,
                endindex = hit.startindex + hit.matchedValue.Length - 1
            })
        )
        .Select(o => new {
            // 'entity' is the text that matched; 'resolutionValue' is its canonical form.
            utterance = utterance,
            entity = o.matchedValue,
            type = l.name,
            startIndex = o.startindex,
            endIndex = o.endindex,
            resolutionValue = o.canonicalForm,
        }
        )
        /*
        Alternatively, append this Select to emit JObjects shaped like the question's payload:
        .Select(jo => new JObject {
            new JProperty("Start", jo.startIndex),
            new JProperty("End", jo.endIndex),
            new JProperty("Query", jo.utterance),
            new JProperty("CanonicalForm", jo.resolutionValue),
            new JProperty("ListEntity", jo.type)
        })
        */
    )).ToList();
// Sketch only: the `....` below is a placeholder for the per-utterance work and must be replaced
// before this compiles. ForAll invokes the delegate once per utterance via PLINQ.
// NOTE(review): if the body adds to a shared collection (e.g. a List<JObject>), that collection
// presumably needs synchronization or a concurrent type -- confirm before using.
allUtterances.AsParallel().ForAll(ut => { .... });