C# 根据用户查询对结果进行排序的简单评分算法
我正在开发一个非常基础的网络搜索引擎,它由几个部分组成。根据用户查询检索到结果之后,我想为每个结果计算一个评分(rating),然后按该评分对结果进行排序。标签:c#, linq-to-sql, .net-4.0, search-engine。我的查询如下:
// Stage 1: pair every URL with its document and with the text of the words
// recorded as hits for that document. No filtering happens at this stage.
var tmpQuery = (from urls in _context.Urls
                join documents in _context.Documents
                    on urls.UrlId equals documents.DocumentId
                let words = (from word in _context.Words
                             join hits in _context.Hits
                                 on word.WordId equals hits.WordId
                             where hits.DocumentId == documents.DocumentId
                             select word.Text)
                select new { urls, documents, words });

// Stage 2: AsEnumerable() switches to LINQ to Objects, so the keyword filter,
// the rating and the ordering below all run in memory over every row that
// stage 1 produces.
// NOTE(review): the three-argument Contains(...) overloads are project
// extension methods not shown here; their matching rules should be confirmed.
// NOTE(review): pushing a coarse pre-filter into stage 1 (before
// AsEnumerable()) would reduce how much data is fetched and then discarded.
var results = (from r in tmpQuery.AsEnumerable()
               where r.urls.ResolvedPath.Contains(breakedQuery, KeywordParts.Url, part) ||
                     r.documents.Title.Contains(breakedQuery, KeywordParts.Title, part) ||
                     r.documents.Keywords.Contains(breakedQuery, KeywordParts.Keywords, part) ||
                     r.documents.Description.Contains(breakedQuery, KeywordParts.Description, part) ||
                     r.words.Contains(breakedQuery, KeywordParts.Content, part)
               select new SearchResult()
               {
                   UrlId = r.urls.UrlId,
                   Url = r.urls.ResolvedPath,
                   IndexedOn = r.documents.IndexedOn,
                   Title = r.documents.Title,
                   Description = r.documents.Description,
                   Host = new Uri(r.urls.ResolvedPath).Host,
                   Length = r.documents.Length,
                   Rate = CalculateRating(breakedQuery, r.urls.ResolvedPath, r.documents.Title, r.documents.Keywords, r.documents.Description, r.words)
               }).AsEnumerable()
              // Highest-rated results first; duplicates (as defined by
              // SearchResultEqualityComparer) are removed after ordering.
              .OrderByDescending(result => result.Rate)
              .Distinct(new SearchResultEqualityComparer());
其中,评分通过以下方法计算:
/// <summary>
/// Computes a relevance score for one search result against the broken-up user query.
/// </summary>
/// <param name="breakedQuery">Individual query terms; a term starting with '-' is an exclusion term.</param>
/// <param name="resolvedPath">URL of the result (weight 100 per match).</param>
/// <param name="title">Document title (weight 50 per match).</param>
/// <param name="keywords">Document keywords (weight 25 per match).</param>
/// <param name="description">Document description (weight 10 per match).</param>
/// <param name="words">Words of the document content (weight 1 per match).</param>
/// <returns>
/// The sum of the weighted match counts of all regular terms minus the weighted
/// match counts of all exclusion terms; 0 when <paramref name="breakedQuery"/> is null.
/// </returns>
private int CalculateRating(IEnumerable<string> breakedQuery, string resolvedPath, string title, string keywords, string description, IEnumerable<string> words)
{
    if (breakedQuery == null)
    {
        return 0;
    }

    // Materialize the content words once: the sequence is scanned for every
    // query term, and a lazily evaluated source would be re-run each time.
    IEnumerable<string> contentWords = words == null ? null : words.ToList();

    var baseRate = 0;
    foreach (var query in breakedQuery)
    {
        if (string.IsNullOrEmpty(query))
        {
            continue;
        }

        // The user's raw query (e.g. "Microsoft -Apple") has already been broken
        // into terms ("Microsoft", "-Apple"); a leading '-' marks a term that
        // results should not contain.
        var isExcluded = query[0] == '-';

        // Strip only the leading marker so hyphens inside a term ("e-mail") survive.
        var term = query.TrimStart('-');
        if (term.Length == 0)
        {
            continue;
        }

        var pathCount = Calculate(resolvedPath, term);
        var titleCount = Calculate(title, term);
        var keywordsCount = Calculate(keywords, term);
        var descriptionCount = Calculate(description, term);
        var wordsCount = Calculate(contentWords, term);

        var result = (pathCount * 100) + (titleCount * 50) + (keywordsCount * 25) + (descriptionCount * 10) + wordsCount;

        if (isExcluded)
        {
            baseRate -= result;
        }
        else
        {
            baseRate += result;
        }
    }

    return baseRate;
}
/// <summary>
/// Counts the matches of <paramref name="query"/> among the space-separated
/// tokens of <paramref name="source"/>.
/// </summary>
/// <param name="source">Text to tokenize on the space character.</param>
/// <param name="query">Term to look for.</param>
/// <returns>
/// The count produced by the sequence overload for the tokens, or 0 when
/// <paramref name="source"/> is null, empty or whitespace only.
/// </returns>
private int Calculate(string source, string query)
{
    // Nothing to count in a missing or blank text.
    if (string.IsNullOrWhiteSpace(source))
    {
        return 0;
    }

    IEnumerable<string> tokens = source.Split(' ');
    return Calculate(tokens, query);
}
/// <summary>
/// Scores how often <paramref name="query"/> occurs in <paramref name="sources"/>.
/// </summary>
/// <param name="sources">Tokens to inspect; may be null or empty.</param>
/// <param name="query">Term to look for.</param>
/// <returns>
/// One point per exact (case-sensitive) occurrence plus half a point, rounded
/// down on the total, per occurrence that matches only when case is ignored.
/// </returns>
private int Calculate(IEnumerable<string> sources, string query)
{
    if (sources == null)
    {
        return 0;
    }

    var exactMatches = 0;
    var caseInsensitiveMatches = 0;

    // Single pass: every occurrence is counted. A set-based difference would
    // collapse repeated tokens and undercount case-insensitive matches.
    foreach (var source in sources)
    {
        if (string.Equals(source, query, StringComparison.Ordinal))
        {
            exactMatches++;
        }
        else if (string.Equals(source, query, StringComparison.OrdinalIgnoreCase))
        {
            caseInsensitiveMatches++;
        }
    }

    // A case-insensitive match is worth half of an exact one (integer division).
    return exactMatches + (caseInsensitiveMatches / 2);
}
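private int CalculateRating(IEnumerable<string> breakedQuery, string resolvedPath, string title, string keywords, string description, IEnumerable<string> words)
{
var baseRate = 0;
foreach (var query in breakedQuery)
{
/*首先,将用户的原始查询(Microsoft -Apple)拆分为子查询列表
(Microsoft、-Apple);如果某个子查询以 - 开头,则表示
结果中不应包含该词*/
var none = (query.StartsWith("-"));
string term = query.Replace("-", "");
var pathCount = Calculate(resolvedPath, term);
var titleCount = Calculate(title, term);
var keywordsCount = Calculate(keywords, term);
var descriptionCount = Calculate(description, term);
var wordsCount = Calculate(words, term);
var result = (pathCount * 100) + (titleCount * 50) + (keywordsCount * 25) + (descriptionCount * 10) + (wordsCount);
if (none)
baseRate -= result;
else
baseRate += result;
}
return baseRate;
}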
private int Calculate(string source, string query)
{
if (!string.IsNullOrWhiteSpace(source))
return Calculate(source.Split(' ').AsEnumerable<string>(), query);
return 0;
}
private int Calculate(IEnumerable<string> sources, string query)
{
var count = 0;
if (sources != null && sources.Count() > 0)
{
//比较两个字符串
//第一步:区分大小写
var elements = sources.Where(source => source == query);
count += elements.Count();
//第二步:不区分大小写(得分为区分大小写匹配的一半)
count += sources.Except(elements).Where(source => source.ToLowerInvariant() == query.ToLowerInvariant()).Count() / 2;
}
return count;
}
请指导我如何提高性能(我的搜索引擎速度非常慢)。
回答:我猜问题出在你的 `from urls in _context.Urls` 上——这里没有 where 条件,你会取回大量数据,然后在构建结果时又把其中大部分丢弃。tmpQuery/results 中有多少项?
评论:是的,真正的筛选实际上是在第二个查询中执行的。请看这里,了解我为什么使用这种写法。我的测试数据中有 1176 个页面、57283 个 URL、35733 个单词和 330621 条命中记录(hits,保存单词与文档之间的关系)。
评论:我认为你最好把尽可能多的这类工作放到存储过程中完成。