性能理念（内存中的C#哈希集和包含太慢）_C#_Performance_Hashtable_Contains_Hashset

性能理念（内存中的C#哈希集和包含太慢）

c# performance

性能理念（内存中的C#哈希集和包含太慢）,c#,performance,hashtable,contains,hashset,C#,Performance,Hashtable,Contains,Hashset,我有以下代码 private void LoadIntoMemory() { //Init large HashSet HashSet<document> hsAllDocuments = new HashSet<document>(); //Get first rows from database List<document> docsList = document.GetAllAboveDocID(0, 500000);

我有以下代码

/// <summary>
/// Fetches documents via <c>document.GetAllAboveDocID(0, 500000)</c>, collects them
/// into a <see cref="HashSet{T}"/> and stores that set in <c>Application</c> under
/// the key "dicAllDocuments".
/// </summary>
private void LoadIntoMemory()
{
    // Rows fetched from the data source
    List<document> fetched = document.GetAllAboveDocID(0, 500000);

    // The set drops duplicates according to document's equality semantics
    HashSet<document> allDocuments = new HashSet<document>();
    fetched.ForEach(doc => allDocuments.Add(doc));

    // NOTE: the key says "dic", but the stored value is a HashSet, not a dictionary
    Application["dicAllDocuments"] = allDocuments;
}

/// <summary>
/// Scans <paramref name="hsAllDocuments"/> and adds every document whose Headline or
/// Description contains the query to <paramref name="hsRawHit"/>.
/// </summary>
/// <param name="hsRawHit">Set that receives the hits; it is mutated and returned.</param>
/// <param name="hsAllDocuments">All documents to scan.</param>
/// <param name="query">The full query text, matched with <c>string.Contains</c>.</param>
/// <param name="queryArray">
/// The query split into words. The individual words are only tried when there is
/// more than one of them; a null array is treated as "no words".
/// </param>
/// <returns>The same <paramref name="hsRawHit"/> instance.</returns>
/// <remarks>
/// At most 1000 documents are newly added per call. Each document is tested only
/// until its first match, and the scan stops as soon as the limit is reached.
/// </remarks>
private HashSet<document> documentHits(HashSet<document> hsRawHit, HashSet<document> hsAllDocuments, string query, string[] queryArray)
{
    const int maxCount = 1000;
    int counter = 0;

    // Word-by-word matching only applies to multi-word queries.
    bool matchSingleWords = queryArray != null && queryArray.Length > 1;

    foreach (document d in hsAllDocuments)
    {
        // Stop the whole scan once the limit is reached, not just an inner loop.
        if (counter >= maxCount)
        {
            break;
        }

        // Full query against headline, then description (short-circuits on first match).
        bool isHit = d.Headline.Contains(query) || d.Description.Contains(query);

        // Fall back to the individual query words only if the full query did not match.
        if (!isHit && matchSingleWords)
        {
            foreach (string q in queryArray)
            {
                if (d.Headline.Contains(q) || d.Description.Contains(q))
                {
                    isHit = true;
                    break;
                }
            }
        }

        // Add returns false for a document already in the set; only count new hits
        // so duplicates do not consume the result budget.
        if (isHit && hsRawHit.Add(d))
        {
            counter++;
        }
    }

    return hsRawHit;
}

private void LoadIntoMemory()
{
    //初始化大哈希集
    HashSet<document> hsAllDocuments = new HashSet<document>();
    //从数据库中获取第一批行
    List<document> docsList = document.GetAllAboveDocID(0, 500000);
    //将对象加载到哈希集中
    foreach (document d in docsList)
    {
        hsAllDocuments.Add(d);
    }
    Application["dicAllDocuments"] = hsAllDocuments;
}
private HashSet<document> documentHits(HashSet<document> hsRawHit, HashSet<document> hsAllDocuments, string query, string[] queryArray)
{
    int counter = 0;
    const int maxCount = 1000;
    foreach (document d in hsAllDocuments)
    {
        //标题
        if (d.Headline.Contains(query))
        {
            if (counter >= maxCount)
                break;
            hsRawHit.Add(d);
            counter++;
        }
        //描述
        if (d.Description.Contains(query))
        {
            if (counter >= maxCount)
                break;
            hsRawHit.Add(d);
            counter++;
        }
        //逐词拆分查询
        //string[] queryArray = query.Split(' ');
        if (queryArray.Count() > 1)
        {
            foreach (string q in queryArray)
            {
                if (d.Headline.Contains(q))
                {
                    if (counter >= maxCount)
                        break;
                    hsRawHit.Add(d);
                    counter++;
                }
                //描述
                if (d.Description.Contains(q))
                {
                    if (counter >= maxCount)
                        break;
                    hsRawHit.Add(d);
                    counter++;
                }
            }
        }
    }
    return hsRawHit;
}

首先，我把所有数据加载到一个 HashSet 中（保存在 Application 里供以后使用）——这一步工作正常，即使运行较慢，对我的用途来说也完全可以接受。

代码运行在 C# / .NET Framework 4.0 上（无法升级到带有 async 功能的更新版本）。

documentHits方法在我当前的设置中运行得相当慢——考虑到它都在内存中。我能做些什么来加速这种方法

示例将非常棒-谢谢。

如果您在一开始构建数据结构时有充足的时间，可以考虑使用 Trie（前缀树）。

Trie将使字符串搜索更快

中有一些解释和实现

另一个实现：

您是在线性地遍历所有文档来查找匹配项——这是 O(n)。如果把问题反过来解决，就能做得更好，这与全文索引的工作原理类似：从查询词出发，预先处理出与每个查询词匹配的文档集合。由于这可能会变得很复杂，我建议直接使用具有全文检索功能的数据库，这会比您现在的方法快得多。

另外，您误用了 HashSet——直接使用 List 即可，并且不要产生重复项——在

documentHits（）

中产生匹配的各个分支应该是互斥的。

我看到您使用的是

哈希集，但您没有使用它的任何优点，所以您应该使用列表
真正耗时的是遍历所有文档并在字符串中查找子字符串，因此您应该尽量减少这类操作的次数
一种可能性是设置文档的索引，其中包含哪些字符对。如果字符串query
包含Hello
，则您将查找包含He
、el
、ll
和lo
的文档
您可以设置一个 Dictionary<string, List<int>>
，其中字典的键是字符组合，列表中保存的是包含该组合的文档在文档列表中的索引。当然，建立这个字典需要一些时间，但您可以把重点放在不太常见的字符组合上。如果某个字符组合出现在 80% 的文档中，用它来排除文档几乎没有用处；但如果它只出现在 2% 的文档中，就能省去 98% 的工作
如果您按顺序遍历列表中的文档，并把每次出现的文档索引添加到字典中对应的列表里，这些索引列表自然就是有序的，因此之后很容易对它们求交集。在向列表添加索引的过程中，如果某个列表变得太大，对排除文档已经没有帮助，就可以将其丢弃。这样您只需保留较短的列表，不会占用太多内存
编辑：
我整理了一个小例子：
/// <summary>
/// Speeds up substring searches over a list of items by indexing which items
/// contain which two-character pairs. A query is first narrowed down to the items
/// that contain all of its indexed pairs, and only those candidates are checked
/// with <c>string.Contains</c>.
/// </summary>
/// <typeparam name="T">Type of the items being searched.</typeparam>
public class IndexElliminator<T> {

  // The list passed to the constructor (stored by reference, not copied).
  private List<T> _items;
  // Pair -> ascending positions in _items of the items containing that pair.
  // A null value marks a pair whose list grew past the limit and was discarded.
  private Dictionary<string, List<int>> _index;
  // Extracts the searchable text from an item.
  private Func<T, string> _getContent;

  /// <summary>
  /// Returns the distinct two-character substrings of <paramref name="value"/>.
  /// The set is empty when the value is null or shorter than two characters.
  /// </summary>
  private static HashSet<string> GetPairs(string value) {
    HashSet<string> pairs = new HashSet<string>();
    if (value == null) {
      return pairs;
    }
    for (int i = 1; i < value.Length; i++) {
      pairs.Add(value.Substring(i - 1, 2));
    }
    return pairs;
  }

  /// <summary>
  /// Builds the pair index for <paramref name="items"/>.
  /// </summary>
  /// <param name="items">Items to search; the list is referenced, not copied.</param>
  /// <param name="getContent">Returns the text of an item that queries are matched against.</param>
  /// <param name="maxIndexSize">
  /// When a pair's list already holds this many entries and another item contains
  /// the pair, the list is discarded: such a common pair is not used for narrowing.
  /// </param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="items"/> or <paramref name="getContent"/> is null.
  /// </exception>
  public IndexElliminator(List<T> items, Func<T, string> getContent, int maxIndexSize) {
    if (items == null) {
      throw new ArgumentNullException("items");
    }
    if (getContent == null) {
      throw new ArgumentNullException("getContent");
    }
    _items = items;
    _getContent = getContent;
    _index = new Dictionary<string, List<int>>();
    for (int index = 0; index < _items.Count; index++) {
      T item = _items[index];
      // GetPairs returns distinct pairs, so each position is added at most once per
      // pair, and positions are appended in ascending order.
      foreach (string pair in GetPairs(_getContent(item))) {
        List<int> list;
        if (_index.TryGetValue(pair, out list)) {
          if (list != null) {
            if (list.Count == maxIndexSize) {
              // Too common to be useful; keep the key so Find knows the pair exists.
              _index[pair] = null;
            } else {
              list.Add(index);
            }
          }
        } else {
          list = new List<int>();
          list.Add(index);
          _index.Add(pair, list);
        }
      }
    }
  }

  /// <summary>
  /// Intersects two ascending lists of positions with a single merge pass.
  /// </summary>
  private static List<int> JoinLists(List<int> list1, List<int> list2) {
    List<int> result = new List<int>();
    int i1 = 0, i2 = 0;
    while (i1 < list1.Count && i2 < list2.Count) {
      int a = list1[i1];
      int b = list2[i2];
      if (a == b) {
        result.Add(a);
        i1++;
        i2++;
      } else if (a < b) {
        i1++;
      } else {
        i2++;
      }
    }
    return result;
  }

  /// <summary>
  /// Returns the items whose content contains <paramref name="query"/>, in list order.
  /// </summary>
  /// <param name="query">
  /// Text to search for. Queries shorter than two characters have no pairs to look
  /// up, so every item is checked directly.
  /// </param>
  /// <returns>A new list with the matching items; empty when nothing matches.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
  public List<T> Find(string query) {
    if (query == null) {
      throw new ArgumentNullException("query");
    }
    List<T> result = new List<T>();
    List<List<int>> indexes = new List<List<int>>();
    foreach (string pair in GetPairs(query)) {
      List<int> list;
      if (!_index.TryGetValue(pair, out list)) {
        // No indexed item contains this pair, so none can contain the whole query.
        return result;
      }
      if (list != null) {
        indexes.Add(list);
      }
    }

    IEnumerable<int> candidates;
    if (indexes.Count == 0) {
      // Either the query has no pairs (fewer than two characters) or all of its
      // pairs were discarded as too common: every item has to be checked.
      candidates = Enumerable.Range(0, _items.Count);
    } else {
      // Start from the shortest list so intermediate intersections stay small.
      indexes.Sort((x, y) => x.Count.CompareTo(y.Count));
      List<int> joined = indexes[0];
      for (int i = 1; i < indexes.Count && joined.Count > 0; i++) {
        joined = JoinLists(joined, indexes[i]);
      }
      candidates = joined;
    }

    // Sharing all pairs does not imply containing the query; verify each candidate.
    foreach (int index in candidates) {
      string content = _getContent(_items[index]);
      if (content != null && content.Contains(query)) {
        result.Add(_items[index]);
      }
    }
    return result;
  }

}

您不应该针对所有测试步骤测试每个文档
相反，您应该在第一次成功的测试结果之后转到下一个文档
hsRawHit.Add(d);
counter++;

您应该在 counter++ 之后添加 continue
hsRawHit.Add(d);
counter++;
continue;

您的性能分析器（profiler）显示哪一部分最慢？就从那里入手。到底有多慢？“足够快”的目标是多少？耗时可能并不随文档数量线性增长。既然您只是遍历内容，为什么还要费事使用 HashSet？他使用 HashSet 是为了防止重复，但这是错误的做法。
hsRawHit.Add(d);
counter++;

hsRawHit.Add(d);
counter++;
continue;