C# 查找.Intersect.Count>；门槛？_C#_.net_Linq

C# 判断 .Intersect.Count > 阈值？

c# .net linq

C# 查找.Intersect.Count>；门槛？,c#,.net,linq,C#,.net,Linq,我有两个集合，我想确定相交元素的数量是否高于某个阈值我目前使用此代码（执行约8500万次，因此速度很重要）：这让我觉得可能效率低下，因为必须首先计算numberOfSharedPoints 是否有更优化的方法，例如，当达到阈值时，使用中断快捷方式迭代元素奖金问题：第一行代码的this.pointsA.Intersect（pointsB.Count（）速度会更快吗集合当前是列表-哈希集会更快吗要确定交叉点的项目数是否大于或等于阈值，可以使用以下结构： if (pointsA.Inter

我有两个集合，我想确定相交元素的数量是否高于某个阈值

我目前使用此代码（执行约8500万次，因此速度很重要）：

这让我觉得可能效率低下，因为必须首先计算

numberOfSharedPoints

是否有更优化的方法，例如，当达到阈值时，使用

break

快捷方式迭代元素

附加问题：

第一行代码的

this.pointsA.Intersect(pointsB).Count()

速度会更快吗

集合当前是

列表

-哈希集会更快吗

要确定交叉点的项目数是否大于或等于

阈值

，可以使用以下结构：

// True when the intersection yields at least THRESHOLD elements: after skipping
// THRESHOLD - 1 of them, Any() only needs to find one more.
bool reachesThreshold = pointsA.Intersect(pointsB)
                               .Skip(THRESHOLD - 1)
                               .Any();
if (reachesThreshold)
{
    //...
}

正如在另一个答案下的评论中所指出的，我们将只完全列举第二个序列。因此，该解决方案的复杂性似乎是

O（n+m）

，其中 n 和 m

分别是

pointsA

和

pointsB

集合中的项数

O（m）

是构建

HashSet

的成本，因此我假设这种结构是在内部使用的。检查一个元素是否在散列集中是常量时间操作（如注释中所指出的），在最坏的情况下（例如：当交集为空时，需要检查所有元素），检查次数最多为 n。

此外，如果您有固定时间的具体集合

Count

，您可以尝试以下优化，如果它们的大小可能存在显著差异：

// Decide which collection is the shorter one so it can be passed as the second
// argument of Intersect. Makes sense only if Count() is constant time.
bool aIsLonger = pointsA.Count() > pointsB.Count();
var shorter = aIsLonger ? pointsB : pointsA;
var longer = aIsLonger ? pointsA : pointsB;

bool reachesThreshold = longer.Intersect(shorter).Skip(THRESHOLD - 1).Any();
if (reachesThreshold)
{
    //...
}

试试这个：

// Keep only the elements of pointsA that pointsB also contains, then check
// whether a THRESHOLD-th such element exists.
var sharedPoints = pointsA.Where(pointsB.Contains);
if (sharedPoints.Skip(THRESHOLD - 1).Any())
{
    //...
}

如果速度很重要，请使用以下方法

// Copy pointsA into a set, reduce it in place to the elements that also occur
// in pointsB, and report how many remain.
var shared = new HashSet<T>(pointsA);
shared.IntersectWith(pointsB);
return shared.Count;

HashSet<T> hash = new HashSet<T>(pointsA);
hash.IntersectWith(pointsB);
return hash.Count;

如果可以使用具体的集合，则不应该在性能关键的情况下使用LINQ

或者，首先尝试以集合的形式获取元素。

我创建了一个示例，以了解这里给出的每个答案的性能，包括传统的

foreach

循环：

在我的示例控制台应用程序中，我为

pointsA

和

pointsB

生成了10000个随机浮点数。阈值计数为100，检查每个方法的性能，代码如下：

/// <summary>
/// Benchmark driver. On each of maxIteration runs it fills pointsA and pointsB with
/// 10000 random doubles each, calls every TestUsing* method once, prints the elapsed
/// seconds each method reports, and clears both lists. After the last run it prints
/// the average time per method.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    int maxIteration = 1000;
    double sumIntersectAndSkip = 0;
    double sumHashSet = 0;
    double sumForEach = 0;
    double sumWhereAndSkip = 0;
    double sumCount = 0;

    for (int run = 0; run < maxIteration; run++)
    {
        // Fresh random input for this run.
        Random rng = new Random();
        for (int n = 0; n < 10000; n++)
        {
            pointsA.Add(rng.NextDouble());
        }

        for (int n = 0; n < 10000; n++)
        {
            pointsB.Add(rng.NextDouble());
        }

        // The shared stopwatch is restarted before each method; the method itself
        // reads s.Elapsed right before it returns.
        s.Restart();
        double elapsed = TestUsingIntersectAndSkip();
        s.Stop();
        Console.WriteLine("IntersectAndSkip: " + elapsed);
        sumIntersectAndSkip += elapsed;

        s.Restart();
        elapsed = TestUsingHashSet();
        s.Stop();
        Console.WriteLine("HashSet: " + elapsed);
        sumHashSet += elapsed;

        s.Restart();
        elapsed = TestUsingForEach();
        s.Stop();
        Console.WriteLine("ForEach: " + elapsed);
        sumForEach += elapsed;

        s.Restart();
        elapsed = TestUsingWhereAndSkip();
        s.Stop();
        Console.WriteLine("WhereAndSkip: " + elapsed);
        sumWhereAndSkip += elapsed;

        s.Restart();
        elapsed = TestUsingCount();
        s.Stop();
        Console.WriteLine("Count: " + elapsed);
        sumCount += elapsed;

        Console.WriteLine("-------------------------------------------------------------------------------");

        // Empty both inputs so the next run starts from scratch.
        pointsA.Clear();
        pointsB.Clear();
    }

    Console.WriteLine("Following is Average TimeSpent by each method: "+Environment.NewLine);
    Console.WriteLine("IntersectAndSkip: " + sumIntersectAndSkip / maxIteration);
    Console.WriteLine("HashSet: " + sumHashSet / maxIteration);
    Console.WriteLine("ForEach: " + sumForEach / maxIteration);
    Console.WriteLine("WhereAndSkip: " + sumWhereAndSkip / maxIteration);
    Console.WriteLine("Count: " + sumCount / maxIteration);
    Console.WriteLine("-------------------------------------------------------------------------------");
}
// Shared stopwatch: Main resets and starts it before each test, and every
// TestUsing* method reads s.Elapsed just before returning.
static Stopwatch s = new Stopwatch();
// Number of shared points the TestUsing* methods check for.
const int THRESHOLD = 100;
// Input collections: Main fills each with random doubles at the start of a run
// and clears them at the end of it.
static List<Double> pointsA = new List<double>();
static List<Double> pointsB = new List<double>();

/// <summary>
/// Builds a HashSet from pointsA, intersects it in place with pointsB and compares
/// the resulting count against THRESHOLD.
/// </summary>
/// <returns>Seconds elapsed on the shared stopwatch once the check has completed.</returns>
private static double TestUsingHashSet()
{
    var shared = new HashSet<double>(pointsA);
    shared.IntersectWith(pointsB);

    // Both outcomes report the same thing: the elapsed time at this point.
    if (shared.Count < THRESHOLD)
    {
        return s.Elapsed.TotalSeconds;
    }

    return s.Elapsed.TotalSeconds;
}

/// <summary>
/// Filters pointsA by membership in pointsB and checks whether a THRESHOLD-th
/// match exists by skipping THRESHOLD - 1 matches and calling Any().
/// </summary>
/// <returns>Seconds elapsed on the shared stopwatch once the check has completed.</returns>
private static double TestUsingWhereAndSkip()
{
    bool reachedThreshold = pointsA.Where(pointsB.Contains)
                                   .Skip(THRESHOLD - 1)
                                   .Any();

    // Both outcomes report the same thing: the elapsed time at this point.
    if (!reachedThreshold)
    {
        return s.Elapsed.TotalSeconds;
    }

    return s.Elapsed.TotalSeconds;
}

/// <summary>
/// Counts every element of pointsA that pointsB contains, then compares that
/// total against THRESHOLD.
/// </summary>
/// <returns>Seconds elapsed on the shared stopwatch once the check has completed.</returns>
private static double TestUsingCount()
{
    int sharedTotal = pointsA.Count(pointsB.Contains);

    // Both outcomes report the same thing: the elapsed time at this point.
    if (sharedTotal < THRESHOLD)
    {
        return s.Elapsed.TotalSeconds;
    }

    return s.Elapsed.TotalSeconds;
}

/// <summary>
/// Walks pointsA with a plain foreach, counting the elements that pointsB contains,
/// and stops as soon as the count reaches THRESHOLD.
/// </summary>
/// <returns>
/// Seconds elapsed on the shared stopwatch, read either at the moment the threshold
/// is reached or after all of pointsA has been examined.
/// </returns>
private static double TestUsingForEach()
{
    var intersectItemCount = 0;
    foreach (var d in pointsA)
    {
        if (!pointsB.Contains(d))
        {
            continue;
        }

        intersectItemCount++;

        // Use >= so the early exit happens at the same point as the ">= THRESHOLD"
        // checks in the other TestUsing* methods; "> THRESHOLD" searched for one
        // extra match and made this variant do more work than the others.
        if (intersectItemCount >= THRESHOLD)
        {
            return s.Elapsed.TotalSeconds;
        }
    }
    return s.Elapsed.TotalSeconds;
}

/// <summary>
/// Checks whether the intersection of pointsA and pointsB has a THRESHOLD-th
/// element by skipping THRESHOLD - 1 results and calling Any().
/// </summary>
/// <returns>Seconds elapsed on the shared stopwatch once the check has completed.</returns>
private static double TestUsingIntersectAndSkip()
{
    bool reachedThreshold = pointsA.Intersect(pointsB)
                                   .Skip(THRESHOLD - 1)
                                   .Any();

    // Both outcomes report the same thing: the elapsed time at this point.
    if (!reachedThreshold)
    {
        return s.Elapsed.TotalSeconds;
    }

    return s.Elapsed.TotalSeconds;
}

将项目计数从10000更改为50000（5次运行）时，除了

HashSet

和 IntersectWithSkip 之外，其他所有方法都花费了太多时间。性能排名几乎保持不变：

我认为你可以通过排序的IEnumerable得到O（n），因为你只需要通过一次
这是整数，但您可以使用泛型并传递比较器
在我的测试中，下面的比较比ss1.Intersect（ss2）.Skip（thresHold1-1）.Any（）快5:1
是的，快5倍

使用两个HashSet和一个ForEach包含，您还可以得到O（n）compare，但创建HashSet的成本更高

/// <summary>
/// Times CompareSS against the LINQ Intersect/Skip/Any check on two sorted lists
/// of <c>size</c> integers, where the second list is the first shifted up by size / 2.
/// Each approach is run size + 1 times against two thresholds (size / 4 and
/// size * 3 / 4); the elapsed milliseconds and the number of successful checks are
/// written to the debug output.
/// </summary>
public static void TimeTest()
{
    int size = 20000;
    int offset = (Int32)((float)size / 2);
    var ss1 = new List<int>(size);
    var ss2 = new List<int>(size);
    for (int value = 0; value < size; value++)
    {
        ss1.Add(value);
        ss2.Add(value + offset);
    }

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();
    int thresHold1 = (Int32)((float)size / 4);
    int thresHold2 = (Int32)((float)size * 3 / 4);

    // Round 1: merge-style comparison.
    int matchcount = 0;
    for (int pass = 0; pass <= size; pass++)
    {
        if (CompareSS(ss1, ss2, thresHold1))
        {
            matchcount++;
        }
        if (CompareSS(ss1, ss2, thresHold2))
        {
            matchcount++;
        }
    }
    System.Diagnostics.Debug.WriteLine("sw.ms {0}   count {1}", sw.ElapsedMilliseconds.ToString("N0"), matchcount.ToString("N0"));

    // Round 2: LINQ Intersect + Skip + Any, timed from zero again.
    sw.Restart();
    matchcount = 0;
    for (int pass = 0; pass <= size; pass++)
    {
        if (ss1.Intersect(ss2).Skip(thresHold1 - 1).Any())
        {
            matchcount++;
        }
        if (ss1.Intersect(ss2).Skip(thresHold2 - 1).Any())
        {
            matchcount++;
        }
    }
    System.Diagnostics.Debug.WriteLine("sw.ms {0}   count {1}", sw.ElapsedMilliseconds.ToString("N0"), matchcount.ToString("N0"));
    sw.Stop();
}
/// <summary>
/// Determines whether two integer sequences share at least
/// <paramref name="threshold"/> values, walking both sequences once in step.
/// </summary>
/// <param name="ss1">First sequence. Must be sorted in ascending order for the result to be meaningful.</param>
/// <param name="ss2">Second sequence. Must be sorted in ascending order for the result to be meaningful.</param>
/// <param name="threshold">Number of shared values required.</param>
/// <returns>
/// <c>true</c> as soon as <paramref name="threshold"/> matches have been found, or
/// immediately when <paramref name="threshold"/> is zero or negative;
/// <c>false</c> if either sequence is exhausted first.
/// </returns>
/// <remarks>
/// A match consumes one element from each sequence, so repeated values are
/// counted once per matching pair.
/// </remarks>
public static bool CompareSS (IEnumerable<Int32> ss1, IEnumerable<Int32> ss2, Int32 threshold) 
{
    // "At least zero (or fewer) shared values" holds for any input, including empty
    // or non-overlapping sequences, which the merge loop below would report as false.
    if (threshold <= 0)
    {
        return true;
    }

    using (var cursor1 = ss1.GetEnumerator())
    using (var cursor2 = ss2.GetEnumerator())
    {
        if (!cursor1.MoveNext() || !cursor2.MoveNext())
        {
            return false;
        }

        int count = 0;
        while (true)
        {
            int comparison = cursor1.Current.CompareTo(cursor2.Current);
            if (comparison == 0)
            {
                count++;
                if (count >= threshold)
                {
                    return true;
                }
                // Step past the matched pair on both sides.
                if (!cursor1.MoveNext() || !cursor2.MoveNext())
                {
                    return false;
                }
            }
            else if (comparison < 0)
            {
                // ss1 is behind: advance it to catch up.
                if (!cursor1.MoveNext())
                {
                    return false;
                }
            }
            else
            {
                // ss2 is behind: advance it to catch up.
                if (!cursor2.MoveNext())
                {
                    return false;
                }
            }
        }
    }
}

publicstaticvoidtimetest（）
{
int size=20000；
Listss1=新列表（大小）；
Listss2=新列表（大小）；
对于（int i=0；i

哪个序列更短，pointsA 还是 pointsB

？这很重要，因为join总是缓冲正确的序列。任何一个序列都可能较短，这是不同的，在设计时是未知的。很好的一个。您可能应该注意到，

pointsA

的计算是惰性的，而

pointsB

的类型应该是

IList

（最好使用

HashSet

）。最后一点已经是正确的，但是如果它是longer@IlyaIvanov否，

Intersect

仅完全读取第二个序列。第一个序列仍然是惰性评估的。“您不应该在性能关键的情况下使用LINQ。”这是为什么？因为它的性能很差。这是因为它通常涉及许多冗余操作，因为必须重复大量工作才能获得，例如，ITERSetting项的数量。真的吗？有证据证明吗，或者只是你的怀疑？如果在不了解其工作原理的情况下，将其用于错误的“为什么”中，它的性能肯定会很差。否则，我没有注意到使用LINQ而不是

foreach

会有任何明显的惩罚。在这种情况下，您可能是对的，但您的陈述通常是错误的。那么懒惰地计算序列呢？尝试手动操作，你的头就会开始沸腾（我的当然会）。从1000000项的序列中只读取300项可能会极大地提高性能。@BartoszKP有时使用LINQ不会受到惩罚。一个很好的例子是如何实现

Count（）

方法。它检查基础集合是否实现了

IList

，如果实现了，则返回其

Count

属性（假定为

O（1）

）。但是，一般来说，LINQ不保证性能。这种优化在任何地方都没有文档记录，当然也不适用于

1) Intersect with Skip
2) HashSet
3) Count (Given by OP)
4) Where and Skip
5) Foreach

/// <summary>
/// Times CompareSS against the LINQ Intersect/Skip/Any check on two sorted lists
/// of <c>size</c> integers, where the second list is the first shifted up by size / 2.
/// Each approach is run size + 1 times against two thresholds (size / 4 and
/// size * 3 / 4); the elapsed milliseconds and the number of successful checks are
/// written to the debug output.
/// </summary>
public static void TimeTest()
{
    int size = 20000;
    int offset = (Int32)((float)size / 2);
    var ss1 = new List<int>(size);
    var ss2 = new List<int>(size);
    for (int value = 0; value < size; value++)
    {
        ss1.Add(value);
        ss2.Add(value + offset);
    }

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();
    int thresHold1 = (Int32)((float)size / 4);
    int thresHold2 = (Int32)((float)size * 3 / 4);

    // Round 1: merge-style comparison.
    int matchcount = 0;
    for (int pass = 0; pass <= size; pass++)
    {
        if (CompareSS(ss1, ss2, thresHold1))
        {
            matchcount++;
        }
        if (CompareSS(ss1, ss2, thresHold2))
        {
            matchcount++;
        }
    }
    System.Diagnostics.Debug.WriteLine("sw.ms {0}   count {1}", sw.ElapsedMilliseconds.ToString("N0"), matchcount.ToString("N0"));

    // Round 2: LINQ Intersect + Skip + Any, timed from zero again.
    sw.Restart();
    matchcount = 0;
    for (int pass = 0; pass <= size; pass++)
    {
        if (ss1.Intersect(ss2).Skip(thresHold1 - 1).Any())
        {
            matchcount++;
        }
        if (ss1.Intersect(ss2).Skip(thresHold2 - 1).Any())
        {
            matchcount++;
        }
    }
    System.Diagnostics.Debug.WriteLine("sw.ms {0}   count {1}", sw.ElapsedMilliseconds.ToString("N0"), matchcount.ToString("N0"));
    sw.Stop();
}
/// <summary>
/// Determines whether two integer sequences share at least
/// <paramref name="threshold"/> values, walking both sequences once in step.
/// </summary>
/// <param name="ss1">First sequence. Must be sorted in ascending order for the result to be meaningful.</param>
/// <param name="ss2">Second sequence. Must be sorted in ascending order for the result to be meaningful.</param>
/// <param name="threshold">Number of shared values required.</param>
/// <returns>
/// <c>true</c> as soon as <paramref name="threshold"/> matches have been found, or
/// immediately when <paramref name="threshold"/> is zero or negative;
/// <c>false</c> if either sequence is exhausted first.
/// </returns>
/// <remarks>
/// A match consumes one element from each sequence, so repeated values are
/// counted once per matching pair.
/// </remarks>
public static bool CompareSS (IEnumerable<Int32> ss1, IEnumerable<Int32> ss2, Int32 threshold) 
{
    // "At least zero (or fewer) shared values" holds for any input, including empty
    // or non-overlapping sequences, which the merge loop below would report as false.
    if (threshold <= 0)
    {
        return true;
    }

    using (var cursor1 = ss1.GetEnumerator())
    using (var cursor2 = ss2.GetEnumerator())
    {
        if (!cursor1.MoveNext() || !cursor2.MoveNext())
        {
            return false;
        }

        int count = 0;
        while (true)
        {
            int comparison = cursor1.Current.CompareTo(cursor2.Current);
            if (comparison == 0)
            {
                count++;
                if (count >= threshold)
                {
                    return true;
                }
                // Step past the matched pair on both sides.
                if (!cursor1.MoveNext() || !cursor2.MoveNext())
                {
                    return false;
                }
            }
            else if (comparison < 0)
            {
                // ss1 is behind: advance it to catch up.
                if (!cursor1.MoveNext())
                {
                    return false;
                }
            }
            else
            {
                // ss2 is behind: advance it to catch up.
                if (!cursor2.MoveNext())
                {
                    return false;
                }
            }
        }
    }
}