C# 使用Vector<T>运行速度比经典循环慢_C#_.net_Vector_Vectorization_Benchmarking

C# 使用Vector<T>运行速度比经典循环慢

c# .net vector

C# 使用Vector<T>运行速度比经典循环慢,c#,.net,vector,vectorization,benchmarking,C#,.net,Vector,Vectorization,Benchmarking,我看过几篇文章，描述了Vector是如何启用SIMD的，并且是如何使用JIT内部函数实现的，这样编译器就可以正确地输出AVX/SSE/。。。使用它时的指令，允许比经典的线性循环更快的代码（示例）我决定尝试重写一个方法，我必须看看我是否能够获得一些加速，但到目前为止我失败了，矢量化代码的运行速度比原来的慢了3倍，我不太清楚为什么。下面是两个版本的方法，用于检查两个Span实例是否所有项对都位于同一位置，且相对于阈值共享同一位置 // Classic implementation public s

我看过几篇文章，描述了

Vector<T>

是如何启用 SIMD 的，以及它是如何通过 JIT 内部函数（intrinsics）实现的：使用它时，编译器能够正确地生成 AVX/SSE 等指令，从而得到比经典线性循环更快的代码（示例）。

我决定尝试重写自己已有的一个方法，看看能否获得一些加速，但到目前为止我失败了：矢量化代码的运行速度比原来慢了3倍，我不太清楚为什么。下面是该方法的两个版本，用于检查两个

Span<float>

实例中，每一对位于相同索引处的元素是否都处于阈值的同一侧（即同时大于阈值，或同时不大于阈值）。

// Classic implementation
/// <summary>
/// Checks whether every pair of elements at the same index in <paramref name="x1"/> and
/// <paramref name="x2"/> lies on the same side of <paramref name="threshold"/>.
/// </summary>
/// <param name="x1">First sequence of values.</param>
/// <param name="x2">Second sequence of values; must have the same length as <paramref name="x1"/>.</param>
/// <param name="threshold">Value each element is compared against using a strict <c>&gt;</c>.</param>
/// <returns>
/// <c>true</c> when, for every index, both elements are greater than the threshold or neither is;
/// <c>false</c> as soon as one pair disagrees.
/// </returns>
/// <exception cref="ArgumentException">The two spans differ in length.</exception>
public static unsafe bool MatchElementwiseThreshold(this Span<float> x1, Span<float> x2, float threshold)
{
    // px2 is indexed up to x1.Length through a raw pointer (no bounds checks),
    // so a shorter x2 would be read past its end.
    if (x1.Length != x2.Length)
    {
        throw new ArgumentException("x1.Length != x2.Length");
    }

    fixed (float* px1 = &x1.DangerousGetPinnableReference(), px2 = &x2.DangerousGetPinnableReference())
    {
        for (int i = 0; i < x1.Length; i++)
        {
            if ((px1[i] > threshold) != (px2[i] > threshold))
            {
                return false;
            }
        }
    }

    return true;
}

// Vectorized
/// <summary>
/// <see cref="Vector{T}"/>-based variant of the threshold check: tests whether every pair of
/// elements at the same index in <paramref name="x1"/> and <paramref name="x2"/> lies on the
/// same side of <paramref name="threshold"/>.
/// </summary>
/// <param name="x1">First sequence of values.</param>
/// <param name="x2">Second sequence of values; must have the same length as <paramref name="x1"/>.</param>
/// <param name="threshold">Value each element is compared against using a strict greater-than.</param>
/// <returns>
/// <c>true</c> when, for every index, both elements are greater than the threshold or neither is;
/// otherwise <c>false</c>.
/// </returns>
/// <exception cref="ArgumentException">The two spans differ in length.</exception>
public static unsafe bool MatchElementwiseThresholdSIMD(this Span<float> x1, Span<float> x2, float threshold)
{
    // Both spans are read through raw pointers for x1.Length elements,
    // so a shorter x2 would be read past its end.
    if (x1.Length != x2.Length)
    {
        throw new ArgumentException("x1.Length != x2.Length");
    }

    int l = Vector<float>.Count;

    // Test vector holding the threshold in every lane.
    Vector<float> cmp = new Vector<float>(threshold);

    fixed (float* px1 = &x1.DangerousGetPinnableReference(), px2 = &x2.DangerousGetPinnableReference())
    {
        // Index of the first element that does not fit into a whole vector.
        int tailStart = x1.Length - x1.Length % l;

        // Iterate in chunks of l elements.
        for (int offset = 0; offset < tailStart; offset += l)
        {
            Vector<float>
                v1cmp = Vector.GreaterThan<float>(Unsafe.Read<Vector<float>>(px1 + offset), cmp),
                v2cmp = Vector.GreaterThan<float>(Unsafe.Read<Vector<float>>(px2 + offset), cmp);

            // View the two comparison masks as raw floats; a lane that reads as 0 is
            // treated as "not greater than the threshold".
            // NOTE(review): this lane-by-lane inspection is scalar work for every vector;
            // comparing the two masks as whole vectors would likely be cheaper - confirm
            // with a benchmark.
            float*
                pcmp1 = (float*)Unsafe.AsPointer(ref v1cmp),
                pcmp2 = (float*)Unsafe.AsPointer(ref v2cmp);
            for (int j = 0; j < l; j++)
            {
                if ((pcmp1[j] == 0) != (pcmp2[j] == 0))
                {
                    return false;
                }
            }
        }

        // Test the remaining items, if any, with the scalar comparison.
        for (int i = tailStart; i < x1.Length; i++)
        {
            if ((px1[i] > threshold) != (px2[i] > threshold))
            {
                return false;
            }
        }
    }

    return true;
}

//经典实现
公共静态不安全布尔MatchElementwiseThreshold（此Span x1、Span x2、浮点阈值）
{
已修复（float*px1=&x1.DangerousGetPinnableReference（），px2=&x2.DangerousGetPinnableReference（））
对于（int i=0；i阈值！=px2[i]>阈值）
返回false；
返回true；
}
//矢量化
公共静态不安全布尔匹配元素WiseThresholdSIMD（此跨度x1、跨度x2、浮点阈值）
{
//设置测试向量
int l=向量计数；
浮动*arr=stackalloc浮动[l]；
对于（int i=0；i阈值！=px2[i]>阈值）
返回false；
}
返回true；
}

正如我所说，我已经使用BenchmarkDotNet测试了这两个版本，而使用

Vector<T>

的版本运行起来大约比另一个版本慢3倍。我用不同长度的 Span（从大约100到超过2000）运行了测试，但矢量化的方法始终比另一种方法慢得多。

我是不是漏掉了什么明显的东西？

谢谢

EDIT:我之所以使用不安全代码，并试图在不并行化的情况下尽可能优化此代码，是因为此方法已在

Parallel.For

迭代中调用

另外，拥有在多个线程上并行代码的能力通常不是让单个并行任务不优化的好理由。

**EDIT**

/// <summary>
/// Tests whether every pair of elements at the same index in <paramref name="x1"/> and
/// <paramref name="x2"/> lies on the same side of <paramref name="threshold"/>, comparing
/// whole vectors at a time when hardware acceleration is available.
/// </summary>
/// <param name="x1">First sequence of values.</param>
/// <param name="x2">Second sequence of values; must have the same length as <paramref name="x1"/>.</param>
/// <param name="threshold">Value each element is compared against using a strict greater-than.</param>
/// <returns><c>true</c> when no pair disagrees; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentException">The two spans differ in length.</exception>
public static bool MatchElementwiseThresholdSIMD(ReadOnlySpan<float> x1, ReadOnlySpan<float> x2, float threshold)
{
    if (x1.Length != x2.Length)
    {
        throw new ArgumentException("x1.Length != x2.Length");
    }

    if (Vector.IsHardwareAccelerated)
    {
        // View both inputs as runs of whole Vector<float> blocks.
        var blocks1 = x1.NonPortableCast<float, Vector<float>>();
        var blocks2 = x2.NonPortableCast<float, Vector<float>>();
        var limit = new Vector<float>(threshold);

        int block = 0;
        while (block < blocks1.Length)
        {
            var aboveInFirst = Vector.GreaterThan(blocks1[block], limit);
            var aboveInSecond = Vector.GreaterThan(blocks2[block], limit);

            // Any non-zero lane in the XOR means the two masks disagree somewhere.
            var disagreement = Vector.Xor(aboveInFirst, aboveInSecond);
            if (disagreement != Vector<int>.Zero)
            {
                return false;
            }

            ++block;
        }

        // Drop the elements already covered by whole blocks; the scalar loop handles the rest.
        x1 = x1.Slice(Vector<float>.Count * blocks1.Length);
        x2 = x2.Slice(Vector<float>.Count * blocks2.Length);
    }

    for (int k = 0; k < x1.Length; k++)
    {
        bool firstAbove = x1[k] > threshold;
        bool secondAbove = x2[k] > threshold;
        if (firstAbove != secondAbove)
        {
            return false;
        }
    }

    return true;
}

公共静态布尔匹配元素WiseThresholdSIMD（只读范围x1、只读范围x2、浮点阈值）
{
如果（x1.Length！=x2.Length）抛出新的ArgumentException（“x1.Length！=x2.Length”）；
if（向量IsHardwareAccelerated）
{
var vx1=x1.NonPortableCast（）；
var vx2=x2.NonPortableCast（）；
var vthreshold=新向量（阈值）；
对于（int i=0；i阈值！=x2[i]>阈值）
返回false；
返回true；
}

现在，这还不如直接使用数组（如果你手头有数组的话）那么快，但仍然比非 SIMD 版本快得多。

（另一个编辑…）

……只是为了好玩，我想看看这段代码在写成完全泛型的形式时表现如何，答案是：表现非常好。因此，你可以像下面这样编写代码，它与针对特定类型的版本同样高效（非硬件加速的情况除外——此时它的耗时略低于特定版本的两倍，但也不算太糟……）

public static bool MatchElementwiseThreshold（只读span x1，只读span x2，T threshold）
其中T:struct
{
如果（x1.长度！=x2.长度）
抛出新的ArgumentException（“x1.Length！=x2.Length”）；
if（向量IsHardwareAccelerated）
{
var vx1=x1.NonPortableCast（）；
var vx2=x2.NonPortableCast（）；
var vthreshold=新向量（阈值）；
对于（int i=0；i0）！=（比较器比较（x2[i]，阈值）>0））
返回false；
返回true；
}

/// <summary>
/// Generic form of the threshold check: tests whether every pair of elements at the same index
/// in <paramref name="x1"/> and <paramref name="x2"/> lies on the same side of
/// <paramref name="threshold"/>.
/// </summary>
/// <typeparam name="T">Element type of the spans.</typeparam>
/// <param name="x1">First sequence of values.</param>
/// <param name="x2">Second sequence of values; must have the same length as <paramref name="x1"/>.</param>
/// <param name="threshold">Value each element is compared against ("greater than").</param>
/// <returns><c>true</c> when no pair disagrees; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentException">The two spans differ in length.</exception>
public static bool MatchElementwiseThreshold<T>(ReadOnlySpan<T> x1, ReadOnlySpan<T> x2, T threshold)
    where T : struct
{
    if (x1.Length != x2.Length)
        throw new ArgumentException("x1.Length != x2.Length");

    if (Vector.IsHardwareAccelerated)
    {
        var vx1 = x1.NonPortableCast<T, Vector<T>>();
        var vx2 = x2.NonPortableCast<T, Vector<T>>();

        var vthreshold = new Vector<T>(threshold);
        for (int i = 0; i < vx1.Length; ++i)
        {
            var v1cmp = Vector.GreaterThan(vx1[i], vthreshold);
            var v2cmp = Vector.GreaterThan(vx2[i], vthreshold);

            // Any non-zero bit in the XOR means the two comparison masks disagree.
            if (Vector.AsVectorInt32(Vector.Xor(v1cmp, v2cmp)) != Vector<int>.Zero)
                return false;
        }

        // Slice off what the vector loop covered so the scalar loop handles only the remaining elements.
        x1 = x1.Slice(Vector<T>.Count * vx1.Length);
        x2 = x2.Slice(Vector<T>.Count * vx2.Length);
    }

    var comparer = System.Collections.Generic.Comparer<T>.Default;
    for (int i = 0; i < x1.Length; i++)
        if ((comparer.Compare(x1[i], threshold) > 0) != (comparer.Compare(x2[i], threshold) > 0))
            return false;

    return true;
}