C# 优化位的重新排列

C# 优化位的重新排列,c#,optimization,bit-manipulation,C#,Optimization,Bit Manipulation,我有一个核心C#函数,我正试图加速它。涉及安全或不安全代码的建议同样受欢迎。方法如下: public byte[] Interleave(uint[] vector) { var byteVector = new byte[BytesNeeded + 1]; // Extra byte needed when creating a BigInteger, for sign bit. foreach (var idx in PrecomputedIndices) {

我有一个核心C#函数,我正试图加速它。涉及安全或不安全代码的建议同样受欢迎。方法如下:

/// <summary>
/// Copies single bits out of <paramref name="vector"/> into a newly allocated byte array,
/// using the source and target positions stored in each entry of PrecomputedIndices.
/// </summary>
/// <param name="vector">Source values whose bits are read.</param>
/// <returns>A new array of BytesNeeded + 1 bytes with the selected bits OR-ed in.</returns>
public byte[] Interleave(uint[] vector)
{
    // One byte more than BytesNeeded: needed for the sign bit when a BigInteger is created from the result.
    var result = new byte[BytesNeeded + 1];
    foreach (var entry in PrecomputedIndices)
    {
        // Isolate the requested source bit as 0 or 1.
        var sourceBit = (vector[entry.iFromUintVector] >> entry.iFromUintBit) & 1U;
        // Move it to its position inside the target byte and merge it in.
        result[entry.iToByteVector] |= (byte)(sourceBit << entry.iToByteBit);
    }
    return result;
}
`Interleave` 方法的目的是将位从 uint 数组复制到字节数组。我已经预先计算了源和目标的数组索引以及源和目标的位编号,并将它们存储在 `Indices` 对象中。源中相邻的任意两位在目标中都不相邻,因此排除了某些优化。

为了给你们一个尺度的概念,我正在研究的问题大约有4200个维度,所以“向量”有4200个元素。向量中的值范围从0到12,因此我只需要使用4位来将其值存储在字节数组中,因此我需要4200 x 4=16800位数据,或者每个向量需要2100字节的输出。此方法将被调用数百万次。在我需要优化的较大过程中,它消耗了大约三分之一的时间


更新1:将 `Indices` 改为结构体(struct),并缩小部分字段的数据类型,使每个对象只占8个字节(一个int、一个short和两个byte),该方法占执行时间的百分比从35%降到了30%。

以下是我修改后的实现的关键部分,其中的想法来自评论者:

  • 将对象转换为struct,将数据类型收缩为较小的整数,并重新排列,使对象适合64位值,这对于64位计算机更为合适:

    /// <summary>
    /// One precomputed bit move: which bit of which source uint is written to which bit of which target byte.
    /// Fields are declared widest first (int, short, byte, byte).
    /// </summary>
    struct Indices
    {
        /// <summary>Position in the source vector of the uint to read from.</summary>
        public readonly int iFromUintVector;

        /// <summary>Position in the target vector of the byte to write to.</summary>
        public readonly short iToByteVector;

        /// <summary>Bit position inside the source uint to read.</summary>
        public readonly byte iFromUintBit;

        /// <summary>Bit position inside the target byte to write.</summary>
        public readonly byte iToByteBit;

        /// <summary>Stores the four positions exactly as given.</summary>
        /// <param name="fromUintVector">Index of the source uint in the source vector.</param>
        /// <param name="fromUintBit">Bit index inside the source uint.</param>
        /// <param name="toByteVector">Index of the target byte in the target vector.</param>
        /// <param name="toByteBit">Bit index inside the target byte.</param>
        public Indices(int fromUintVector, byte fromUintBit, short toByteVector, byte toByteBit)
        {
            this.iFromUintVector = fromUintVector;
            this.iToByteVector = toByteVector;
            this.iFromUintBit = fromUintBit;
            this.iToByteBit = toByteBit;
        }
    }
    
    struct索引
    {
    /// 
    ///索引到要读取的源uint的源向量中。
    /// 
    iFromUintVector中的公共只读;
    /// 
    ///索引到要写入的目标字节的目标向量中。
    /// 
    公共只读短字节向量;
    /// 
    ///索引到要读取的源位的源uint。
    /// 
    公共只读字节iFromUintBit;
    /// 
    ///索引到要写入的目标位的目标字节。
    /// 
    公共只读字节iToByteBit;
    公共索引(int-fromUintVector、byte-fromUintBit、short-toByteVector、byte-toByteBit)
    {
    iFromUintVector=fromUintVector;
    iFromUintBit=fromUintBit;
    iToByteVector=toByteVector;
    iToByteBit=toByteBit;
    }
    }
    
  • 对预计算的索引进行排序,使我按升序写入每个目标字节和位,从而改善内存缓存的访问效率:

    // Orders entries by target byte index first, then by target bit index within that byte.
    Comparison<Indices> sortByTargetByteAndBit = (left, right) =>
    {
        if (left.iToByteVector != right.iToByteVector)
        {
            return left.iToByteVector < right.iToByteVector ? -1 : 1;
        }
        if (left.iToByteBit != right.iToByteBit)
        {
            return left.iToByteBit < right.iToByteBit ? -1 : 1;
        }
        // Same target byte and same target bit.
        return 0;
    };
    // Sort the precomputed table in place with that ordering.
    Array.Sort(PrecomputedIndices, sortByTargetByteAndBit);
    
    比较sortByTargetByteAndBit=(a,b)=> { if(a.iToByteVectorb.iToByteVector)返回1; 如果(a.iToByteBitb.iToByteBit)返回1; 返回0; }; Array.Sort(预计算dices、sortbytargetbytes和bit);
  • 展开循环,以便一次组装一个完整的目标字节,减少访问目标数组的次数:

    /// <summary>
    /// Builds the interleaved byte array from <paramref name="vector"/>. Each of the first
    /// BytesNeeded target bytes is assembled in one step from eight consecutive entries of
    /// PrecomputedIndices; any entries left over are applied one bit at a time.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the unrolled loop ignores each entry's target byte/bit fields and relies on
    /// position alone, so it presumably requires PrecomputedIndices to be sorted by target byte and
    /// then target bit, with exactly eight entries per byte — confirm against the precomputation.
    /// </remarks>
    /// <param name="vector">Source values whose bits are read.</param>
    /// <returns>A new array of BytesNeeded + 1 bytes.</returns>
    public byte[] Interleave(uint[] vector)
    {
        // An extra byte is needed to hold the extra bits and a sign bit for the BigInteger.
        var byteVector = new byte[BytesNeeded + 1];
        var indices = PrecomputedIndices;
        var iIndex = 0;
        for (var iByte = 0; iByte < BytesNeeded; iByte++)
        {
            // Unrolled: read the eight entries that make up this target byte.
            var idx0 = indices[iIndex];
            var idx1 = indices[iIndex + 1];
            var idx2 = indices[iIndex + 2];
            var idx3 = indices[iIndex + 3];
            var idx4 = indices[iIndex + 4];
            var idx5 = indices[iIndex + 5];
            var idx6 = indices[iIndex + 6];
            var idx7 = indices[iIndex + 7];
            // Entry k supplies bit k of the target byte.
            uint bits = ((vector[idx0.iFromUintVector] >> idx0.iFromUintBit) & 1U)
                      | (((vector[idx1.iFromUintVector] >> idx1.iFromUintBit) & 1U) << 1)
                      | (((vector[idx2.iFromUintVector] >> idx2.iFromUintBit) & 1U) << 2)
                      | (((vector[idx3.iFromUintVector] >> idx3.iFromUintBit) & 1U) << 3)
                      | (((vector[idx4.iFromUintVector] >> idx4.iFromUintBit) & 1U) << 4)
                      | (((vector[idx5.iFromUintVector] >> idx5.iFromUintBit) & 1U) << 5)
                      | (((vector[idx6.iFromUintVector] >> idx6.iFromUintBit) & 1U) << 6)
                      | (((vector[idx7.iFromUintVector] >> idx7.iFromUintBit) & 1U) << 7);
            // A single write per target byte.
            byteVector[iByte] = (byte)bits;
            iIndex += 8;
        }
        // Remaining entries (those not consumed above) are placed using their own target byte/bit fields.
        for (; iIndex < indices.Length; iIndex++)
        {
            var idx = indices[iIndex];
            var bit = (byte)(((vector[idx.iFromUintVector] >> idx.iFromUintBit) & 1U) << idx.iToByteBit);
            byteVector[idx.iToByteVector] |= bit;
        }
        return byteVector;
    }
    
    公共字节[]交织(uint[]向量)
    {
    var byteVector=new byte[BytesRequired+1];//需要一个额外的字节来保存额外的位,需要一个符号位来保存BigInteger。
    var extraBits=位-字节(需要>idx0.iFromUintBit)&1U))
    
    |((向量[idx1.iFromUintVector]>>idx1.iFromUintBit)和1U)>idx2.iFromUintBit)&1U)>idx3.iFromUintBit)&1U)>idx4.iFromUintBit)&1U)>idx5.iFromUintBit)&1U)>idx6.iFromUintBit)&1U)>idx7.iFromUintBit)&1U)>idx.iFromUintBit)&1U)这些都是我修改后的实现的关键部分,从评论者那里汲取了一些想法:

  • 将对象转换为struct,将数据类型收缩为较小的整数,并重新排列,使对象适合64位值,这对于64位计算机更为合适:

    /// <summary>
    /// One precomputed bit move: which bit of which source uint is written to which bit of which target byte.
    /// Fields are declared widest first (int, short, byte, byte).
    /// </summary>
    struct Indices
    {
        /// <summary>Position in the source vector of the uint to read from.</summary>
        public readonly int iFromUintVector;

        /// <summary>Position in the target vector of the byte to write to.</summary>
        public readonly short iToByteVector;

        /// <summary>Bit position inside the source uint to read.</summary>
        public readonly byte iFromUintBit;

        /// <summary>Bit position inside the target byte to write.</summary>
        public readonly byte iToByteBit;

        /// <summary>Stores the four positions exactly as given.</summary>
        /// <param name="fromUintVector">Index of the source uint in the source vector.</param>
        /// <param name="fromUintBit">Bit index inside the source uint.</param>
        /// <param name="toByteVector">Index of the target byte in the target vector.</param>
        /// <param name="toByteBit">Bit index inside the target byte.</param>
        public Indices(int fromUintVector, byte fromUintBit, short toByteVector, byte toByteBit)
        {
            this.iFromUintVector = fromUintVector;
            this.iToByteVector = toByteVector;
            this.iFromUintBit = fromUintBit;
            this.iToByteBit = toByteBit;
        }
    }
    
    struct索引
    {
    /// 
    ///索引到要读取的源uint的源向量中。
    /// 
    iFromUintVector中的公共只读;
    /// 
    ///索引到要写入的目标字节的目标向量中。
    /// 
    公共只读短字节向量;
    /// 
    ///索引到要读取的源位的源uint。
    /// 
    公共只读字节iFromUintBit;
    /// 
    ///索引到要写入的目标位的目标字节。
    /// 
    公共只读字节iToByteBit;
    公共索引(int-fromUintVector、byte-fromUintBit、short-toByteVector、byte-toByteBit)
    {
    iFromUintVector=fromUintVector;
    iFromUintBit=fromUintBit;
    iToByteVector=toByteVector;
    iToByteBit=toByteBit;
    }
    }
    
  • 对预计算的索引进行排序,使我按升序写入每个目标字节和位,从而改善内存缓存的访问效率:

    // Orders entries by target byte index first, then by target bit index within that byte.
    Comparison<Indices> sortByTargetByteAndBit = (left, right) =>
    {
        if (left.iToByteVector != right.iToByteVector)
        {
            return left.iToByteVector < right.iToByteVector ? -1 : 1;
        }
        if (left.iToByteBit != right.iToByteBit)
        {
            return left.iToByteBit < right.iToByteBit ? -1 : 1;
        }
        // Same target byte and same target bit.
        return 0;
    };
    // Sort the precomputed table in place with that ordering.
    Array.Sort(PrecomputedIndices, sortByTargetByteAndBit);
    
    比较sortByTargetByteAndBit=(a,b)=> { if(a.iToByteVectorb.iToByteVector)返回1; 如果(a.iToByteBitb.iToByteBit)返回1; 返回0; }; Array.Sort(预计算dices、sortbytargetbytes和bit);
  • 展开循环,以便一次组装一个完整的目标字节,减少访问目标数组的次数:

    /// <summary>
    /// Builds the interleaved byte array from <paramref name="vector"/>. Each of the first
    /// BytesNeeded target bytes is assembled in one step from eight consecutive entries of
    /// PrecomputedIndices; any entries left over are applied one bit at a time.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the unrolled loop ignores each entry's target byte/bit fields and relies on
    /// position alone, so it presumably requires PrecomputedIndices to be sorted by target byte and
    /// then target bit, with exactly eight entries per byte — confirm against the precomputation.
    /// </remarks>
    /// <param name="vector">Source values whose bits are read.</param>
    /// <returns>A new array of BytesNeeded + 1 bytes.</returns>
    public byte[] Interleave(uint[] vector)
    {
        // An extra byte is needed to hold the extra bits and a sign bit for the BigInteger.
        var byteVector = new byte[BytesNeeded + 1];
        var indices = PrecomputedIndices;
        var iIndex = 0;
        for (var iByte = 0; iByte < BytesNeeded; iByte++)
        {
            // Unrolled: read the eight entries that make up this target byte.
            var idx0 = indices[iIndex];
            var idx1 = indices[iIndex + 1];
            var idx2 = indices[iIndex + 2];
            var idx3 = indices[iIndex + 3];
            var idx4 = indices[iIndex + 4];
            var idx5 = indices[iIndex + 5];
            var idx6 = indices[iIndex + 6];
            var idx7 = indices[iIndex + 7];
            // Entry k supplies bit k of the target byte.
            uint bits = ((vector[idx0.iFromUintVector] >> idx0.iFromUintBit) & 1U)
                      | (((vector[idx1.iFromUintVector] >> idx1.iFromUintBit) & 1U) << 1)
                      | (((vector[idx2.iFromUintVector] >> idx2.iFromUintBit) & 1U) << 2)
                      | (((vector[idx3.iFromUintVector] >> idx3.iFromUintBit) & 1U) << 3)
                      | (((vector[idx4.iFromUintVector] >> idx4.iFromUintBit) & 1U) << 4)
                      | (((vector[idx5.iFromUintVector] >> idx5.iFromUintBit) & 1U) << 5)
                      | (((vector[idx6.iFromUintVector] >> idx6.iFromUintBit) & 1U) << 6)
                      | (((vector[idx7.iFromUintVector] >> idx7.iFromUintBit) & 1U) << 7);
            // A single write per target byte.
            byteVector[iByte] = (byte)bits;
            iIndex += 8;
        }
        // Remaining entries (those not consumed above) are placed using their own target byte/bit fields.
        for (; iIndex < indices.Length; iIndex++)
        {
            var idx = indices[iIndex];
            var bit = (byte)(((vector[idx.iFromUintVector] >> idx.iFromUintBit) & 1U) << idx.iToByteBit);
            byteVector[idx.iToByteVector] |= bit;
        }
        return byteVector;
    }
    
    公共字节[]交织(uint[]向量)
    {
    var byteVector=new byte[BytesRequired+1];//需要一个额外的字节来保存额外的位,需要一个符号位来保存BigInteger。
    var extraBits=位-字节(需要>idx0.iFromUintBit)&1U))
    
    |((向量[idx1.iFromUintVector]>>idx1.iFromUintBit)和1U)>idx2.iFromUintBit)&1U)>idx3.iFromUintBit)&1U)>idx4.iFromUintBit)&1U)>idx5.iFromUintBit)&1U)>idx6.iFromUintBit)&1U)>idx7.iFromUintBit)&1U)>idx.iFromUintBit)&1U)因为
    `Indices` 是一种小的、不可变的类型,您是否尝试过将其设置为 `struct`?
    为了避免结构复制,而且您要调用它数百万次,请使用 `for` 循环,而不是 `foreach`。这只能节省很少的时间,但总比什么都不做要好。
    @thumbmunkeys - 说得很好。我对缓存局部性相关的事情不太在行。在80年代,我做了很多C编程,但C#已经把我惯坏了