C# 优化位的重新排列
我有一个核心C#函数,我正试图加速它。涉及安全或不安全代码的建议同样受欢迎。方法如下:C# 优化位的重新排列,c#,optimization,bit-manipulation,C#,Optimization,Bit Manipulation,我有一个核心C#函数,我正试图加速它。涉及安全或不安全代码的建议同样受欢迎。方法如下: public byte[] Interleave(uint[] vector) { var byteVector = new byte[BytesNeeded + 1]; // Extra byte needed when creating a BigInteger, for sign bit. foreach (var idx in PrecomputedIndices) {
/// <summary>
/// Copies individual bits out of <paramref name="vector"/> into a freshly allocated byte array,
/// one bit per entry of <c>PrecomputedIndices</c>.
/// </summary>
/// <param name="vector">Source array that the bits are read from.</param>
/// <returns>A byte array of length <c>BytesNeeded + 1</c> holding the rearranged bits.</returns>
public byte[] Interleave(uint[] vector)
{
    // One byte beyond BytesNeeded: needed for the sign bit when a BigInteger is created from the result.
    var result = new byte[BytesNeeded + 1];

    foreach (var entry in PrecomputedIndices)
    {
        // Isolate the requested source bit as 0 or 1 ...
        uint sourceBit = (vector[entry.iFromUintVector] >> entry.iFromUintBit) & 1U;

        // ... then move it to its target bit position and merge it into the target byte.
        result[entry.iToByteVector] |= (byte)(sourceBit << entry.iToByteBit);
    }

    return result;
}
`Interleave` 方法的目的是将位从 uint 数组复制到字节数组。我已经预先计算了源数组和目标数组的索引以及源位和目标位的位置,并将它们存储在 `Indices` 对象中。源中任何两个相邻的位在目标中都不相邻,因此某些优化无法使用。
为了给你们一个尺度的概念,我正在研究的问题大约有4200个维度,所以“向量”有4200个元素。向量中的值范围从0到12,因此我只需要使用4位来将其值存储在字节数组中,因此我需要4200 x 4=16800位数据,或者每个向量需要2100字节的输出。此方法将被调用数百万次。在我需要优化的较大过程中,它消耗了大约三分之一的时间
更新1:将 `Indices` 改为结构体(struct),并缩小部分数据类型,使该对象仅占 8 个字节(一个 int、一个 short 和两个 byte),这使该方法所占执行时间的比例从 35% 降至 30%。以下是我修改后的实现的关键部分,其中的思路来自评论者:
/// <summary>
/// Describes one bit move: the position a bit is read from in a source uint array
/// and the position it is written to in a target byte array.
/// </summary>
struct Indices
{
    /// <summary>Position, within the source vector, of the uint to read.</summary>
    public readonly int iFromUintVector;

    /// <summary>Position, within the target vector, of the byte to write.</summary>
    public readonly short iToByteVector;

    /// <summary>Position, within the source uint, of the bit to read.</summary>
    public readonly byte iFromUintBit;

    /// <summary>Position, within the target byte, of the bit to write.</summary>
    public readonly byte iToByteBit;

    /// <summary>
    /// Creates an entry from the given source and target positions.
    /// </summary>
    /// <param name="fromUintVector">Value for <see cref="iFromUintVector"/>.</param>
    /// <param name="fromUintBit">Value for <see cref="iFromUintBit"/>.</param>
    /// <param name="toByteVector">Value for <see cref="iToByteVector"/>.</param>
    /// <param name="toByteBit">Value for <see cref="iToByteBit"/>.</param>
    public Indices(int fromUintVector, byte fromUintBit, short toByteVector, byte toByteBit)
    {
        // Source side.
        this.iFromUintVector = fromUintVector;
        this.iFromUintBit = fromUintBit;

        // Target side.
        this.iToByteVector = toByteVector;
        this.iToByteBit = toByteBit;
    }
}
struct索引
{
///
///索引到要读取的源uint的源向量中。
///
iFromUintVector中的公共只读;
///
///索引到要写入的目标字节的目标向量中。
///
公共只读短字节向量;
///
///索引到要读取的源位的源uint。
///
公共只读字节iFromUintBit;
///
///索引到要写入的目标位的目标字节。
///
公共只读字节iToByteBit;
公共索引(int-fromUintVector、byte-fromUintBit、short-toByteVector、byte-toByteBit)
{
iFromUintVector=fromUintVector;
iFromUintBit=fromUintBit;
iToByteVector=toByteVector;
iToByteBit=toByteBit;
}
}
// Order entries by target byte index first; ties are broken by the target bit index within that byte.
Comparison<Indices> sortByTargetByteAndBit = (left, right) =>
{
    if (left.iToByteVector != right.iToByteVector)
    {
        return left.iToByteVector < right.iToByteVector ? -1 : 1;
    }

    if (left.iToByteBit != right.iToByteBit)
    {
        return left.iToByteBit < right.iToByteBit ? -1 : 1;
    }

    // Same target byte and same target bit.
    return 0;
};

// Sorts the precomputed entries in place using the ordering above.
Array.Sort(PrecomputedIndices, sortByTargetByteAndBit);
比较sortByTargetByteAndBit=(a,b)=>
{
/// <summary>
/// Copies individual bits out of <paramref name="vector"/> into a freshly allocated byte array,
/// building each of the first <c>BytesNeeded</c> output bytes from eight consecutive entries of
/// <c>PrecomputedIndices</c>.
/// </summary>
/// <remarks>
/// The unrolled loop ignores each entry's target byte/bit fields: entry <c>8 * b + k</c> is written
/// to bit <c>k</c> of byte <c>b</c>. It therefore relies on <c>PrecomputedIndices</c> already being
/// ordered by target byte and then target bit, with eight entries per byte; this is not checked here.
/// Entries left over after the unrolled loop are placed using their own target byte/bit fields.
/// </remarks>
/// <param name="vector">Source array that the bits are read from.</param>
/// <returns>A byte array of length <c>BytesNeeded + 1</c> holding the rearranged bits.</returns>
public byte[] Interleave(uint[] vector)
{
    // An extra byte is needed to hold the extra bits and a sign bit for the BigInteger.
    var byteVector = new byte[BytesNeeded + 1];

    // The original also declared "extraBits = Bits - BytesNeeded << 3", which was never read and,
    // because '-' binds tighter than '<<', computed (Bits - BytesNeeded) << 3. It has been removed.
    var indices = PrecomputedIndices;
    int iIndex = 0;

    for (var iByte = 0; iByte < BytesNeeded; iByte++)
    {
        // Unrolled: compute the bits for a whole byte at a time.
        var idx0 = indices[iIndex];
        var idx1 = indices[iIndex + 1];
        var idx2 = indices[iIndex + 2];
        var idx3 = indices[iIndex + 3];
        var idx4 = indices[iIndex + 4];
        var idx5 = indices[iIndex + 5];
        var idx6 = indices[iIndex + 6];
        var idx7 = indices[iIndex + 7];

        uint bits = ((vector[idx0.iFromUintVector] >> idx0.iFromUintBit) & 1U)
                  | (((vector[idx1.iFromUintVector] >> idx1.iFromUintBit) & 1U) << 1)
                  | (((vector[idx2.iFromUintVector] >> idx2.iFromUintBit) & 1U) << 2)
                  | (((vector[idx3.iFromUintVector] >> idx3.iFromUintBit) & 1U) << 3)
                  | (((vector[idx4.iFromUintVector] >> idx4.iFromUintBit) & 1U) << 4)
                  | (((vector[idx5.iFromUintVector] >> idx5.iFromUintBit) & 1U) << 5)
                  | (((vector[idx6.iFromUintVector] >> idx6.iFromUintBit) & 1U) << 6)
                  | (((vector[idx7.iFromUintVector] >> idx7.iFromUintBit) & 1U) << 7);

        byteVector[iByte] = (byte)bits;
        iIndex += 8;
    }

    // Remaining entries (fewer than a full byte's worth, or any beyond BytesNeeded * 8)
    // are merged one bit at a time using their recorded target positions.
    for (; iIndex < indices.Length; iIndex++)
    {
        var idx = indices[iIndex];
        var bit = (byte)(((vector[idx.iFromUintVector] >> idx.iFromUintBit) & 1U) << idx.iToByteBit);
        byteVector[idx.iToByteVector] |= bit;
    }

    return byteVector;
}
公共字节[]交织(uint[]向量)
{
var byteVector=new byte[BytesRequired+1];//需要一个额外的字节来保存额外的位,需要一个符号位来保存BigInteger。
var extraBits=位-字节(需要>idx0.iFromUintBit)&1U))
|((向量[idx1.iFromUintVector]>>idx1.iFromUintBit)和1U)>idx2.iFromUintBit)&1U)>idx3.iFromUintBit)&1U)>idx4.iFromUintBit)&1U)>idx5.iFromUintBit)&1U)>idx6.iFromUintBit)&1U)>idx7.iFromUintBit)&1U)>idx.iFromUintBit)&1U)这些都是我修改后的实现的关键部分,从评论者那里汲取了一些想法:
将对象转换为struct,将数据类型收缩为较小的整数,并重新排列,使对象适合64位值,这对于64位计算机更为合适:
/// <summary>
/// Describes one bit move: the position a bit is read from in a source uint array
/// and the position it is written to in a target byte array.
/// </summary>
struct Indices
{
    /// <summary>Position, within the source vector, of the uint to read.</summary>
    public readonly int iFromUintVector;

    /// <summary>Position, within the target vector, of the byte to write.</summary>
    public readonly short iToByteVector;

    /// <summary>Position, within the source uint, of the bit to read.</summary>
    public readonly byte iFromUintBit;

    /// <summary>Position, within the target byte, of the bit to write.</summary>
    public readonly byte iToByteBit;

    /// <summary>
    /// Creates an entry from the given source and target positions.
    /// </summary>
    /// <param name="fromUintVector">Value for <see cref="iFromUintVector"/>.</param>
    /// <param name="fromUintBit">Value for <see cref="iFromUintBit"/>.</param>
    /// <param name="toByteVector">Value for <see cref="iToByteVector"/>.</param>
    /// <param name="toByteBit">Value for <see cref="iToByteBit"/>.</param>
    public Indices(int fromUintVector, byte fromUintBit, short toByteVector, byte toByteBit)
    {
        // Source side.
        this.iFromUintVector = fromUintVector;
        this.iFromUintBit = fromUintBit;

        // Target side.
        this.iToByteVector = toByteVector;
        this.iToByteBit = toByteBit;
    }
}
struct索引
{
///
///索引到要读取的源uint的源向量中。
///
iFromUintVector中的公共只读;
///
///索引到要写入的目标字节的目标向量中。
///
公共只读短字节向量;
///
///索引到要读取的源位的源uint。
///
公共只读字节iFromUintBit;
///
///索引到要写入的目标位的目标字节。
///
公共只读字节iToByteBit;
公共索引(int-fromUintVector、byte-fromUintBit、short-toByteVector、byte-toByteBit)
{
iFromUintVector=fromUintVector;
iFromUintBit=fromUintBit;
iToByteVector=toByteVector;
iToByteBit=toByteBit;
}
}
对预计算的索引数组进行排序,使我按升序依次写入每个目标字节和位,从而改善内存缓存访问的局部性:
// Order entries by target byte index first; ties are broken by the target bit index within that byte.
Comparison<Indices> sortByTargetByteAndBit = (left, right) =>
{
    if (left.iToByteVector != right.iToByteVector)
    {
        return left.iToByteVector < right.iToByteVector ? -1 : 1;
    }

    if (left.iToByteBit != right.iToByteBit)
    {
        return left.iToByteBit < right.iToByteBit ? -1 : 1;
    }

    // Same target byte and same target bit.
    return 0;
};

// Sorts the precomputed entries in place using the ordering above.
Array.Sort(PrecomputedIndices, sortByTargetByteAndBit);
比较sortByTargetByteAndBit=(a,b)=>
{
if(a.iToByteVectorb.iToByteVector)返回1;
如果(a.iToByteBitb.iToByteBit)返回1;
返回0;
};
Array.Sort(预计算dices、sortbytargetbytes和bit);
/// <summary>
/// Copies individual bits out of <paramref name="vector"/> into a freshly allocated byte array,
/// building each of the first <c>BytesNeeded</c> output bytes from eight consecutive entries of
/// <c>PrecomputedIndices</c>.
/// </summary>
/// <remarks>
/// The unrolled loop ignores each entry's target byte/bit fields: entry <c>8 * b + k</c> is written
/// to bit <c>k</c> of byte <c>b</c>. It therefore relies on <c>PrecomputedIndices</c> already being
/// ordered by target byte and then target bit, with eight entries per byte; this is not checked here.
/// Entries left over after the unrolled loop are placed using their own target byte/bit fields.
/// </remarks>
/// <param name="vector">Source array that the bits are read from.</param>
/// <returns>A byte array of length <c>BytesNeeded + 1</c> holding the rearranged bits.</returns>
public byte[] Interleave(uint[] vector)
{
    // An extra byte is needed to hold the extra bits and a sign bit for the BigInteger.
    var byteVector = new byte[BytesNeeded + 1];

    // The original also declared "extraBits = Bits - BytesNeeded << 3", which was never read and,
    // because '-' binds tighter than '<<', computed (Bits - BytesNeeded) << 3. It has been removed.
    var indices = PrecomputedIndices;
    int iIndex = 0;

    for (var iByte = 0; iByte < BytesNeeded; iByte++)
    {
        // Unrolled: compute the bits for a whole byte at a time.
        var idx0 = indices[iIndex];
        var idx1 = indices[iIndex + 1];
        var idx2 = indices[iIndex + 2];
        var idx3 = indices[iIndex + 3];
        var idx4 = indices[iIndex + 4];
        var idx5 = indices[iIndex + 5];
        var idx6 = indices[iIndex + 6];
        var idx7 = indices[iIndex + 7];

        uint bits = ((vector[idx0.iFromUintVector] >> idx0.iFromUintBit) & 1U)
                  | (((vector[idx1.iFromUintVector] >> idx1.iFromUintBit) & 1U) << 1)
                  | (((vector[idx2.iFromUintVector] >> idx2.iFromUintBit) & 1U) << 2)
                  | (((vector[idx3.iFromUintVector] >> idx3.iFromUintBit) & 1U) << 3)
                  | (((vector[idx4.iFromUintVector] >> idx4.iFromUintBit) & 1U) << 4)
                  | (((vector[idx5.iFromUintVector] >> idx5.iFromUintBit) & 1U) << 5)
                  | (((vector[idx6.iFromUintVector] >> idx6.iFromUintBit) & 1U) << 6)
                  | (((vector[idx7.iFromUintVector] >> idx7.iFromUintBit) & 1U) << 7);

        byteVector[iByte] = (byte)bits;
        iIndex += 8;
    }

    // Remaining entries (fewer than a full byte's worth, or any beyond BytesNeeded * 8)
    // are merged one bit at a time using their recorded target positions.
    for (; iIndex < indices.Length; iIndex++)
    {
        var idx = indices[iIndex];
        var bit = (byte)(((vector[idx.iFromUintVector] >> idx.iFromUintBit) & 1U) << idx.iToByteBit);
        byteVector[idx.iToByteVector] |= bit;
    }

    return byteVector;
}
公共字节[]交织(uint[]向量)
{
var byteVector=new byte[BytesRequired+1];//需要一个额外的字节来保存额外的位,需要一个符号位来保存BigInteger。
var extraBits=位-字节(需要>idx0.iFromUintBit)&1U))
|((向量[idx1.iFromUintVector]>>idx1.iFromUintBit)和1U)>idx2.iFromUintBit)&1U)>idx3.iFromUintBit)&1U)>idx4.iFromUintBit)&1U)>idx5.iFromUintBit)&1U)>idx6.iFromUintBit)&1U)>idx7.iFromUintBit)&1U)>idx.iFromUintBit)&1U)因为索引是一种小的、不可变的类型,您是否尝试过将其设置为结构为了避免结构复制,因为你调用它数百万次,请使用for
循环,而不是 foreach。这只能节省很少的时间,但总比什么都不做要好。@thumbmunkeys——说得好。我对缓存局部性相关的事情不太在行。在80年代,我写过很多 C 程序,但 C# 把我惯坏了