使用 Swift 将数组从 [UInt32] 转换为 [UInt8]，再转换为 [[UInt8]]_Arrays_Swift_Cryptography_Uint32_Uint8array

使用 Swift 将数组从 [UInt32] 转换为 [UInt8]，再转换为 [[UInt8]]

arrays swift cryptography

Arrays 转换[UInt32]->；[UInt8]->；[[UInt8]]使用Swift,arrays,swift,cryptography,uint32,uint8array,Arrays,Swift,Cryptography,Uint32,Uint8array,我正在尝试加快我当前实现的一个函数的速度，该函数将[UInt32]转换为[UInt8]，而[UInt8]又被拆分为[[UInt8]]，每个索引处有6个数组我的实施： extension Array { func splitBy(subSize: Int) -> [[Element]] { return 0.stride(to: self.count, by: subSize).map { startIndex in let endIndex = startInde

我正在尝试加快我当前实现的一个函数的速度：该函数先将 [UInt32] 转换为 [UInt8]，再把得到的 [UInt8] 拆分为 [[UInt8]]，其中每个子数组包含 6 个值。

我的实现：

extension Array {
    /// Breaks the array into consecutive chunks of `subSize` elements.
    ///
    /// The chunks preserve the original element order. The last chunk is
    /// shorter than `subSize` when `count` is not a multiple of `subSize`.
    ///
    /// - Parameter subSize: the number of elements in each chunk.
    /// - Returns: the chunks, in order; empty when the array is empty.
    func splitBy(subSize: Int) -> [[Element]] {
        var chunks = [[Element]]()
        for lower in 0.stride(to: count, by: subSize) {
            // Clamp the upper bound so the final slice never runs past the end.
            let upper = lower.advancedBy(subSize, limit: count)
            chunks.append(Array(self[lower ..< upper]))
        }
        return chunks
    }
}



/// Serializes 32-bit words into bytes (most significant byte first) and
/// groups the resulting byte stream into rows of six.
///
/// Every word in `fullW` is converted, so the function no longer traps when
/// fewer than 17 words are supplied and no longer ignores words past the
/// 17th. For the usual 17-word input the output is unchanged: 68 bytes,
/// i.e. eleven rows of 6 followed by one row of 2.
///
/// - Parameter fullW: the words to convert.
/// - Returns: the bytes of `fullW` in rows of 6; the last row may be shorter.
func convertWordToBytes(fullW : [UInt32]) -> [[UInt8]] {
    var combined8 = [UInt8]()
    // The final size is known up front (4 bytes per word), so reserve it once
    // instead of letting `append` regrow the buffer repeatedly.
    combined8.reserveCapacity(fullW.count * 4)

    // Emit each word's bytes from the highest 8 bits down to the lowest.
    for word in fullW {
        for shift in 24.stride(through: 0, by: -8) {
            combined8.append(UInt8(truncatingBitPattern: word >> UInt32(shift)))
        }
    }

    // Split [UInt8] into [[UInt8]] with 6 values at each index.
    return combined8.splitBy(6)
}

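extension Array {
func splitBy(subSize: Int) -> [[Element]] {
    return 0.stride(to: self.count, by: subSize).map { startIndex in
        let endIndex = startIndex.advancedBy(subSize, limit: self.count)
        return Array(self[startIndex ..< endIndex])
    }
  }
}
func convertWordToBytes(fullW : [UInt32]) -> [[UInt8]] {
    var combined8 = [UInt8]()
    //将 17 个 [UInt32] 转换为 68 个 [UInt8]
    for i in 0...16{
        _ = 24.stride(through: 0, by: -8).map {
            combined8.append(UInt8(truncatingBitPattern: fullW[i] >> UInt32($0)))
        }
    }
    //将 [UInt8] 拆分为 [[UInt8]]，每个索引处有 6 个值。
    let combined48 = combined8.splitBy(6)
    return combined48
}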

这个函数在我的程序中会被调用数百万次，它的执行速度是一个很大的瓶颈。

有人有什么想法吗？谢谢

如果你对代码进行性能分析（Cmd+I），你会发现大部分时间都花在各种“复制到缓冲区”的函数上。当你向数组追加新元素，而它最初分配的空间已经用完时，数组就必须被移动到堆上一块更大的内存中，这时就会发生这种复制。结论是：堆分配很慢，但对数组来说又不可避免，所以应当尽量减少分配的次数。

试试这个：

/// Serializes 32-bit words into bytes (most significant byte first), grouped
/// into rows of six, by writing into a result that is shaped up front.
///
/// The loop now walks every word in `fullW`, matching the `fullW.count * 4`
/// bytes the result is sized for. Previously it was fixed at 17 words, which
/// trapped on shorter input and left trailing zero bytes on longer input.
/// Output for a 17-word input is unchanged.
///
/// - Parameter fullW: the words to convert.
/// - Returns: the bytes of `fullW` in rows of 6; the last row may be shorter.
func convertWordToBytes2(fullW: [UInt32]) -> [[UInt8]] {
    let subSize = 6

    // Build the zero-filled, correctly shaped result before the loop so the
    // loop only assigns into existing elements and never appends.
    var combined48 = [UInt8](count: fullW.count * 4, repeatedValue: 0).splitBy(subSize)

    // Position of the next byte to write inside `combined48`.
    var row = 0
    var col = 0

    for word in fullW {
        // Emit the word's bytes from the highest 8 bits down to the lowest.
        for shift in 24.stride(through: 0, by: -8) {
            combined48[row][col] = UInt8(truncatingBitPattern: word >> UInt32(shift))

            // Advance to the next slot, wrapping to the next row when full.
            col += 1
            if col >= subSize {
                row += 1
                col = 0
            }
        }
    }

    return combined48
}

基准代码：

// Benchmark input: one million independent cases of 17 random 32-bit words.
let benchmarkCaseCount = 1_000_000
let wordsPerCase = 17
let testCases: [[UInt32]] = (0..<benchmarkCaseCount).map { _ -> [UInt32] in
    var words = [UInt32]()
    for _ in 0..<wordsPerCase {
        words.append(arc4random())
    }
    return words
}

// Run both implementations on every case; the results are discarded because
// only the running time is of interest.
for words in testCases {
    convertWordToBytes(words)
    convertWordToBytes2(words)
}

通过消除多次分配，我们已经将运行时间减少了 60%。而且每个测试用例都是相互独立的，这非常适合利用当今的多核 CPU 进行并行处理。修改后的循环如下：

// Run the conversion for each test case index on the high-priority global
// queue; dispatch_apply returns once all iterations have completed.
let highPriorityQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_HIGH, 0)
dispatch_apply(testCases.count, highPriorityQueue) { caseIndex in
    convertWordToBytes2(testCases[caseIndex])
}

……在我的四核 i7 上以 8 个线程执行时，大约能再节省 1 秒的时间：

Weight    Self Weight       Symbol Name
2.28 s    6.4%  0 s         _dispatch_worker_thread3  0x58467
2.24 s    6.3%  0 s         _dispatch_worker_thread3  0x58463
2.22 s    6.2%  0 s         _dispatch_worker_thread3  0x58464
2.21 s    6.2%  0 s         _dispatch_worker_thread3  0x58466
2.21 s    6.2%  0 s         _dispatch_worker_thread3  0x58465
2.21 s    6.2%  0 s         _dispatch_worker_thread3  0x58461
2.18 s    6.1%  0 s         _dispatch_worker_thread3  0x58462

节省的时间没有我希望的那么多。显然，在访问堆内存时存在一些争用。对于任何更快的解决方案，您应该探索基于C的解决方案。

评论：您也许更适合把这个问题发布到其他网站上。您的代码使用的是 Swift 2——您想让它保持为 Swift 2，还是同时更新到 Swift 3？——这台电脑太旧了，所以目前需要保持在 Swift 2。——借助 Accelerate 框架中的向量数学 / simd 库，您也许可以接入已弃用但仍然可用的 OpenCL 实现。这个内核会非常小，在流水线中可以轻松处理 UInt8，比 CPU 线程快得多。——非常感谢您这么做！这使我的代码更快了。特别感谢您的基准测试和解释。

Weight    Self Weight       Symbol Name
2.28 s    6.4%  0 s         _dispatch_worker_thread3  0x58467
2.24 s    6.3%  0 s         _dispatch_worker_thread3  0x58463
2.22 s    6.2%  0 s         _dispatch_worker_thread3  0x58464
2.21 s    6.2%  0 s         _dispatch_worker_thread3  0x58466
2.21 s    6.2%  0 s         _dispatch_worker_thread3  0x58465
2.21 s    6.2%  0 s         _dispatch_worker_thread3  0x58461
2.18 s    6.1%  0 s         _dispatch_worker_thread3  0x58462