Swift: Why are these simple Metal GPU compute kernels slower than, or no faster than, the CPU implementation?

Tags: swift, gpgpu, metal, gpu

I've written a lattice-dynamics simulation in Metal/Swift on macOS. It consists of nothing but highly parallel multiplications and additions, yet I still can't get the Metal/GPU version to beat the CPU (a 6-core i5 vs. a Radeon Pro 5300).

The code is supposed to execute the kernel 78k times on a data set of 46,080 floats.

Edit: the 78k iterations have to run sequentially, because they correspond to the simulation's time steps, and each iteration involves ~500k (highly parallel) floating-point operations.

Is there something fundamental I'm missing?
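(For scale: the dispatch in the host code below launches 1 x 2 x 30 threadgroups of 24 x 32 x 1 threads, i.e. 46,080 threads per step, and the kernel performs roughly a dozen multiplications and additions per thread, which is consistent with the ~500k figure.)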

GPU code:

kernel void eom(const device float *mtK     [[ buffer(0) ]],
                const device float *mtKnl   [[ buffer(1) ]],
                const device float *mtB     [[ buffer(2) ]],
                const device float *mtKh    [[ buffer(3) ]],
                const device float *mtKv    [[ buffer(4) ]],
                const device float *exm     [[ buffer(5) ]],
                const device float *exwfm   [[ buffer(6) ]],
                const device float *sourceX [[ buffer(7) ]],
                const device float *sourceV [[ buffer(8) ]],
                      device float *dest    [[ buffer(9) ]],
                uint3 id                    [[ thread_position_in_grid ]]) {

    // basisOffset, linestride and blockstride are presumably compile-time
    // constants defined elsewhere in the shader source (not shown).
    uint materialpoint = basisOffset + id.x + linestride*id.y;
    uint samplepoint   = basisOffset + id.x + linestride*id.y + blockstride*id.z;

    // Per-point material parameters; khp/kvp come from the neighbouring points.
    float k    = mtK   [materialpoint];
    float b    = mtB   [materialpoint];
    float knl  = mtKnl [materialpoint];
    float kh   = mtKh  [materialpoint];
    float kv   = mtKv  [materialpoint];
    float khp  = mtKh  [materialpoint+linestride];
    float kvp  = mtKv  [materialpoint+1];
    // Excitation: the waveform sample for this z-slice times a spatial mask.
    float ex   = exwfm [id.z]*exm[materialpoint];

    // Damping + nonlinear restoring force + nearest-neighbour coupling + excitation.
    dest[samplepoint] = -sourceV[samplepoint]*b
                      - sourceX[samplepoint]*(k + knl*sourceX[samplepoint]*sourceX[samplepoint])
                      + kv*sourceX[samplepoint-1]
                      + kh*sourceX[samplepoint-linestride]
                      + kvp*sourceX[samplepoint+1]
                      + khp*sourceX[samplepoint+linestride]
                      + ex;
}
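The kernel reads basisOffset, linestride and blockstride without declaring them, so they presumably come from elsewhere in the shader source. If they were instead exposed as Metal function constants, the host could specialize the kernel at pipeline-creation time. A minimal sketch of that alternative; the constant indices and values here are assumptions, not from the original code:

// Hypothetical: specialize the kernel at pipeline creation, assuming the
// shader declared e.g. `constant uint basisOffset [[ function_constant(0) ]]`.
let constants = MTLFunctionConstantValues()
var basisOffset: UInt32 = 0      // placeholder values, not from the question
var linestride:  UInt32 = 24
var blockstride: UInt32 = 1536
constants.setConstantValue(&basisOffset, type: .uint, index: 0)
constants.setConstantValue(&linestride,  type: .uint, index: 1)
constants.setConstantValue(&blockstride, type: .uint, index: 2)
computeDescriptor.computeFunction = try library.makeFunction(name: "eom", constantValues: constants)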
Swift host code:

let threadgroup_dx = 24
let threadgroup_dy = 32
let threadgroup_dz = 1

let red_widthG = 1
let red_heightG = 2
let waveformStrideG = 30

let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()!
let library = try device.makeLibrary(filepath: "compute.metallib")

// Initialize buffers
let dydxV = device.makeBuffer(length: totalExtendedSize*MemoryLayout<Float>.stride, options: MTLResourceOptions.storageModeManaged)!
[...] // More buffers and loading code go here

// Create pipeline state
let computeDescriptor = MTLComputePipelineDescriptor()
computeDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true
computeDescriptor.computeFunction = library.makeFunction(name: "eom")
let pipEOM = try device.makeComputePipelineState(descriptor: computeDescriptor, options: [], reflection: nil)

let commandBuffer = commandQueue.makeCommandBuffer()!
let encoder = commandBuffer.makeComputeCommandEncoder(dispatchType: MTLDispatchType.serial)!

// Computation: one dispatch per time step, all 78k of them encoded
// into a single serial compute command encoder.
for i in 1...78000 {
    let offset = i*numSamples // byte offset of this step's excitation waveform
    encoder.setComputePipelineState(pipEOM)
    encoder.setBuffer(mtrK,     offset: 0,      index: 0)
    encoder.setBuffer(mtrKnl,   offset: 0,      index: 1)
    encoder.setBuffer(mtrB,     offset: 0,      index: 2)
    encoder.setBuffer(mtrKh,    offset: 0,      index: 3)
    encoder.setBuffer(mtrKv,    offset: 0,      index: 4)
    encoder.setBuffer(exm,      offset: 0,      index: 5)
    encoder.setBuffer(wvf,      offset: offset, index: 6)
    encoder.setBuffer(yX,       offset: 0,      index: 7)
    encoder.setBuffer(yV,       offset: 0,      index: 8)
    encoder.setBuffer(dydxV,    offset: 0,      index: 9)
    // Grid: 1x2x30 threadgroups of 24x32x1 threads = 24 x 64 x 30 = 46,080 threads per step.
    let numThreadGroups = MTLSize(width: red_widthG, height: red_heightG, depth: waveformStrideG)
    let threadsPerThreadgroup = MTLSize(width: threadgroup_dx, height: threadgroup_dy, depth: threadgroup_dz)
    encoder.dispatchThreadgroups(numThreadGroups, threadsPerThreadgroup: threadsPerThreadgroup)
}

encoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
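Since all 78,000 dispatches are encoded into a single command buffer, it is hard to tell how much of the wall-clock time is CPU-side encoding versus GPU execution. One way to separate the two is to read the command buffer's GPU timestamps; a minimal sketch, assuming macOS 10.15+ (the handler must be registered before commit()):

commandBuffer.addCompletedHandler { cb in
    // gpuStartTime/gpuEndTime bracket actual GPU execution,
    // excluding the CPU time spent encoding the 78k dispatches.
    print("GPU time: \(cb.gpuEndTime - cb.gpuStartTime) s")
}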

You are sending 78,000 small batches of work to the GPU, which is highly suboptimal. Normally, you fill a buffer with all of the work items and then dispatch one big chunk of work at once.
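The time steps here do have to run in order, so they cannot literally be fused into a single dispatch, but the per-step encoding cost can still be cut down: bind the pipeline state and the buffers once, and only advance the waveform offset between steps. A minimal sketch, reusing the names from the host code above (behaviour otherwise unchanged):

encoder.setComputePipelineState(pipEOM)
encoder.setBuffer(mtrK,   offset: 0, index: 0)
encoder.setBuffer(mtrKnl, offset: 0, index: 1)
encoder.setBuffer(mtrB,   offset: 0, index: 2)
encoder.setBuffer(mtrKh,  offset: 0, index: 3)
encoder.setBuffer(mtrKv,  offset: 0, index: 4)
encoder.setBuffer(exm,    offset: 0, index: 5)
encoder.setBuffer(wvf,    offset: 0, index: 6)
encoder.setBuffer(yX,     offset: 0, index: 7)
encoder.setBuffer(yV,     offset: 0, index: 8)
encoder.setBuffer(dydxV,  offset: 0, index: 9)
let numThreadGroups = MTLSize(width: red_widthG, height: red_heightG, depth: waveformStrideG)
let threadsPerThreadgroup = MTLSize(width: threadgroup_dx, height: threadgroup_dy, depth: threadgroup_dz)

for i in 1...78000 {
    // setBufferOffset is much cheaper than re-binding all ten buffers;
    // the offset is in bytes, exactly like setBuffer's offset parameter.
    encoder.setBufferOffset(i*numSamples, index: 6)
    encoder.dispatchThreadgroups(numThreadGroups, threadsPerThreadgroup: threadsPerThreadgroup)
}

Going further, the 78k steps could be split across a handful of command buffers committed as they fill up, so the GPU can start executing earlier steps while the CPU is still encoding the later ones.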