Swift:为什么这些简单的 Metal GPU 计算内核比 CPU 实现慢或相等?
我在 macOS 上使用 Metal/Swift 编写了一个晶格动力学模拟。它只包含高度并行的乘法和加法,但我仍然无法让 Metal/GPU 打败 CPU(6 核 i5 vs Radeon Pro 5300)。代码应该在由 46080 个浮点数组成的数据集上执行内核 78k 次。
编辑:这 78k 次迭代必须按顺序执行,因为它们对应于模拟的时间步长;每次迭代都涉及约 50 万次(高度并行的)浮点运算。有什么基本的东西是我遗漏的吗?
GPU 代码:
// Equation-of-motion kernel for one time step of the lattice-dynamics
// simulation: for every lattice site it combines damping, a (non)linear
// restoring force, coupling to the four in-plane neighbours, and an
// external drive, writing the resulting acceleration into `dest`.
//
// NOTE(review): `basisOffset`, `linestride` and `blockstride` are not
// declared in this excerpt — presumably shader-source constants or
// function constants defined elsewhere; confirm against the full .metal file.
kernel void eom(const device float *mtK [[ buffer(0) ]],
const device float *mtKnl [[ buffer(1) ]],
const device float *mtB [[ buffer(2) ]],
const device float *mtKh [[ buffer(3) ]],
const device float *mtKv [[ buffer(4) ]],
const device float *exm [[ buffer(5) ]],
const device float *exwfm [[ buffer(6) ]],
const device float *sourceX [[ buffer(7) ]],
const device float *sourceV [[ buffer(8) ]],
device float *dest [[ buffer(9) ]],
uint3 id [[ thread_position_in_grid ]]) {
// 2-D site index into the per-material-point parameter arrays.
uint site = basisOffset + id.x + linestride*id.y;
// State index: id.z selects one of the stacked sample blocks.
uint state = site + blockstride*id.z;

// Local displacement and velocity for this site.
float x = sourceX[state];
float v = sourceV[state];

// Per-site coefficients: linear/nonlinear stiffness, damping, and the
// horizontal/vertical coupling constants (plus the neighbours' own
// coupling constants on the far side of the stencil).
float k   = mtK  [site];
float b   = mtB  [site];
float knl = mtKnl[site];
float kh  = mtKh [site];
float kv  = mtKv [site];
float khp = mtKh [site+linestride];
float kvp = mtKv [site+1];

// External drive: time-step waveform sample scaled by a per-site mask.
float drive = exwfm[id.z]*exm[site];

// Accumulate in the same left-to-right order as the original expression
// so the floating-point result is unchanged.
float accel = -v*b;
accel -= x*(k + knl*x*x);
accel += kv *sourceX[state-1];
accel += kh *sourceX[state-linestride];
accel += kvp*sourceX[state+1];
accel += khp*sourceX[state+linestride];
accel += drive;

dest[state] = accel;
}
CPU 端的 Swift 主机代码(负责向 GPU 编码并提交内核):
// Threadgroup geometry: 24×32×1 threads per group, dispatched over a
// 1×2×30 grid of groups — one such launch per simulation time step.
let threadgroup_dx = 24
let threadgroup_dy = 32
let threadgroup_dz = 1
let red_widthG = 1
let red_heightG = 2
let waveformStrideG = 30

// Metal setup: default device, one command queue, and the precompiled
// compute library containing the "eom" kernel.
let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()!
let library = try device.makeLibrary(filepath: "compute.metallib")

// Initialize buffers
let dydxV = device.makeBuffer(length: totalExtendedSize*MemoryLayout<Float>.stride,
                              options: .storageModeManaged)!
[...] // More buffers and loading code go here

// Build the compute pipeline state for the "eom" kernel.
let computeDescriptor = MTLComputePipelineDescriptor()
computeDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true
computeDescriptor.computeFunction = library.makeFunction(name: "eom")
let pipEOM = try device.makeComputePipelineState(descriptor: computeDescriptor,
                                                 options: [],
                                                 reflection: nil)

// Every time step is encoded into a single command buffer on one
// serially-dispatched compute encoder.
let commandBuffer = commandQueue.makeCommandBuffer()!
let encoder = commandBuffer.makeComputeCommandEncoder(dispatchType: .serial)!

// Grid geometry never changes between iterations, so build it once.
let numThreadGroups = MTLSize(width: red_widthG, height: red_heightG, depth: waveformStrideG)
let threadsPerThreadgroup = MTLSize(width: threadgroup_dx, height: threadgroup_dy, depth: threadgroup_dz)

// Computation: encode one dispatch per time step (78k total).
for i in 1...78000 {
    // Advance the excitation-waveform buffer by one step's worth of samples.
    let offset = i*numSamples
    encoder.setComputePipelineState(pipEOM)
    // Material-parameter buffers occupy indices 0–5 and are always bound
    // at offset 0.
    for (index, buffer) in [mtrK, mtrKnl, mtrB, mtrKh, mtrKv, exm].enumerated() {
        encoder.setBuffer(buffer, offset: 0, index: index)
    }
    encoder.setBuffer(wvf, offset: offset, index: 6)   // time-varying waveform
    encoder.setBuffer(yX, offset: 0, index: 7)         // displacements
    encoder.setBuffer(yV, offset: 0, index: 8)         // velocities
    encoder.setBuffer(dydxV, offset: 0, index: 9)      // output accelerations
    encoder.dispatchThreadgroups(numThreadGroups, threadsPerThreadgroup: threadsPerThreadgroup)
}

encoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let threadgroup_dx = 24
let threadgroup_dy = 32
let threadgroup_dz = 1
let red_widthG = 1
let red_heightG = 2
let waveformStrideG = 30
let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()!
let library = try device.makeLibrary(filepath: "compute.metallib")
// 初始化缓冲区
let dydxV = device.makeBuffer(length: totalExtendedSize*MemoryLayout<Float>.stride, options: MTLResourceOptions.storageModeManaged)!
[...] // 更多的缓冲区和加载代码在这里
// 创建管线状态
let computeDescriptor = MTLComputePipelineDescriptor()
computeDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true
computeDescriptor.computeFunction = library.makeFunction(name: "eom")
let pipEOM = try device.makeComputePipelineState(descriptor: computeDescriptor, options: [], reflection: nil)
let commandBuffer = commandQueue.makeCommandBuffer()!
let encoder = commandBuffer.makeComputeCommandEncoder(dispatchType: MTLDispatchType.serial)!
// 计算
for i in 1...78000 {
    let offset = i*numSamples
    encoder.setComputePipelineState(pipEOM)
    encoder.setBuffer(mtrK, offset: 0, index: 0)
    encoder.setBuffer(mtrKnl, offset: 0, index: 1)
    encoder.setBuffer(mtrB, offset: 0, index: 2)
    encoder.setBuffer(mtrKh, offset: 0, index: 3)
    encoder.setBuffer(mtrKv, offset: 0, index: 4)
    encoder.setBuffer(exm, offset: 0, index: 5)
    encoder.setBuffer(wvf, offset: offset, index: 6)
    encoder.setBuffer(yX, offset: 0, index: 7)
    encoder.setBuffer(yV, offset: 0, index: 8)
    encoder.setBuffer(dydxV, offset: 0, index: 9)
    let numThreadGroups = MTLSize(width: red_widthG, height: red_heightG, depth: waveformStrideG)
    let threadsPerThreadgroup = MTLSize(width: threadgroup_dx, height: threadgroup_dy, depth: threadgroup_dz)
    encoder.dispatchThreadgroups(numThreadGroups, threadsPerThreadgroup: threadsPerThreadgroup)
}
encoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
您向 GPU 分派了 78000 次小批量工作,这是非常次优的。通常的做法是:用所有工作项填满缓冲区,然后一次性分派一大块工作。