Python Cuda并行内核

Python Cuda并行内核,python,parallel-processing,cuda,numba,Python,Parallel Processing,Cuda,Numba,我正在尝试在GPU上并行化一个简单的模拟更新循环。基本上,在每个更新循环中都会有一堆由圆圈表示的“生物”移动,然后会检查它们是否相交。半径是不同种类生物的半径 import numpy as np import math from numba import cuda @cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32, uint32)') d

我正在尝试在GPU上并行化一个简单的模拟更新循环。基本上,在每个更新循环中都会有一堆由圆圈表示的“生物”移动,然后会检查它们是否相交。半径是不同种类生物的半径

import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32, uint32)')
def update(p_x, p_y, radii, types, velocities, max_velocities, acceleration, num_creatures, cycles):
    """Move every creature for ``cycles`` steps and test each pair for overlap.

    The body never reads a thread or block index, so every thread that is
    launched executes the complete loop nest below over all creatures.

    Args:
        p_x, p_y: per-entity coordinates, updated in place.
        radii: radius lookup table, indexed by an entity's type code.
        types: per-entity type code used to index ``radii``.
        velocities: per-creature speed, updated in place.
        max_velocities: per-creature upper bound applied to ``velocities``.
        acceleration: amount added to every speed on each cycle.
        num_creatures: number of leading entries to process.
        cycles: number of update iterations to run.
    """
    for c in range(cycles):
        # Movement pass: accelerate, clamp to the per-creature limit, advance.
        for i in range(num_creatures):
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            # The constant 1.0 stands in for a per-creature heading angle.
            p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        # Overlap pass: compare squared distance with the squared radius sum.
        # NOTE: j starts at i, so each entity is also compared with itself.
        for i in range(num_creatures):
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                # Fixed: the second radius belongs to entity j, not entity i.
                sum_of_radii = radii[types[i]] + radii[types[j]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass  # placeholder: no reaction to an overlap yet


# Simulation tuning values.
acceleration = .1
creature_radius = 10
food_radius = 3
spacing = 20
cycles = 1

# Capacity limits and running counters.
max_num_creatures = 1500
max_num_food = 500
max_num_entities = max_num_creatures + max_num_food
num_creatures = 0
num_food = 0
num_entities = 0

# Entity type codes: 0 = male, 1 = female, 2 = food.
# ``radii`` is indexed by type code.
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)
p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)

# Lay the creatures out on a regular grid, alternating type 0 and type 1.
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        types[num_creatures] = num_creatures % 2
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1

# Copy the host arrays to the device.
device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)

# Fixed launch configuration: 16 blocks of 64 threads.
threadsperblock = 64
blockspergrid = 16
update[blockspergrid, threadsperblock](
    device_p_x, device_p_y, device_radii, device_types,
    device_velocities, device_max_velocities,
    acceleration, num_creatures, cycles)
print(device_p_x.copy_to_host())
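import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32, uint32)')
def update(p_x, p_y, radii, types, velocities, max_velocities, acceleration, num_creatures, cycles):
    for c in range(cycles):
        for i in range(num_creatures):
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        for i in range(num_creatures):
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                sum_of_radii = radii[types[i]] + radii[types[i]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass


acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3

max_num_creatures = 1500
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 1


p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)

velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1


device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = 16
update[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, device_velocities, device_max_velocities,
        acceleration, num_creatures, cycles)
print(device_p_x.copy_to_host())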
math.cos和math.sin中的1.0只是单个生物方向的占位符

import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32, uint32)')
def update(p_x, p_y, radii, types, velocities, max_velocities, acceleration, num_creatures, cycles):
    """Run ``cycles`` rounds of creature movement followed by a pairwise overlap test.

    No thread or block index is read anywhere in the body, so each launched
    thread walks the full loop nest over every creature.

    Args:
        p_x, p_y: per-entity coordinates, modified in place.
        radii: radius table looked up through an entity's type code.
        types: per-entity type code (index into ``radii``).
        velocities: per-creature speed, modified in place.
        max_velocities: per-creature cap for ``velocities``.
        acceleration: increment applied to each speed per cycle.
        num_creatures: count of leading entries that are processed.
        cycles: how many times the two passes are repeated.
    """
    for c in range(cycles):
        # Pass 1: speed up (clamped) and move each creature.
        for i in range(num_creatures):
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            # 1.0 is a placeholder for the individual creature's direction.
            p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        # Pass 2: circle-circle overlap test on squared quantities.
        # NOTE: the inner range begins at i, so the pair (i, i) is tested too.
        for i in range(num_creatures):
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                # Fixed: use entity j's radius for the second term (was types[i]).
                sum_of_radii = radii[types[i]] + radii[types[j]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass  # placeholder: overlap handling not implemented


# Physical parameters of the simulation.
acceleration = .1
spacing = 20
creature_radius = 10
food_radius = 3

# Array capacities.
max_num_creatures = 1500
max_num_food = 500
max_num_entities = max_num_creatures + max_num_food

# Live counters and the number of update cycles to run.
num_creatures = 0
num_food = 0
num_entities = 0
cycles = 1

# Host-side state. Type codes: male = 0, female = 1, food = 2.
p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)

# Populate a grid of creatures; even slots get type 0, odd slots type 1.
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        slot = num_creatures
        types[slot] = 0 if slot % 2 == 0 else 1
        p_x[slot] = x * spacing
        p_y[slot] = y * spacing
        max_velocities[slot] = 5
        num_creatures = slot + 1

# Transfer everything to the GPU.
device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)

# Launch with a hard-coded configuration and print the resulting x positions.
threadsperblock = 64
blockspergrid = 16
update[blockspergrid, threadsperblock](
    device_p_x,
    device_p_y,
    device_radii,
    device_types,
    device_velocities,
    device_max_velocities,
    acceleration,
    num_creatures,
    cycles)
print(device_p_x.copy_to_host())
现在有多个线程,但它们执行相同的代码。
要使内核并行化,我必须对其进行哪些更改?

对我来说,最明显的并行化维度似乎是内核中关于 `i` 的循环,即在 `num_creatures` 上迭代的循环。所以我将描述如何做到这一点。

  • 我们的目标是删除在 `num_creatures` 上迭代的循环,改为让循环的每次迭代由单独的 CUDA 线程处理。这是可行的,因为每次循环迭代中完成的工作(大部分)是独立的——它不依赖于其他循环迭代的结果(但请参见下面的第 2 点)。

  • 我们将遇到的一个挑战是:在 `num_creatures` 上迭代的第二个 `i` for 循环可能依赖于第一个 for 循环的结果。如果所有内容都作为串行代码在单个线程中运行,这种依赖关系会由串行执行的特性自然满足。然而,我们希望将其并行化。因此,我们需要在第一个 for 循环和第二个 for 循环之间进行一次全局同步。CUDA 中一种简单、方便的全局同步方式是内核启动,因此我们将把内核代码拆分为两个内核函数,分别称为 `update1` 和 `update2`。

  • 接下来的问题是如何处理最外层关于 `cycles` 的循环。我们不能简单地在两个内核中各复制一份该循环,因为这会改变函数行为——例如,我们会在计算任何一次 `delta_x` 之前,就先对 `p_x` 完成 `cycles` 次更新。这大概不是我们想要的。因此,为了简单起见,我们将把这个循环从内核代码中提出来,放回主机代码中。然后,主机代码将调用 `update1` 和 `update2` 内核,共进行 `cycles` 次迭代。

  • 我们还希望内核能够适应不同大小的 `num_creatures`。因此,我们将为 `threadsperblock` 选择一个硬编码的大小,但让启动的块数随 `num_creatures` 的大小而变化。为了实现这一点,我们需要在每个内核中进行线程检查(开头的 if 语句),这样“额外”的线程就不会做任何事情。

  • 有了这样的描述,我们最终会得到这样的结果:

    $ cat t11.py
    import numpy as np
    import math
    from numba import cuda
    
    
    @cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32, uint32)')
    def update1(p_x, p_y, velocities, max_velocities, acceleration, num_creatures):
        """Movement step: each thread advances the creature at its grid index.

        Args:
            p_x, p_y: per-entity coordinates, updated in place.
            velocities: per-creature speed, updated in place.
            max_velocities: per-creature upper bound applied to the speed.
            acceleration: amount added to the speed on this step.
            num_creatures: number of creatures; threads whose index is not
                below this value do nothing.
        """
        idx = cuda.grid(1)
        if idx >= num_creatures:
            # Thread index is past the last creature: nothing to do.
            return
        # Accelerate, then clamp to this creature's maximum.
        speed = velocities[idx] + acceleration
        if speed > max_velocities[idx]:
            speed = max_velocities[idx]
        velocities[idx] = speed
        # The constant 1.0 is the heading angle used for every creature here.
        p_x[idx] = p_x[idx] + (math.cos(1.0) * speed)
        p_y[idx] = p_y[idx] + (math.sin(1.0) * speed)
    
    @cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], uint32)')
    def update2(p_x, p_y, radii, types, num_creatures):
        """Overlap step: each thread tests its creature against creatures i..end.

        Args:
            p_x, p_y: per-entity coordinates (read only here).
            radii: radius lookup table, indexed by an entity's type code.
            types: per-entity type code used to index ``radii``.
            num_creatures: number of creatures; threads whose index is not
                below this value do nothing.
        """
        i = cuda.grid(1)
        if i < num_creatures:
            # NOTE: j starts at i, so the creature is also compared with itself.
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                # Fixed: the second radius belongs to entity j, not entity i.
                sum_of_radii = radii[types[i]] + radii[types[j]]
                # Compare squared values so no square root is required.
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass  # placeholder: no reaction to an overlap yet
    
    
    # Simulation tuning values.
    acceleration = .1
    creature_radius = 10
    food_radius = 3
    spacing = 20
    cycles = 1

    # Capacity limits and running counters.
    max_num_creatures = 1500
    max_num_food = 500
    max_num_entities = max_num_creatures + max_num_food
    num_creatures = 0
    num_food = 0
    num_entities = 0

    # Entity type codes: 0 = male, 1 = female, 2 = food.
    # ``radii`` is indexed by type code.
    radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
    types = np.zeros(max_num_entities, dtype=np.uint8)
    p_x = np.zeros(max_num_entities, dtype=np.float32)
    p_y = np.zeros(max_num_entities, dtype=np.float32)
    velocities = np.zeros(max_num_creatures, dtype=np.float32)
    max_velocities = np.zeros(max_num_creatures, dtype=np.float32)

    # Lay the creatures out on a regular grid, alternating type 0 and type 1.
    for x in range(1, 800 // spacing):
        for y in range(1, 600 // spacing):
            types[num_creatures] = num_creatures % 2
            p_x[num_creatures] = x * spacing
            p_y[num_creatures] = y * spacing
            max_velocities[num_creatures] = 5
            num_creatures += 1

    # Copy the host arrays to the device.
    device_p_x = cuda.to_device(p_x)
    device_p_y = cuda.to_device(p_y)
    device_radii = cuda.to_device(radii)
    device_types = cuda.to_device(types)
    device_velocities = cuda.to_device(velocities)
    device_max_velocities = cuda.to_device(max_velocities)

    # The block count follows num_creatures: one more than the whole-number
    # quotient, so the grid always holds more threads than there are creatures.
    threadsperblock = 64
    blockspergrid = (num_creatures // threadsperblock) + 1

    # The cycle loop lives on the host: each cycle launches the movement
    # kernel and then the overlap kernel.
    for i in range(cycles):
        update1[blockspergrid, threadsperblock](
            device_p_x, device_p_y, device_velocities, device_max_velocities,
            acceleration, num_creatures)
        update2[blockspergrid, threadsperblock](
            device_p_x, device_p_y, device_radii, device_types, num_creatures)
    print(device_p_x.copy_to_host())
    $ python t11.py
    [ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
    $
    
    我们看到,探查器报告原始的 `t12.py` 版本只有一个 `update` 内核在运行,使用 1 个块和 1 个线程,耗时 1.8453 毫秒。对于本答案中发布的修改后的 `t11.py` 版本,探查器报告 `update1` 和 `update2` 内核各以 18 个块、每块 64 个线程运行,这两个内核的总执行时间约为 5.47 + 1.12 = 6.59 微秒。

    编辑:根据评论中的一些讨论,应该可以把两个内核合并为一个内核,方法是为 `p_x` 和 `p_y` 各使用两组缓冲区,并在每次迭代后交换它们的读写角色:
    $ cat t11.py
    import numpy as np
    import math
    from numba import cuda
    
    
    @cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32)')
    def update(p_x, p_y, p_x_new, p_y_new, radii, types, velocities, max_velocities, acceleration, num_creatures):
        """Combined step: move one creature and run its overlap test in one kernel.

        Positions are read from ``p_x``/``p_y`` and the advanced positions are
        written to ``p_x_new``/``p_y_new``; the overlap test reads only the
        input arrays ``p_x``/``p_y``.

        Args:
            p_x, p_y: input coordinates (read only here).
            p_x_new, p_y_new: output coordinates written by this step.
            radii: radius lookup table, indexed by an entity's type code.
            types: per-entity type code used to index ``radii``.
            velocities: per-creature speed, updated in place.
            max_velocities: per-creature upper bound applied to the speed.
            acceleration: amount added to the speed on this step.
            num_creatures: number of creatures; threads whose index is not
                below this value do nothing.
        """
        i = cuda.grid(1)
        if i < num_creatures:
            # Accelerate and clamp to this creature's maximum speed.
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            # The constant 1.0 is the heading angle used for every creature here.
            p_x_new[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y_new[i] = p_y[i] + (math.sin(1.0) * velocities[i])
            # NOTE: j starts at i, so the creature is also compared with itself.
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                # Fixed: the second radius belongs to entity j, not entity i.
                sum_of_radii = radii[types[i]] + radii[types[j]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass  # placeholder: no reaction to an overlap yet
    
    
    # Simulation tuning values.
    acceleration = .1
    creature_radius = 10
    food_radius = 3
    spacing = 20
    cycles = 2

    # Capacity limits and running counters.
    max_num_creatures = 1500000
    max_num_food = 500
    max_num_entities = max_num_creatures + max_num_food
    num_creatures = 0
    num_food = 0
    num_entities = 0

    # Entity type codes: 0 = male, 1 = female, 2 = food.
    # ``radii`` is indexed by type code.
    radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
    types = np.zeros(max_num_entities, dtype=np.uint8)
    p_x = np.zeros(max_num_entities, dtype=np.float32)
    p_y = np.zeros(max_num_entities, dtype=np.float32)
    velocities = np.zeros(max_num_creatures, dtype=np.float32)
    max_velocities = np.zeros(max_num_creatures, dtype=np.float32)

    # Lay the creatures out on a regular grid, alternating type 0 and type 1.
    for x in range(1, 80000 // spacing):
        for y in range(1, 6000 // spacing):
            types[num_creatures] = num_creatures % 2
            p_x[num_creatures] = x * spacing
            p_y[num_creatures] = y * spacing
            max_velocities[num_creatures] = 5
            num_creatures += 1

    # Copy the host arrays to the device; the position arrays are uploaded
    # twice so that there are two device buffers per coordinate.
    device_p_x = cuda.to_device(p_x)
    device_p_y = cuda.to_device(p_y)
    device_p_x_new = cuda.to_device(p_x)
    device_p_y_new = cuda.to_device(p_y)
    device_radii = cuda.to_device(radii)
    device_types = cuda.to_device(types)
    device_velocities = cuda.to_device(velocities)
    device_max_velocities = cuda.to_device(max_velocities)

    threadsperblock = 64
    blockspergrid = (num_creatures // threadsperblock) + 1

    # Alternate buffer roles: even cycles read the base buffers and write the
    # "_new" ones, odd cycles do the opposite.
    for i in range(cycles):
        if i % 2:
            src_x, src_y = device_p_x_new, device_p_y_new
            dst_x, dst_y = device_p_x, device_p_y
        else:
            src_x, src_y = device_p_x, device_p_y
            dst_x, dst_y = device_p_x_new, device_p_y_new
        update[blockspergrid, threadsperblock](
            src_x, src_y, dst_x, dst_y, device_radii, device_types,
            device_velocities, device_max_velocities, acceleration, num_creatures)

    print(device_p_x_new.copy_to_host())
    print(device_p_x.copy_to_host())
    $ python t11.py
    [ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
    [ 20.1620903  20.1620903  20.1620903 ...,   0.          0.          0.       ]
    $