使用 Numba cuda.jit 在 GPU 上用 Python 进行蒙特卡罗(Monte Carlo)Pi 估计

使用Numba Cuda.jit在GPU上用Python进行Monte Carlo Pi估计,python,montecarlo,pi,Python,Montecarlo,Pi,因此,我试图运行我的程序在谷歌Colab使用他们的特斯拉T4 GPU可用。我正在使用Numba实现@cuda.jit,我想知道为什么当我进行估算时,我发现它在CPU上比GPU上运行得更快。我的GPU代码实现有什么问题吗?或者它不能运行得更快,我假设它应该运行得更快。我确信有更快的方法可以做到这一点,但我只是简单地尝试一下,在我进一步优化它之前,先看看它对我有什么意义 import numpy as np import matplotlib.pyplot as plt import time fr

我想在 Google Colab 上利用其提供的 Tesla T4 GPU 运行我的程序。我使用 Numba 的 @cuda.jit 实现了 GPU 版本,但在进行估算时发现,它在 CPU 上反而比在 GPU 上运行得更快,我想知道原因。我的 GPU 代码实现有什么问题吗?还是说它本来就无法运行得更快?我原以为 GPU 版本应该更快。我确信还有更快的实现方式,但目前我只是想先做一个简单的尝试,在进一步优化之前先弄清楚这个结果意味着什么。

import numpy as np
import matplotlib.pyplot as plt
import time
from random import *
from numba import jit, cuda, njit
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32

# 10-dimensional Monte Carlo hit counter used to estimate pi (CPU version).
def pi_value(trial):
    """Count random points of the unit 10-cube that lie inside the unit 10-ball.

    Each trial draws ten independent coordinates with ``random()`` and
    registers a hit when the point is within distance 1 of the origin.

    Args:
        trial: Number of random points to sample.

    Returns:
        The number of sampled points that fell inside the unit ball.
    """
    dims = 10
    hit = 0
    for _trial in range(trial):
        radius_sq = 0.0
        # Every coordinate is drawn and used exactly once; the earlier version
        # squared x6 twice and never used x7.
        for _axis in range(dims):
            coord = random()
            radius_sq += coord * coord
        # sqrt(r^2) <= 1 holds exactly when r^2 <= 1, so the fractional power
        # is unnecessary.
        if radius_sq <= 1.0:
            hit += 1
    return hit

# Time the CPU estimator and report the resulting estimate of pi.
iter10 = 10000000  # number of random points per run
dimen = 10         # dimension reported in the output message

# First run: neither its hit count nor its timing is used below.
# NOTE(review): looks like a warm-up pass — confirm whether it is still needed.
start = time.time()
hit = pi_value(iter10)
end = time.time()

# Second run: this is the one that is timed and reported.
start1 = time.time()
hit1 = pi_value(iter10)
end1 = time.time()
run_time = end1 - start1

# The hit fraction estimates V_10 / 2**10 = pi**5 / (120 * 1024) = pi**5 / 122880,
# so pi is recovered as the fifth root of 122880 * fraction.
piv = (122880 * (hit1 / iter10))**(1/5)

# `trials` must be the sample count `iter10`; the previous `iterations` name
# is not defined at this point and raised NameError.
print("For the {dimen} sphere with {trials} random points, the value of pi is estimated to    be {pi}, and executed in {run_time} seconds.".format(dimen=dimen, trials=iter10, pi=piv, run_time=run_time))

# This is the 10 sphere run on GPU.
# NOTE: this definition reuses the name `pi_value` and therefore replaces the
# CPU function of the same name defined earlier in the file.
@cuda.jit
def pi_value(rng_states, iterations, out):
    """CUDA kernel: each thread produces its own Monte Carlo estimate of pi.

    Every thread samples ``iterations`` points, each with ten coordinates
    drawn from its own xoroshiro128p generator state, counts the points
    inside the unit 10-ball and writes its estimate to ``out[thread_id]``.

    Args:
        rng_states: Per-thread xoroshiro128p states, indexed by thread id.
        iterations: Number of points sampled by each thread.
        out: 1-D array receiving one estimate per thread.
    """
    thread_id = cuda.grid(1)
    # Guard against a launch with more threads than output slots.
    if thread_id >= out.shape[0]:
        return

    hit = 0
    for i in range(iterations):
        x1 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x2 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x3 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x4 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x5 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x6 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x7 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x8 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x9 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        x10 = xoroshiro128p_uniform_float32(rng_states, thread_id)
        # Each coordinate is used exactly once (the earlier version squared x6
        # twice and skipped x7). Comparing the squared radius with 1 is
        # equivalent to comparing the radius and avoids a fractional power.
        radius_sq = (x1 * x1 + x2 * x2 + x3 * x3 + x4 * x4 + x5 * x5
                     + x6 * x6 + x7 * x7 + x8 * x8 + x9 * x9 + x10 * x10)
        if radius_sq <= 1.0:
            hit += 1

    # The hit fraction estimates pi**5 / 122880, hence the fifth root.
    out[thread_id] = (122880 * (hit / iterations))**(1/5)


# Launch configuration: 32 blocks of 128 threads, one RNG state and one
# output slot per thread.
threads_per_block = 128
blocks = 32
total_threads = blocks * threads_per_block

rng_states = create_xoroshiro128p_states(total_threads, seed=1)
out = np.zeros(total_threads, dtype=np.float32)

# Every thread samples 10000000 points and stores its own estimate in `out`;
# the reported value is the mean over all threads.
pi_value[blocks, threads_per_block](rng_states, 10000000, out)
print('pi:', out.mean())
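import numpy as np
import matplotlib.pyplot as plt
import time
from random import *
from numba import jit, cuda, njit
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
# This is the 10 sphere pi estimation using Monte Carlo.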
def pi_值(试用):
命中率=0
对于范围内的i(试验):
x1=随机()
x2=随机()
x3=随机()
x4=随机()
x5=随机数()
x6=随机()
x7=随机()
x8=随机数()
x9=随机数()
x10=随机()
如果(x1**2+x2**2+x3**2+x4**2+x5**2+x6**2+x8**2+x9**2+x10**2)**(1/2)