Python 用NumPy和PyTorch在GPU上求解线性方程组
我正在尽可能快地解很多线性方程组。为了找出我在CPU和GeForce 1080 GPU(用于NumPy)上进行基准测试和测试的最快方法。结果真把我弄糊涂了 这是我在Python 3.8中使用的代码:Python 用NumPy和PyTorch在GPU上求解线性方程组,python,numpy,pytorch,gpu,numba,Python,Numpy,Pytorch,Gpu,Numba,我正在尽可能快地解很多线性方程组。为了找出我在CPU和GeForce 1080 GPU(用于NumPy)上进行基准测试和测试的最快方法。结果真把我弄糊涂了 这是我在Python 3.8中使用的代码: import timeit import torch import numpy from numba import njit def solve_numpy_cpu(dim: int = 5): a = numpy.random.rand(dim, dim) b = numpy
import timeit
import torch
import numpy
from numba import njit
def solve_numpy_cpu(dim: int = 5):
    """Benchmark body: solve one random ``dim x dim`` system 1000 times on the CPU.

    Results are intentionally discarded; only the repeated solve cost matters.
    """
    coefficients = numpy.random.rand(dim, dim)
    right_hand_side = numpy.random.rand(dim)
    for _repetition in range(1000):
        numpy.linalg.solve(coefficients, right_hand_side)
def solve_numpy_njit_a(dim: int = 5):
    """Compile ``solve_numpy_cpu`` with numba and actually execute it.

    Bug fixed: the original ``njit(solve_numpy_cpu, dim=dim)`` passed ``dim``
    as a (nonexistent) numba compiler option and never invoked the returned
    dispatcher, so nothing was compiled or solved — the timing measured only
    the cost of creating the dispatcher object.  NOTE: the timing of this
    variant still includes JIT compilation, since compilation happens on the
    first call inside the timed region.
    """
    compiled = njit(solve_numpy_cpu)
    compiled(dim)
@njit
def solve_numpy_njit_b(dim: int = 5):
    """Same benchmark body as ``solve_numpy_cpu``, but compiled by numba."""
    matrix = numpy.random.rand(dim, dim)
    vector = numpy.random.rand(dim)
    for _ in range(1000):
        numpy.linalg.solve(matrix, vector)
def solve_torch_cpu(dim: int = 5):
    """Solve one random ``dim x dim`` system 1000 times with PyTorch tensors.

    Runs on whatever the current default tensor type/device is (CPU unless
    ``solve_torch_gpu`` changed it).  Uses ``torch.linalg.solve``:
    ``torch.solve(b, a)`` was deprecated in PyTorch 1.9 and removed in 1.13,
    and the replacement takes its arguments in (A, B) order.
    """
    a = torch.rand(dim, dim)
    b = torch.rand(dim, 1)
    for _ in range(1000):
        torch.linalg.solve(a, b)
def solve_torch_gpu(dim: int = 5):
    """Run the PyTorch solve benchmark with CUDA tensors as the default type.

    Bug fixed: the original changed the process-wide default tensor type and
    never restored it, so every torch benchmark that ran afterwards was
    silently moved to the GPU as well.  The previous default is now restored
    in a ``finally`` block.  Requires a CUDA-capable device.
    """
    previous_type = torch.tensor([]).type()  # name of the current default tensor type
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    try:
        solve_torch_cpu(dim=dim)
    finally:
        torch.set_default_tensor_type(previous_type)
def main():
    """Time each solver variant once with ``timeit`` and print the results."""
    benchmarks = (
        solve_numpy_cpu,
        solve_torch_cpu,
        solve_torch_gpu,
        solve_numpy_njit_a,
        solve_numpy_njit_b,
    )
    for benchmark in benchmarks:
        elapsed = timeit.timeit(benchmark, number=1)
        print(f"{benchmark.__name__:<20s}: {elapsed:f}")


if __name__ == "__main__":
    main()
速度最慢的是CUDA。我验证了PyTorch正在使用我的GPU
import torch
torch.cuda.is_available()
torch.cuda.get_device_name(0)
返回
True
'GeForce GTX 1080'
我可以做到这一点,在CPU上,PyTorch比NumPy慢。我无法理解的是,为什么GPU上的PyTorch速度要慢得多。没那么重要,但实际上更令人困惑的是,Numba的njit
decorator使性能降低了几个数量级,直到您不再使用@decorator语法为止。
这是我的安排吗?偶尔我会收到一条奇怪的消息,说windows页面/交换文件不够大。如果我在GPU上解决线性方程组时走了一条完全模糊的道路,我很乐意被引导到另一个方向
编辑 所以,我把重点放在了Numba上,并稍微改变了我的基准。正如@max9111所建议的那样,我重写了函数以接收输入并生成输出,因为最终,任何人都希望使用它们。现在,我还为Numba加速函数执行了第一次编译运行,因此后续的计时更加公平。最后,我根据矩阵大小检查了性能,并绘制了结果 TL/DR:对于500x500的矩阵大小,Numba加速度并不会对其产生任何影响 代码如下:
import time
from typing import Tuple
import numpy
from matplotlib import pyplot
from numba import jit
@jit(nopython=True)
def solve_numpy_njit(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    """Numba-compiled linear solve: return ``x`` such that ``a @ x == b``."""
    return numpy.linalg.solve(a, b)
def solve_numpy(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    """Plain NumPy linear solve: return ``x`` such that ``a @ x == b``."""
    return numpy.linalg.solve(a, b)
def get_data(dim: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """Generate one random linear system: a ``dim x dim`` matrix and a ``dim`` vector."""
    matrix = numpy.random.random((dim, dim))
    vector = numpy.random.random(dim)
    return matrix, vector
def main():
    """Benchmark plain vs. numba-compiled solves across matrix sizes and plot both curves."""
    warm_a, warm_b = get_data(10)
    # Run the jitted solver once so JIT compilation cost is excluded from the timings.
    solve_numpy_njit(warm_a, warm_b)

    sizes = [10 * (step + 1) for step in range(50)]
    plain_times = []
    jitted_times = []
    timings = plain_times, jitted_times  # index-aligned with the solver tuple below

    for size in sizes:
        for solver_index, solver in enumerate((solve_numpy, solve_numpy_njit)):
            running_mean = -1.0
            for sample in range(5):
                started = time.time()
                # Timed region deliberately includes data generation, as in the question.
                for _ in range(100):
                    matrix, rhs = get_data(size)
                    solver(matrix, rhs)
                elapsed = time.time() - started
                print(f"{size:d} {solver.__name__:<30s}: {elapsed:f}")
                running_mean = (running_mean * sample + elapsed) / (sample + 1)
            timings[solver_index].append(running_mean)

    pyplot.plot(sizes, plain_times, label="not numba")
    pyplot.plot(sizes, jitted_times, label="numba")
    pyplot.legend()
    pyplot.show()


if __name__ == "__main__":
    main()
导入时间
从输入导入元组开始
进口numpy
从matplotlib导入pyplot
从numba导入jit
@jit(nopython=True)
def solve_numpy_njit(a:numpy.ndarray,b:numpy.ndarray)->numpy.ndarray:
参数=numpy.linalg.solve(a,b)
返回参数
def solve_numpy(a:numpy.ndarray,b:numpy.ndarray)->numpy.ndarray:
参数=numpy.linalg.solve(a,b)
返回参数
def get_data(dim:int)->Tuple[numpy.ndarray,numpy.ndarray]:
a=numpy.random.random((尺寸,尺寸))
b=numpy.random.random(尺寸)
返回a,b
def main():
a、 b=获取数据(10)
#编译函数
p=求解(a,b)
矩阵_大小=[(x+1)*10表示范围内的x(50)]
非加速=[]
加速=[]
结果=非加速,加速
对于j,枚举中的每个矩阵大小(矩阵大小):
对于枚举中的m,f((solve_numpy,solve_numpy_njit)):
平均时间=-1。
对于范围(5)内的k:
time_start = time.time()
对于范围(100)内的i:
a, b = 获取数据(每个矩阵大小)
p = f(a, b)
d_t = time.time() - time_start
print(f"{each_matrix_size:d} {f.__name__:<30s}: {d_t:f}")
（以下为评论区讨论）使用带有输出的基准测试。在像Numba这样的编译器中，通常有一个死代码消除——如果它生效，您只测量了函数调用，但没有进行计算（njit_a）。也应避免默认参数值，这可能导致重复的重新编译，而重新编译是很慢的（njit_b）。对于这些小问题（不可并行、数据传输慢、函数调用也可能慢），GPU通常会慢得多。——@max9111 感谢您的评论！我将研究使用输出并删除默认值。关于解线性方程组的并行性：我认为基本的高斯消去法是可并行的，这就是为什么我希望从GPU加速中受益。我在这里错了吗？——您的真实世界用例是什么？解决大量的微小方程组：在CPU上并行计算它们（注意：parfor）；解决更大的方程组：它们本身就可以在CPU或GPU上并行化。但您也应该考虑到GPU在单精度计算上要快得多。——@max9111 这是一个在线回归算法，它用每个训练样例更新其参数，因此每个方程组都是顺序求解的。然而，求解本身应该是可并行的；据我所知，使用雅可比方法应该是可行的，但对于5x5矩阵来说并非如此——矩阵必须大得多，并行化才有收益。使用MKL的NumPy或Numba调用相同的求解算法时，会对更大的问题自动并行化。
import time
from typing import Tuple
import numpy
from matplotlib import pyplot
from numba import jit
@jit(nopython=True)
def solve_numpy_njit(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    """Solve ``a @ x = b`` under numba's nopython compilation."""
    solution = numpy.linalg.solve(a, b)
    return solution
def solve_numpy(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    """Solve ``a @ x = b`` with plain (interpreted) NumPy."""
    return numpy.linalg.solve(a, b)
def get_data(dim: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """Return a random ``dim x dim`` coefficient matrix and a length-``dim`` RHS vector."""
    return numpy.random.random((dim, dim)), numpy.random.random(dim)
def main():
    """Plot the solve time of the plain and numba-compiled solvers over matrix size."""
    a0, b0 = get_data(10)
    # First call triggers numba compilation so the timings below are post-JIT.
    solve_numpy_njit(a0, b0)
    dims = [(step + 1) * 10 for step in range(50)]
    not_jitted = []
    jitted = []
    per_solver = not_jitted, jitted
    for dim in dims:
        for which, solver in enumerate((solve_numpy, solve_numpy_njit)):
            mean_elapsed = -1.0
            for trial in range(5):
                t0 = time.time()
                for _ in range(100):
                    mat, vec = get_data(dim)
                    solver(mat, vec)
                dt = time.time() - t0
                print(f"{dim:d} {solver.__name__:<30s}: {dt:f}")
                # Incremental (running) mean over the 5 trials.
                mean_elapsed = (mean_elapsed * trial + dt) / (trial + 1)
            per_solver[which].append(mean_elapsed)
    pyplot.plot(dims, not_jitted, label="not numba")
    pyplot.plot(dims, jitted, label="numba")
    pyplot.legend()
    pyplot.show()


if __name__ == "__main__":
    main()
import time
from typing import Tuple
import numpy
import torch
from matplotlib import pyplot
def solve_numpy(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    """Return the solution ``x`` of the linear system ``a @ x = b`` via NumPy."""
    return numpy.linalg.solve(a, b)
def get_data(dim: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """Build a random NumPy system: ``dim x dim`` matrix plus ``dim`` vector."""
    coefficients = numpy.random.random((dim, dim))
    targets = numpy.random.random(dim)
    return coefficients, targets
def get_data_torch(dim: int) -> Tuple[torch.tensor, torch.tensor]:
    """Build a random torch system: ``dim x dim`` matrix plus ``dim x 1`` column vector."""
    coefficients = torch.rand(dim, dim)
    targets = torch.rand(dim, 1)
    return coefficients, targets
def solve_torch(a: torch.tensor, b: torch.tensor) -> torch.tensor:
    """Return the solution ``x`` of ``a @ x = b`` as a torch tensor.

    ``torch.solve(b, a)`` was deprecated in PyTorch 1.9 and removed in 1.13;
    ``torch.linalg.solve(a, b)`` is the replacement.  Note the swapped
    argument order, and that it returns the solution directly rather than a
    ``(solution, LU)`` pair — so no tuple unpacking is needed.
    """
    return torch.linalg.solve(a, b)
def experiment_numpy(matrix_size: int, repetitions: int = 100):
    """Generate and solve ``repetitions`` fresh NumPy systems of the given size."""
    for _ in range(repetitions):
        coefficients, targets = get_data(matrix_size)
        solve_numpy(coefficients, targets)
def experiment_pytorch(matrix_size: int, repetitions: int = 100):
    """Generate and solve ``repetitions`` fresh torch systems of the given size."""
    for _ in range(repetitions):
        coefficients, targets = get_data_torch(matrix_size)
        solve_torch(coefficients, targets)
def main():
    """Time NumPy vs. PyTorch solves over growing matrix sizes and plot both curves."""
    sizes = list(range(5, 505, 5))
    experiments = experiment_numpy, experiment_pytorch
    durations = tuple([] for _ in experiments)
    for experiment, record in zip(experiments, durations):
        for size in sizes:
            started = time.time()
            experiment(size, repetitions=100)
            elapsed = time.time() - started
            print(f"{size:d} {experiment.__name__:<30s}: {elapsed:f}")
            record.append(elapsed)
    for experiment, record in zip(experiments, durations):
        pyplot.plot(sizes, record, label=experiment.__name__)
    pyplot.legend()
    pyplot.show()


if __name__ == "__main__":
    main()