
Solving linear equation systems on the GPU with NumPy and PyTorch

Tags: python, numpy, pytorch, gpu, numba

I am trying to solve a lot of linear equation systems as fast as possible. To find out the fastest way, I benchmarked and tested on the CPU and on a GeForce 1080 GPU (using Numba for NumPy). The results really confused me.

This is the code I used with Python 3.8:

import timeit

import torch
import numpy
from numba import njit


def solve_numpy_cpu(dim: int = 5):
    a = numpy.random.rand(dim, dim)
    b = numpy.random.rand(dim)

    for _ in range(1000):
        numpy.linalg.solve(a, b)


def solve_numpy_njit_a(dim: int = 5):
    # note: this only wraps solve_numpy_cpu in a Numba dispatcher;
    # the compiled function is never actually called here
    njit(solve_numpy_cpu, dim=dim)


@njit
def solve_numpy_njit_b(dim: int = 5):
    a = numpy.random.rand(dim, dim)
    b = numpy.random.rand(dim)

    for _ in range(1000):
        numpy.linalg.solve(a, b)


def solve_torch_cpu(dim: int = 5):
    a = torch.rand(dim, dim)
    b = torch.rand(dim, 1)

    for _ in range(1000):
        torch.solve(b, a)


def solve_torch_gpu(dim: int = 5):
    # switching the default tensor type makes the tensors created in
    # solve_torch_cpu live on the GPU
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    solve_torch_cpu(dim=dim)


def main():
    for f in (solve_numpy_cpu, solve_torch_cpu, solve_torch_gpu, solve_numpy_njit_a, solve_numpy_njit_b):
        time = timeit.timeit(f, number=1)
        print(f"{f.__name__:<20s}: {time:f}")


if __name__ == "__main__":
    main()
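As an aside: newer PyTorch releases deprecate torch.solve in favor of torch.linalg.solve, which takes its arguments in (A, B) order and returns the solution directly. A minimal sketch of the equivalent call, assuming PyTorch 1.9 or later:

import torch

a = torch.rand(5, 5)
b = torch.rand(5, 1)

# torch.linalg.solve(A, B) solves A @ x = B and returns x directly;
# the older torch.solve(B, A) returned a (solution, LU) tuple instead
x = torch.linalg.solve(a, b)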
The slowest of all is CUDA. I verified that PyTorch is using my GPU:

import torch
torch.cuda.is_available()
torch.cuda.get_device_name(0)
which returns

True
'GeForce GTX 1080'
I can live with the fact that, on the CPU, PyTorch is slower than NumPy. What I cannot understand is why PyTorch on the GPU is so much slower. Not as important, but actually even more confusing: Numba's
njit
decorator degrades performance by orders of magnitude, unless you stop using the @ decorator syntax.

Is it my setup? Occasionally I get a strange message about the Windows page/swap file being too small. In case I have taken a completely obscure path to solving linear equation systems on the GPU, I would be happy to be pointed in another direction.
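For what it is worth, the usual way to make the GPU pay off for many small systems is to batch them: torch.linalg.solve accepts batched inputs, so thousands of 5x5 systems can be solved in a single call. A rough sketch, assuming a newer PyTorch with CUDA available (the sizes are illustrative):

import torch

# Stack many independent 5x5 systems into one batched tensor so that the
# kernel-launch and transfer overhead is paid once, not once per system.
n, dim = 10_000, 5
a = torch.rand(n, dim, dim, device="cuda")
b = torch.rand(n, dim, 1, device="cuda")

x = torch.linalg.solve(a, b)   # one batched solve for all n systems
torch.cuda.synchronize()       # wait for the kernel before reading or timing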


Edit

So I focused on Numba and changed my benchmark a bit. As suggested by @max9111, I rewrote the functions to receive input and return output because, in the end, that is how anyone would want to use them. I now also perform a first compilation run for the Numba-accelerated function, so the subsequent timing is fairer. Finally, I checked the performance against matrix size and plotted the results.

TL/DR: Up to matrix sizes of 500x500, Numba acceleration makes no real difference for numpy.linalg.solve. (As discussed in the comments below, this is plausible: NumPy already dispatches to the same compiled LAPACK solve routine, so there is little left for Numba to accelerate.)

Here is the code:

import time
from typing import Tuple

import numpy
from matplotlib import pyplot
from numba import jit


@jit(nopython=True)
def solve_numpy_njit(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    parameters = numpy.linalg.solve(a, b)
    return parameters


def solve_numpy(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    parameters = numpy.linalg.solve(a, b)
    return parameters


def get_data(dim: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
    a = numpy.random.random((dim, dim))
    b = numpy.random.random(dim)
    return a, b


def main():
    a, b = get_data(10)
    # compile numba function
    p = solve_numpy_njit(a, b)

    matrix_size = [(x + 1) * 10 for x in range(50)]
    non_accelerated = []
    accelerated = []
    results = non_accelerated, accelerated

    for j, each_matrix_size in enumerate(matrix_size):
        for m, f in enumerate((solve_numpy, solve_numpy_njit)):
            average_time = -1.
            for k in range(5):
                time_start = time.time()
                for i in range(100):
                    a, b = get_data(each_matrix_size)
                    p = f(a, b)
                d_t = time.time() - time_start
                print(f"{each_matrix_size:d} {f.__name__:<30s}: {d_t:f}")
                average_time = (average_time * k + d_t) / (k + 1)
            results[m].append(average_time)

    pyplot.plot(matrix_size, non_accelerated, label="not numba")
    pyplot.plot(matrix_size, accelerated, label="numba")
    pyplot.legend()
    pyplot.show()


if __name__ == "__main__":
    main()
Comments:

max9111: Use benchmarks with outputs. In a compiler like Numba there is usually dead-code elimination -> if that kicks in, you only measure the function call but no computation (njit_a). Also avoid default values, which can lead to repeated recompilation, which is slow (njit_b). On tiny problems like these the GPU is usually much slower (no parallelism to exploit, slow data transfer, possibly slow function calls too).

OP: @max9111 Thanks for your comment! I will look into using outputs and removing the default values. Regarding parallelism when solving systems of linear equations: I thought basic Gaussian elimination was parallelizable. That is why I expected to benefit from GPU acceleration. Am I wrong here?

max9111: What is your real-world use case? Solving a lot of tiny equation systems -> compute them in parallel on the CPU (Numba parfor). Solving larger equation systems that can be parallelized on their own -> CPU or GPU. You should also keep in mind that GPUs are much faster at single-precision computation.

OP: @max9111 It is an online regression algorithm that updates its parameters with every training example, so the equation systems are solved sequentially. Each solve should be parallelizable though; as far as I know, the Jacobi method would work.

max9111: Not for 5x5 matrices; the benefit of parallelization has to be much bigger than that. NumPy (with MKL) and Numba call the same solve algorithm, which is automatically parallelized for larger problems.
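To make the parfor suggestion from the comments concrete, here is a minimal sketch of solving many small systems in parallel on the CPU with Numba's prange; the solve_many name and the sizes are mine, not from the thread:

import numpy
from numba import njit, prange


@njit(parallel=True)
def solve_many(a_all, b_all, out):
    # every system is independent, so prange spreads the iterations
    # across the available CPU threads
    for i in prange(a_all.shape[0]):
        out[i] = numpy.linalg.solve(a_all[i], b_all[i])


n, dim = 10_000, 5
a_all = numpy.random.random((n, dim, dim))
b_all = numpy.random.random((n, dim))
out = numpy.empty((n, dim))
solve_many(a_all, b_all, out)  # the first call includes compilation time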
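For completeness, the analogous benchmark comparing NumPy against PyTorch on the CPU, for matrix sizes from 5x5 up to 500x500: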
import time
from typing import Tuple

import numpy
import torch
from matplotlib import pyplot


def solve_numpy(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
    parameters = numpy.linalg.solve(a, b)
    return parameters


def get_data(dim: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
    a = numpy.random.random((dim, dim))
    b = numpy.random.random(dim)
    return a, b


def get_data_torch(dim: int) -> Tuple[torch.Tensor, torch.Tensor]:
    a = torch.rand(dim, dim)
    b = torch.rand(dim, 1)
    return a, b


def solve_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    parameters, _ = torch.solve(b, a)
    return parameters


def experiment_numpy(matrix_size: int, repetitions: int = 100):
    for i in range(repetitions):
        a, b = get_data(matrix_size)
        p = solve_numpy(a, b)


def experiment_pytorch(matrix_size: int, repetitions: int = 100):
    for i in range(repetitions):
        a, b = get_data_torch(matrix_size)
        p = solve_torch(a, b)


def main():
    matrix_size = [x for x in range(5, 505, 5)]
    experiments = experiment_numpy, experiment_pytorch
    results = tuple([] for _ in experiments)

    for i, each_experiment in enumerate(experiments):
        for j, each_matrix_size in enumerate(matrix_size):
            time_start = time.time()
            each_experiment(each_matrix_size, repetitions=100)
            d_t = time.time() - time_start
            print(f"{each_matrix_size:d} {each_experiment.__name__:<30s}: {d_t:f}")
            results[i].append(d_t)

    for each_experiment, each_result in zip(experiments, results):
        pyplot.plot(matrix_size, each_result, label=each_experiment.__name__)

    pyplot.legend()
    pyplot.show()


if __name__ == "__main__":
    main()
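If one wanted to extend this comparison to the GPU, a matching experiment might look like the sketch below (experiment_pytorch_gpu is a name introduced here, assuming a CUDA build of PyTorch). The synchronize call matters because CUDA kernels launch asynchronously and would otherwise escape the timed region:

import torch


def experiment_pytorch_gpu(matrix_size: int, repetitions: int = 100):
    for i in range(repetitions):
        a = torch.rand(matrix_size, matrix_size, device="cuda")
        b = torch.rand(matrix_size, 1, device="cuda")
        p = torch.linalg.solve(a, b)
    # kernels are queued asynchronously; block until they finish so that a
    # wall-clock timer around this function measures the actual GPU work
    torch.cuda.synchronize()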