Python Cython使用prange/parallel无性能提升

Python Cython使用prange/parallel无性能提升,python,multithreading,performance,cython,Python,Multithreading,Performance,Cython,我正在使用Cython版本0.27.3为一个简单的素性测试模块编译以下源代码,该模块包含相同算法的python和Cython实现。当我将threads参数设置为不同的值时,我看不到性能的提高,尽管GIL被释放了。是不是有什么东西阻止它并行运行 所讨论的函数是cdefvoid\u getprimes,它接受memoryview切片作为参数,并应将该切片中的所有非素数值设置为0 primes.pyx #cython: boundscheck=False, wraparound=False, none

我正在使用Cython版本0.27.3为一个简单的素性测试模块编译以下源代码,该模块包含相同算法的python和Cython实现。当我将
threads
参数设置为不同的值时,我看不到性能的提高,尽管GIL被释放了。是不是有什么东西阻止它并行运行

所讨论的函数是
cdefvoid\u getprimes
,它接受memoryview切片作为参数,并应将该切片中的所有非素数值设置为0

primes.pyx

#cython: boundscheck=False, wraparound=False, nonecheck=False
cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
import math

# =====================
# Python implementation
# =====================

def pyisprime(n):
    """Python implementation"""
    if n < 2 or n & 1 == 0:
        if n == 2:
            return True
        return False
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True

def pygetprimes(nums):
    return [num for num in nums if pyisprime(num)]


# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
    """Cython implementation of a simple primality check"""
    cdef unsigned long long upper 
    cdef unsigned long long i = 3
    cdef int prime = 1
    if n < 2 or n & 1 == 0:
        if n == 2:
            return 1
        return 0
    upper = <unsigned long long>ceil(sqrt(<double>n))
    while i <= upper:
        if n % i == 0:
            prime = 0
            break
        i += 1
    return prime

def isprime(unsigned long long n):
    """Wrapper for _isprime"""
    cdef int result
    with nogil:
        result = _isprime(n)
    return result

cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
    cdef unsigned long num
    cdef int i = 0
    with parallel(num_threads=threads):
        for i in prange(nums.shape[0], schedule="dynamic"):
            if _isprime(nums[i]) == 0:
                nums[i] = 0

def getprimes(nums, int threads = 1):
    """Wrapper for _getprimes"""
    cdef unsigned long long num
    cdef unsigned long long[:] primes = array.array("Q", nums)

    with nogil:
        _getprimes(primes, threads)

    return [num for num in primes if num != 0]
test.py

#!/usr/bin/env python3
from distutils.core import setup
from Cython.Build import cythonize

setup(
    name="primes",
    ext_modules=cythonize('primes.pyx'),
)
#!/usr/bin/env python3
import functools
import random
import time
import primes

def timed(func):
    def wrapped(*args, **kwargs):
        start = time.time()
        val = func(*args, **kwargs)
        end = time.time()
        print(func.__name__, end - start)
        return val
    return functools.wraps(func)(wrapped)


def main():
    nums = [random.randint(0, 0xffffff) for _ in range(500000)]

    pyfoo = timed(primes.pygetprimes)
    cyfoo = timed(primes.getprimes)

    x = pyfoo(nums)
    y = cyfoo(nums, 1)
    z = cyfoo(nums, 4)
    assert x == y == z

if __name__ == "__main__":
    main()
当我运行
cyfoo
时,我希望将线程数从1增加到4会显示某种类型的速度提高,但事实并非如此:

[aarcher@Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py 
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295

似乎您需要为OpenMP启用编译器标志,以便并行语句实际执行任何操作

请参阅cython文档


我相信您需要为OpenMP启用编译器标志,以便并行语句实际执行任何操作。看这里:很简单!如果你继续回答,我会接受的。
# setup.py
# ... omitted ...

ext_modules = [
    Extension(
        "hello",
        ["hello.pyx"],
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
    )
]