Python 多进程计算距离矩阵 cdist Scipy_Python_Scipy_Multiprocessing

Python 多进程计算距离矩阵 cdist Scipy

python

Python 多进程计算距离矩阵 cdist Scipy,python,scipy,multiprocessing,Python,Scipy,Multiprocessing,我要计算两个字符串向量元素之间的多种距离。我使用 cdist 来完成这项工作，速度很快，但对于大型数组来说仍然很耗时，而且我必须多次重复该操作。来自 Scipy 的 cdist 只使用了我的一个核心。我怎样才能利用所有核心计算所有距离，从而更快地得到最终的 3D 矩阵？我想实现的是：要么在不同的进程中分别计算每个距离矩阵，要么使用所有核心计算单个距离矩阵。在这里，您可以运行我试图实现的示例：import numpy as np first = np.array(["hello", …

我要计算两个字符串向量元素之间的多种距离。我使用 cdist 来完成这项工作，速度很快，但对于大型数组来说仍然很耗时，而且我必须多次重复该操作。

来自 Scipy 的 cdist 只使用了我的一个核心。我怎样才能利用所有核心计算所有距离，从而更快地得到最终的 3D 矩阵？我想实现的是：要么在不同的进程中分别计算每个距离矩阵，要么使用所有核心计算单个距离矩阵。

在这里，您可以运行我试图实现的示例

import numpy as np
from scipy.spatial.distance import cdist
# Two small sample vectors of strings used as inputs for the distance demos.
first = np.array(("hello", "hello", "hellllo"))
second = np.array(("hlo", "halo", "alle"))

def diff_len(string1, string2):
    """Return the absolute difference between the lengths of two strings.

    Args:
        string1: First sized object (anything supporting ``len``).
        string2: Second sized object.

    Returns:
        ``|len(string1) - len(string2)|`` as a non-negative int.
    """
    size_a = len(string1)
    size_b = len(string2)
    return abs(size_a - size_b)

def diff_len2(string1, string2):
    """Return the absolute difference between one length and twice another.

    Args:
        string1: First sized object (anything supporting ``len``).
        string2: Second sized object; its length is doubled before comparing.

    Returns:
        ``|len(string1) - 2 * len(string2)|`` as a non-negative int.
    """
    size_a = len(string1)
    doubled_b = len(string2) * 2
    return abs(size_a - doubled_b)

def diff_len_square(string1, string2):
    """Return the absolute difference between one length and another squared.

    Args:
        string1: First sized object (anything supporting ``len``).
        string2: Second sized object; its length is squared before comparing.

    Returns:
        ``|len(string1) - len(string2) ** 2|`` as a non-negative int.
    """
    size_a = len(string1)
    squared_b = len(string2) ** 2
    return abs(size_a - squared_b)

def minimum_nw(*sequences):
    """Return the negated length of the longest given sequence.

    Args:
        *sequences: One or more sized objects.

    Returns:
        ``-max(len(s) for s in sequences)``.

    Raises:
        ValueError: If called with no sequences (``max`` of nothing).
    """
    # NOTE(review): the original multiplied by a literal 1, presumably a
    # placeholder for a gap cost -- confirm before changing the scale.
    longest = max(len(seq) for seq in sequences)
    return -longest


def maximum_nw(*sequences):
    """Return the length of the longest given sequence.

    Args:
        *sequences: One or more sized objects.

    Returns:
        ``max(len(s) for s in sequences)``.

    Raises:
        ValueError: If called with no sequences (``max`` of nothing).
    """
    longest = max(len(seq) for seq in sequences)
    return longest


def normalized_distance(distance, *sequences):
    """Rescale ``distance`` linearly between the sequence-derived bounds.

    The bounds come from ``minimum_nw`` and ``maximum_nw``; a distance equal
    to the lower bound maps to 0 and one equal to the upper bound maps to 1.

    Args:
        distance: Raw distance value to rescale.
        *sequences: The sequences the distance was computed from.

    Returns:
        ``(distance - minimum) / (maximum - minimum)``, or 0 when the upper
        bound is 0 (all sequences empty).
    """
    lower = minimum_nw(*sequences)
    upper = maximum_nw(*sequences)

    if upper != 0:
        span = upper - lower
        return (distance - lower) / span
    # All sequences are empty: the span would be zero, so report 0.
    return 0


@njit
def NeedlemanWunschDP(dist_mat, s1, s2):
    """Fill the interior of ``dist_mat`` in place with alignment scores.

    Row 0 and column 0 are read but never written here, so the caller must
    initialise them beforehand. Each interior cell takes the best of:
    the diagonal neighbour plus 1 if the two symbols are equal (plus 0
    otherwise), the cell above minus ``gap_``, or the cell to the left minus
    ``gap_``. ``gap_`` is a module-level name defined outside this block.

    Args:
        dist_mat: 2-D array indexed up to ``[len(s1), len(s2)]``.
        s1: First indexable sequence of comparable symbols.
        s2: Second indexable sequence of comparable symbols.

    Returns:
        None; ``dist_mat`` is modified in place.
    """
    n_rows = len(s1) + 1
    n_cols = len(s2) + 1
    for row in range(1, n_rows):
        for col in range(1, n_cols):
            diagonal = dist_mat[row - 1, col - 1] + (s1[row - 1] == s2[col - 1])
            from_above = dist_mat[row - 1, col] - gap_
            from_left = dist_mat[row, col - 1] - gap_
            dist_mat[row, col] = max(diagonal, from_above, from_left)


def NeedleW(s1, s2, mode='raw'):
    """Compute a Needleman-Wunsch style alignment value for two strings.

    A ``(len(s1) + 1, len(s2) + 1)`` int64 table is created, its first row
    and column are set to successive multiples of ``-gap_`` (a module-level
    name defined outside this block), and ``NeedlemanWunschDP`` fills the
    rest. The bottom-right cell is the final table value.

    Args:
        s1: First string.
        s2: Second string.
        mode: ``'norm'`` returns the negated final cell rescaled through
            ``normalized_distance``; any other value returns the final cell
            itself as a float.

    Returns:
        A float (``'raw'``) or the result of ``normalized_distance``
        (``'norm'``).
    """
    n_rows, n_cols = len(s1) + 1, len(s2) + 1
    table = np.empty((n_rows, n_cols), dtype=np.int64)

    # Boundary conditions: k leading gaps cost k * gap_.
    table[:, 0] = -(np.arange(n_rows) * gap_)
    table[0, :] = -(np.arange(n_cols) * gap_)

    # Encode characters as integer code points for the jitted kernel.
    codes1 = np.array(list(map(ord, s1)), dtype=np.int64)
    codes2 = np.array(list(map(ord, s2)), dtype=np.int64)

    # Fill the interior of the table in place.
    NeedlemanWunschDP(table, codes1, codes2)
    distance = -table[-1, -1]

    if mode != 'norm':
        return -float(distance)
    return normalized_distance(distance, s1, s2)

# Demo inputs: three sample strings per side, each vector repeated 20 times
# so that every distance matrix below is 60 x 60.
first = np.array(["hello", "hello", "hellllo"])
second = np.array(["hlo", "halo", "alle"])
first = np.tile(first, 20)
# Fixed: the original called np.array(second, 20), which passes 20 as the
# dtype argument and raises TypeError; np.tile mirrors the line above.
second = np.tile(second, 20)

# cdist requires 2-D inputs, so each string vector becomes an (n, 1) column.
# The metric callables therefore receive length-1 rows and unwrap them
# with [0] to get the actual string.
first_col = first[:, np.newaxis]
second_col = second[:, np.newaxis]

d0 = cdist(first_col, second_col, lambda a, b: diff_len(a[0], b[0]))
d1 = cdist(first_col, second_col, lambda a, b: diff_len2(a[0], b[0]))
d2 = cdist(first_col, second_col, lambda a, b: diff_len_square(a[0], b[0]))
d3 = cdist(first_col, second_col, lambda a, b: NeedleW(a[0], b[0], 'norm'))

# Stack the four pairwise matrices along a new leading axis:
# shape (4, len(first), len(second)).
mat3D = np.stack((d0, d1, d2, d3))