How to parallelise scipy sparse matrix multiplication in Python

I have a large sparse matrix X in scipy.sparse.csr_matrix format, and I would like to multiply it by a numpy array W, making use of parallelism. After some research I found that I need to use Array in multiprocessing in order to avoid copying X and W between processes (e.g. from here: and ). Here is my latest attempt:

import multiprocessing 
import numpy 
import scipy.sparse 
import time 

def initProcess(data, indices, indptr, shape, Warr, Wshp):
    # Store the shared buffers in module-level globals so that each worker
    # process can rebuild X and W without copying the underlying data.
    global XData 
    global XIndices 
    global XIndptr 
    global Xshape 

    XData = data 
    XIndices = indices 
    XIndptr = indptr 
    Xshape = shape 

    global WArray
    global WShape 

    WArray = Warr     
    WShape = Wshp 

def dot2(args):
    rowInds, i = args     

    global XData 
    global XIndices
    global XIndptr 
    global Xshape 

    # frombuffer gives views on the shared memory; no data is copied here.
    data = numpy.frombuffer(XData, dtype=numpy.float64)
    indices = numpy.frombuffer(XIndices, dtype=numpy.int32)
    indptr = numpy.frombuffer(XIndptr, dtype=numpy.int32)
    Xr = scipy.sparse.csr_matrix((data, indices, indptr), shape=Xshape)

    global WArray
    global WShape 
    W = numpy.frombuffer(WArray, dtype=numpy.float64).reshape(WShape)

    return Xr[rowInds[i]:rowInds[i+1], :].dot(W)

def getMatmat(X): 
    numJobs = multiprocessing.cpu_count()
    rowInds = numpy.array(numpy.linspace(0, X.shape[0], numJobs+1), numpy.int64)

    # Store the data in X as RawArray objects so we can share it among processes
    XData = multiprocessing.RawArray("d", X.data)
    XIndices = multiprocessing.RawArray("i", X.indices)
    XIndptr = multiprocessing.RawArray("i", X.indptr)

    def matmat(W): 
        WArray = multiprocessing.RawArray("d", W.flatten())
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count(), initializer=initProcess, initargs=(XData, XIndices, XIndptr, X.shape, WArray, W.shape)) 
        params = [] 

        for i in range(numJobs): 
            params.append((rowInds, i))

        iterator = pool.map(dot2, params)
        P = numpy.zeros((X.shape[0], W.shape[1])) 

        for i in range(numJobs): 
            P[rowInds[i]:rowInds[i+1], :] = iterator[i]

        return P   

    return matmat 

if __name__ == '__main__':
    #Create a random sparse matrix X and a random dense one W     
    X = scipy.sparse.rand(10000, 8000, 0.1)
    X = X.tocsr()
    W = numpy.random.rand(8000, 20)

    startTime = time.time()
    A = getMatmat(X)(W)
    parallelTime = time.time()-startTime 

    startTime = time.time()
    B = X.dot(W)
    nonParallelTime = time.time()-startTime 

    print(parallelTime, nonParallelTime)
However, the output is something like (4.431, 0.165), indicating that the parallel version is much slower than the non-parallel multiplication.

I believe that in similar situations the slowdown can be caused by copying large data to the processes, but that isn't the case here since I use Array to store the shared variables (unless it happens in numpy.frombuffer or when the csr_matrix is created, but I could not find a way to share a csr_matrix directly). Another possible cause of the slowness is that each matrix multiplication in each process returns a large result, but I am not sure of a way around this.
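
As a sanity check (my own minimal sketch, not part of the original post), one can confirm that numpy.frombuffer returns a view on a RawArray rather than a copy:

import multiprocessing
import numpy

raw = multiprocessing.RawArray("d", 1000)
view = numpy.frombuffer(raw, dtype=numpy.float64)
print(view.flags["OWNDATA"])   # False: the array borrows the RawArray's memory
view[0] = 3.0
print(raw[0])                  # 3.0: writes go through to the shared buffer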

Can anyone see where I am going wrong? Thanks for any help!


Update: I can't be sure, but I think sharing large amounts of data between processes is just not that efficient, and ideally I should be using multithreading (although the Global Interpreter Lock (GIL) makes that very hard). One way around this is to release the GIL using Cython, for example (see ), although a lot of the numpy functions need to go through the GIL.

Your best bet is to drop down to C with Cython. That way you can beat the GIL and use OpenMP. I'm not surprised that multiprocessing is slower: there's a lot of overhead there.

Here is a naive OpenMP wrapper, in Python, of CSparse's sparse matrix-vector product code.

On my laptop it runs a little bit faster than scipy, but I don't have that many cores. The code, including the setup.py script and the C header files and other bits, is as follows:

I'd suspect that if you really want the parallel code to be fast (on my laptop it's only about 20% faster than single-threaded scipy, even when using 4 threads), you need to think more carefully than I did about where the parallelism happens, paying attention to cache locality.

# psparse.pyx

#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------
cimport cython
cimport numpy as np
import numpy as np
import scipy.sparse
from libc.stddef cimport ptrdiff_t
from cython.parallel import parallel, prange

#-----------------------------------------------------------------------------
# Headers
#-----------------------------------------------------------------------------

ctypedef int csi

ctypedef struct cs:
    # matrix in compressed-column or triplet form
    csi nzmax       # maximum number of entries
    csi m           # number of rows
    csi n           # number of columns
    csi *p          # column pointers (size n+1) or col indices (size nzmax)
    csi *i          # row indices, size nzmax
    double *x       # numerical values, size nzmax
    csi nz          # # of entries in triplet matrix, -1 for compressed-col

cdef extern csi cs_gaxpy (cs *A, double *x, double *y) nogil
cdef extern csi cs_print (cs *A, csi brief) nogil

assert sizeof(csi) == 4

#-----------------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------------

@cython.boundscheck(False)
def pmultiply(X not None, np.ndarray[ndim=2, mode='fortran', dtype=np.float64_t] W not None):
    """Multiply a sparse CSC matrix by a dense matrix

    Parameters
    ----------
    X : scipy.sparse.csc_matrix
        A sparse matrix, of size N x M
    W : np.ndarray[dtype=float64, ndim=2, mode='fortran']
        A dense matrix, of size M x P. Note, W must be contiguous and in
        fortran (column-major) order. You can ensure this using
        numpy's `asfortranarray` function.

    Returns
    -------
    A : np.ndarray[dtype=float64, ndim=2, mode='fortran']
        A dense matrix, of size N x P, the result of multiplying X by W.

    Notes
    -----
    This function is parallelized over the columns of W using OpenMP. You
    can control the number of threads at runtime using the OMP_NUM_THREADS
    environment variable. The internal sparse matrix code is from CSPARSE, 
    a Concise Sparse matrix package. Copyright (c) 2006, Timothy A. Davis.
    http://www.cise.ufl.edu/research/sparse/CSparse, licensed under the
    GNU LGPL v2.1+.

    References
    ----------
    .. [1] Davis, Timothy A., "Direct Methods for Sparse Linear Systems
    (Fundamentals of Algorithms 2)," SIAM Press, 2006. ISBN: 0898716136
    """
    if X.shape[1] != W.shape[0]:
        raise ValueError('matrices are not aligned')

    cdef int i
    cdef cs csX
    cdef np.ndarray[double, ndim=2, mode='fortran'] result
    cdef np.ndarray[csi, ndim=1, mode = 'c'] indptr  = X.indptr
    cdef np.ndarray[csi, ndim=1, mode = 'c'] indices = X.indices
    cdef np.ndarray[double, ndim=1, mode = 'c']    data = X.data

    # Pack the scipy data into the CSparse struct. This is just copying some
    # pointers.
    csX.nzmax = X.data.shape[0]
    csX.m = X.shape[0]
    csX.n = X.shape[1]
    csX.p = &indptr[0]
    csX.i = &indices[0]
    csX.x = &data[0]
    csX.nz = -1  # to indicate CSC format

    result = np.zeros((X.shape[0], W.shape[1]), order='F', dtype=np.double)
    for i in prange(W.shape[1], nogil=True):
        # X is in fortran format, so we can get quick access to each of its
        # columns
        cs_gaxpy(&csX, &W[0, i], &result[0, i])

    return result
It calls some C from CSparse:

// src/cs_gaxpy.c

#include "cs.h"
/* y = A*x+y */
csi cs_gaxpy (const cs *A, const double *x, double *y)
{
  csi p, j, n, *Ap, *Ai ;
  double *Ax ;
  if (!CS_CSC (A) || !x || !y) return (0) ;       /* check inputs */
  n = A->n ; Ap = A->p ; Ai = A->i ; Ax = A->x ;
  for (j = 0 ; j < n ; j++)
    {
      for (p = Ap [j] ; p < Ap [j+1] ; p++)
        {
          y [Ai [p]] += Ax [p] * x [j] ;
        }
    }
  return (1) ;
}
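
For reference, a usage sketch of the Cython routine above, assuming it has been compiled into an extension module named psparse (the module name is inferred from the .pyx filename, and the build via the mentioned setup.py is assumed):

import numpy as np
import scipy.sparse

from psparse import pmultiply  # assumed module name, from psparse.pyx above

X = scipy.sparse.rand(10000, 8000, 0.1).tocsc()  # pmultiply expects CSC format
W = np.asfortranarray(np.random.rand(8000, 20))  # W must be Fortran-ordered
P = pmultiply(X, W)
print(np.allclose(P, X.dot(W)))                  # should print True
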
Maybe a bit late with the response. It is possible to get reliable parallel speedups by using the pyTrilinos package, which provides Python wrappers to many functions in Trilinos. Here is your example converted to use pyTrilinos:

from PyTrilinos import Epetra
from scipy.sparse import rand
import numpy as np

n_rows = 10000
n_cols = 8000
n_vecs = 20
fill_factor = 0.1

comm = Epetra.PyComm()
my_id = comm.MyPID()

row_map = Epetra.Map(n_rows, 0, comm)
out_vec_map = row_map
in_vec_map = Epetra.Map(n_cols, 0, comm)
col_map = Epetra.Map(n_cols, range(n_cols), 0, comm)

n_local_rows = row_map.NumMyElements()

# Create local block matrix in scipy and convert to Epetra
X = rand(n_local_rows, n_cols, fill_factor).tocoo()

A = Epetra.CrsMatrix(Epetra.Copy, row_map, col_map, int(fill_factor*n_cols*1.2), True)
A.InsertMyValues(X.row, X.col, X.data)
A.FillComplete()

# Create sub-vectors in numpy and convert to Epetra format 
W = np.random.rand(in_vec_map.NumMyElements(), n_vecs)
V = Epetra.MultiVector(in_vec_map, n_vecs)

V[:] = W.T # order of indices is opposite

B = Epetra.MultiVector(out_vec_map, n_vecs)

# Multiply
A.Multiply(False, V, B)
This code can then be run with MPI:

mpiexec -n 2 python scipy_to_trilinos.py

Other examples of PyTrilinos can be found in the github repository. Of course, if one were to use pyTrilinos, this way of initializing the matrix from scipy may not be the most optimal.

Have you linked numpy/scipy against an optimized, multithreaded ATLAS build? If you have, you should get parallel matrix multiplication for free when you use np.dot.

I am using a multithreaded BLAS library (OpenBLAS) linked against numpy/scipy, but I tested X.dot(W) and numpy.dot(X, W) (the latter does not work for sparse X), and it is not parallelised.

Thanks for the reply! I had a similar idea and wrote a Cython/OpenMP dot product based on Eigen (see pdot2d of ). There I divide the rows of X into cpu_count blocks, and it runs about 2x faster on my 8-core machine (although I'm sure it can be improved). Once I sort out some issues with the compilation, I will compare it against your solution.
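
A footnote to the first comment above (my own addition, not from the original thread): you can check which BLAS/LAPACK implementation numpy is linked against, which determines whether np.dot can run multithreaded:

import numpy
numpy.show_config()   # prints the BLAS/LAPACK build configuration

With OpenBLAS, the thread count can be capped at runtime via the OPENBLAS_NUM_THREADS environment variable (or OMP_NUM_THREADS for OpenMP-based builds).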