Python 获取索引数组中的最小值_Python_Numpy

Python 获取索引数组中的最小值

python numpy

Python 获取索引数组中的最小值,python,numpy,Python,Numpy,我有一个n×3索引数组（想想三角形索引点）和一个与三角形关联的浮点值列表。现在，我想获取每个索引（“点”）的最小值，即检查包含索引的所有行，例如0，并从各行的vals中获取最小值：导入numpy a=numpy.array([ [0, 1, 2], [2, 3, 0], [1, 4, 2], [2, 5, 3], ]) VAL=numpy.数组（[0.1,0.5,0.3,0.6]） out=[ numpy.min（vals[numpy.any（a==i，axis=1）]）对于范围（6）中的i

我有一个n×3索引数组（想想三角形索引点）和一个与三角形关联的浮点值列表。现在，我想获取每个索引（“点”）的最小值，即检查包含索引的所有行，例如0，并从各行的

vals

中获取最小值：

import numpy
a = numpy.array([
    [0, 1, 2],
    [2, 3, 0],
    [1, 4, 2],
    [2, 5, 3],
])
vals = numpy.array([0.1, 0.5, 0.3, 0.6])
out = [
    numpy.min(vals[numpy.any(a == i, axis=1)])
    for i in range(6)
]
# out = numpy.array([0.1, 0.1, 0.1, 0.5, 0.3, 0.6])

此解决方案效率低下，因为它对每个 i 都要执行一次完整的数组比较。

这个问题与numpy的ufuncs非常相似，但是

numpy.min.at

不存在

有什么提示吗？

如果for循环超出了

，您可以切换到

pd.GroupBy

或

itertools.GroupBy

比如说,

# Flatten the index array so each entry corresponds to one element of the
# Series built below.
# NOTE(review): `n` is not defined in this snippet — presumably it is the
# index array `a` from the question; confirm before running.
r = n.ravel()
# Flat position // 3 gives the row number of each entry (assumes 3 columns per
# row). Grouping those row numbers by label and taking the minimum of the
# matching `vals` entries yields one minimum per label.
pd.Series(np.arange(len(r))//3).groupby(r).apply(lambda s: vals[s].min())

对于长循环，此解决方案会更快，对于小循环（<50）

方法#1

一种基于数组赋值的方法是：先建立一个用 nan 填充的 2D 数组，使用 a 中的这些值作为列索引（因此假设这些值为整数），然后将 vals 映射到其中，最后通过跳过 nan 的最小值（np.nanmin）得到最终输出 -

# nr: number of rows in `a`; nc: number of possible labels 0..a.max().
nr,nc = len(a),a.max()+1
# Start from an all-NaN grid so cells that are never written are ignored by
# the NaN-skipping minimum below.
m = np.full((nr,nc),np.nan)
# For every row i, store vals[i] in the columns named by the labels in a[i].
m[np.arange(nr)[:,None],a] = vals[:,None]
# Column-wise minimum that skips NaN: one value per label.
out = np.nanmin(m,axis=0)

方法#2

另一种方法同样基于数组赋值，但使用掩码（masking）和 np.minimum.reduceat，从而不必处理 nan -

# nr: number of rows in `a`; nc: number of possible labels 0..a.max().
nr,nc = len(a),a.max()+1
# m[label, row] becomes True when `label` occurs in that row of `a`.
m = np.zeros((nc,nr),dtype=bool)
m[a.T,np.arange(nr)] = 1
# c[label]: number of rows that contain the label.
c = m.sum(1)
# Start offset of each label's run inside the masked selection below.
shift_idx = np.r_[0,c[:-1].cumsum()]
# Repeat vals for every label, keep only the masked entries, then take the
# minimum over each label's run.
out = np.minimum.reduceat(np.broadcast_to(vals,m.shape)[m],shift_idx)

方法#3

另一个基于 argsort（假设 a 中包含从 0 到 a.max() 的所有整数）-

方法#4

引入 numba 以提高内存效率，从而提高性能，同时也让这一组方法更完整 -

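from numba import njit
@njit
def numba1(a, vals, out):
    m, n = a.shape
    for j in range(m):
        for i in range(n):
            e = a[j, i]
            # NOTE: the source text is truncated after `if vals[j]`; the
            # comparison and assignment below are reconstructed from context.
            if vals[j] < out[e]:
                out[e] = vals[j]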

以下是一个基于：
如果你有pythran，编译
文件

否则，脚本将退回到基于稀疏矩阵的方法，该方法只会稍微慢一点：
import numpy as np
# Prefer the compiled pythran helper; fall back when it has not been built.
try:
    from stb_pthr import sort_to_bins
    HAVE_PYTHRAN = True
except ImportError:
    # Only a missing or unloadable module should trigger the fallback. A bare
    # ``except`` would also swallow KeyboardInterrupt and SystemExit.
    HAVE_PYTHRAN = False

from scipy.sparse import csr_matrix

def sort_to_bins_sparse(idx, mx):
    """Group the positions of ``idx`` by label using a sparse-matrix transpose.

    A matrix with one stored entry per row (row ``i``, column ``idx[i]``) is
    built in CSR form and converted to CSC. The CSC row indices are then the
    positions of ``idx`` grouped by label, and the CSC index pointer holds the
    boundaries of each group.

    Args:
        idx: 1-D integer array of labels.
        mx: Number of bins; ``-1`` means "use ``idx.max() + 1``".

    Returns:
        Tuple ``(positions, boundaries)`` taken from the CSC ``indices`` and
        ``indptr`` arrays.
    """
    n_items = idx.size
    n_bins = idx.max() + 1 if mx == -1 else mx
    one_per_row = csr_matrix(
        (np.ones_like(idx), idx, np.arange(n_items + 1)),
        shape=(n_items, n_bins),
    )
    by_label = one_per_row.tocsc()
    return by_label.indices, by_label.indptr

# No compiled helper available: bind sort_to_bins to the SciPy-based version.
if not HAVE_PYTHRAN:
    sort_to_bins = sort_to_bins_sparse

def f_op():
    """Reference implementation: per-label minimum via one full scan per label.

    Reads the module-level ``a`` (2-D label array) and ``vals`` (one value per
    row of ``a``). For every label ``0..a.max()`` it selects the rows that
    contain the label and takes the minimum of their values.

    Returns:
        1-D array of length ``a.max() + 1`` with dtype ``vals.dtype``.
    """
    n_labels = a.max() + 1
    per_label_min = (
        vals[(a == label).any(axis=1)].min() for label in range(n_labels)
    )
    return np.fromiter(per_label_min, vals.dtype, n_labels)

def f_pp():
    """Return the per-label minimum of ``vals`` using ``sort_to_bins``.

    Reads the module-level ``a`` (2-D integer label array, one row per entry
    of ``vals``) and ``vals``.

    Returns:
        1-D array with one minimum per label ``0..a.max()``. Labels that do
        not occur in ``a`` are reported as ``np.inf``.
    """
    # sort_to_bins is expected to return the flat positions of a's entries
    # grouped by label, plus the boundaries of each label's group.
    idx, bb = sort_to_bins(a.reshape(-1), -1)
    # Flat position // row width recovers the row number. The width is read
    # from `a` instead of being hard-coded to 3, so other widths work too.
    res = np.minimum.reduceat(vals[idx // a.shape[1]], bb[:-1])
    # reduceat yields a single stray element for an empty group; mark those
    # labels as "no value" instead.
    res[bb[:-1] == bb[1:]] = np.inf
    return res

def f_div_3():
    """Per-label minimum of ``vals`` via argsort + bincount + reduceat.

    Reads the module-level ``a`` (2-D non-negative integer label array) and
    ``vals`` (one value per row of ``a``).

    Returns:
        1-D array with one minimum per label ``0..a.max()``; labels that do
        not occur in ``a`` are set to ``np.inf``.
    """
    flat_labels = a.ravel()
    # Flat positions ordered so that equal labels are adjacent.
    order = flat_labels.argsort()
    counts = np.bincount(flat_labels)
    edges = np.r_[0, counts.cumsum()]
    starts = edges[:-1]
    stops = edges[1:]
    # Flat position // row width gives the row whose value belongs to the entry.
    mins = np.minimum.reduceat(vals[order // a.shape[1]], starts)
    # Empty groups (start == stop) have no values; report them as inf.
    mins[starts == stops] = np.inf
    return mins

# Small example from the question: each row of `a` lists the labels attached
# to the corresponding entry of `vals`.
a = np.array([
    [0, 1, 2],
    [2, 3, 0],
    [1, 4, 2],
    [2, 5, 3],
])
vals = np.array([0.1, 0.5, 0.3, 0.6])

# Sanity check: the fast version must agree with the reference implementation.
assert np.all(f_op()==f_pp())

from timeit import timeit

# Benchmark 1: 10000 rows with labels drawn at random from 0..999.
a = np.random.randint(0,1000,(10000,3))
vals = np.random.random(10000)
# The data is random, so verify that every one of the 1000 labels occurs.
assert len(np.unique(a))==1000

assert np.all(f_op()==f_pp())
print("1000/1000 labels, 10000 rows")
# timeit returns the total seconds for `number` calls; the factor converts
# that to milliseconds per call (10 calls -> *100, 100 calls -> *10).
print("op ", timeit(f_op, number=10)*100, 'ms')
print("pp ", timeit(f_pp, number=100)*10, 'ms')
print("div", timeit(f_div_3, number=100)*10, 'ms')

# Benchmark 2: one million rows using odd labels only (1 + 2*k), so the even
# labels never occur and the empty-group handling is exercised.
a = 1 + 2 * np.random.randint(0,5000,(1000000,3))
vals = np.random.random(1000000)
# Number of distinct labels actually present (reported in the header line).
nl = len(np.unique(a))

assert np.all(f_div_3()==f_pp())
print(f"{nl}/{a.max()+1} labels, 1000000 rows")
print("pp ", timeit(f_pp, number=10)*100, 'ms')
print("div", timeit(f_div_3, number=10)*100, 'ms')

# Benchmark 3: same row count, but a much larger range of odd labels.
a = 1 + 2 * np.random.randint(0,100000,(1000000,3))
vals = np.random.random(1000000)
nl = len(np.unique(a))

assert np.all(f_div_3()==f_pp())
print(f"{nl}/{a.max()+1} labels, 1000000 rows")
print("pp ", timeit(f_pp, number=10)*100, 'ms')
print("div", timeit(f_div_3, number=10)*100, 'ms')

示例运行（计时中包含 @Divakar 的方法 3 以供参考）：
更新：@Divakar 的最新方案（方法 4）很难被超越，它本质上就是一个 C 实现。这本身没有问题，只是在这里 jit 编译不是可选项，而是必需项（运行未经编译的代码会非常慢）。如果可以接受这一点，那么 pythran 当然也能做到同样的事情：
pythran -O3 labeled_min.py

文件

样本运行：
nmb  8.41792532010004 ms
pthr 8.104007659712806 ms

pythran
的速度快了几个百分点，但这只是因为我将vals
查找移出了内部循环；没有这一点，他们几乎是平等的
为了进行比较，对于同一个问题，有和没有非python助手时，以前最好的解决方案是：
pp           114.04887529788539 ms
pp (py only) 147.0821460010484 ms

显然，numpy.minimum.at
存在：
import numpy
a = numpy.array([
    [0, 1, 2],
    [2, 3, 0],
    [1, 4, 2],
    [2, 5, 3],
])
vals = numpy.array([0.1, 0.5, 0.3, 0.6])
out = numpy.full(6, numpy.inf)
numpy.minimum.at(out, a.reshape(-1), numpy.repeat(vals, 3))
你能解释一下你是如何在输出中得到0.1、0.1、0.1的吗？有可复制的代码吗？@SANTOSHKUMARDESAI这么做了。有趣的优化问题。我认为你的解决方案是最好的。如果某个ID丢失了，比如说a[0,1]
和a[2,0]
是0s
？因此，我们在a
中没有1。那会发生吗？
import numpy as np
# Prefer the compiled pythran helper; fall back when it has not been built.
try:
    from stb_pthr import sort_to_bins
    HAVE_PYTHRAN = True
except ImportError:
    # Only a missing or unloadable module should trigger the fallback. A bare
    # ``except`` would also swallow KeyboardInterrupt and SystemExit.
    HAVE_PYTHRAN = False

from scipy.sparse import csr_matrix

def sort_to_bins_sparse(idx, mx):
    """Group the positions of ``idx`` by label using a sparse-matrix transpose.

    A matrix with one stored entry per row (row ``i``, column ``idx[i]``) is
    built in CSR form and converted to CSC. The CSC row indices are then the
    positions of ``idx`` grouped by label, and the CSC index pointer holds the
    boundaries of each group.

    Args:
        idx: 1-D integer array of labels.
        mx: Number of bins; ``-1`` means "use ``idx.max() + 1``".

    Returns:
        Tuple ``(positions, boundaries)`` taken from the CSC ``indices`` and
        ``indptr`` arrays.
    """
    n_items = idx.size
    n_bins = idx.max() + 1 if mx == -1 else mx
    one_per_row = csr_matrix(
        (np.ones_like(idx), idx, np.arange(n_items + 1)),
        shape=(n_items, n_bins),
    )
    by_label = one_per_row.tocsc()
    return by_label.indices, by_label.indptr

# No compiled helper available: bind sort_to_bins to the SciPy-based version.
if not HAVE_PYTHRAN:
    sort_to_bins = sort_to_bins_sparse

def f_op():
    """Reference implementation: per-label minimum via one full scan per label.

    Reads the module-level ``a`` (2-D label array) and ``vals`` (one value per
    row of ``a``). For every label ``0..a.max()`` it selects the rows that
    contain the label and takes the minimum of their values.

    Returns:
        1-D array of length ``a.max() + 1`` with dtype ``vals.dtype``.
    """
    n_labels = a.max() + 1
    per_label_min = (
        vals[(a == label).any(axis=1)].min() for label in range(n_labels)
    )
    return np.fromiter(per_label_min, vals.dtype, n_labels)

def f_pp():
    """Return the per-label minimum of ``vals`` using ``sort_to_bins``.

    Reads the module-level ``a`` (2-D integer label array, one row per entry
    of ``vals``) and ``vals``.

    Returns:
        1-D array with one minimum per label ``0..a.max()``. Labels that do
        not occur in ``a`` are reported as ``np.inf``.
    """
    # sort_to_bins is expected to return the flat positions of a's entries
    # grouped by label, plus the boundaries of each label's group.
    idx, bb = sort_to_bins(a.reshape(-1), -1)
    # Flat position // row width recovers the row number. The width is read
    # from `a` instead of being hard-coded to 3, so other widths work too.
    res = np.minimum.reduceat(vals[idx // a.shape[1]], bb[:-1])
    # reduceat yields a single stray element for an empty group; mark those
    # labels as "no value" instead.
    res[bb[:-1] == bb[1:]] = np.inf
    return res

def f_div_3():
    """Per-label minimum of ``vals`` via argsort + bincount + reduceat.

    Reads the module-level ``a`` (2-D non-negative integer label array) and
    ``vals`` (one value per row of ``a``).

    Returns:
        1-D array with one minimum per label ``0..a.max()``; labels that do
        not occur in ``a`` are set to ``np.inf``.
    """
    flat_labels = a.ravel()
    # Flat positions ordered so that equal labels are adjacent.
    order = flat_labels.argsort()
    counts = np.bincount(flat_labels)
    edges = np.r_[0, counts.cumsum()]
    starts = edges[:-1]
    stops = edges[1:]
    # Flat position // row width gives the row whose value belongs to the entry.
    mins = np.minimum.reduceat(vals[order // a.shape[1]], starts)
    # Empty groups (start == stop) have no values; report them as inf.
    mins[starts == stops] = np.inf
    return mins

# Small example from the question: each row of `a` lists the labels attached
# to the corresponding entry of `vals`.
a = np.array([
    [0, 1, 2],
    [2, 3, 0],
    [1, 4, 2],
    [2, 5, 3],
])
vals = np.array([0.1, 0.5, 0.3, 0.6])

# Sanity check: the fast version must agree with the reference implementation.
assert np.all(f_op()==f_pp())

from timeit import timeit

# Benchmark 1: 10000 rows with labels drawn at random from 0..999.
a = np.random.randint(0,1000,(10000,3))
vals = np.random.random(10000)
# The data is random, so verify that every one of the 1000 labels occurs.
assert len(np.unique(a))==1000

assert np.all(f_op()==f_pp())
print("1000/1000 labels, 10000 rows")
# timeit returns the total seconds for `number` calls; the factor converts
# that to milliseconds per call (10 calls -> *100, 100 calls -> *10).
print("op ", timeit(f_op, number=10)*100, 'ms')
print("pp ", timeit(f_pp, number=100)*10, 'ms')
print("div", timeit(f_div_3, number=100)*10, 'ms')

# Benchmark 2: one million rows using odd labels only (1 + 2*k), so the even
# labels never occur and the empty-group handling is exercised.
a = 1 + 2 * np.random.randint(0,5000,(1000000,3))
vals = np.random.random(1000000)
# Number of distinct labels actually present (reported in the header line).
nl = len(np.unique(a))

assert np.all(f_div_3()==f_pp())
print(f"{nl}/{a.max()+1} labels, 1000000 rows")
print("pp ", timeit(f_pp, number=10)*100, 'ms')
print("div", timeit(f_div_3, number=10)*100, 'ms')

# Benchmark 3: same row count, but a much larger range of odd labels.
a = 1 + 2 * np.random.randint(0,100000,(1000000,3))
vals = np.random.random(1000000)
nl = len(np.unique(a))

assert np.all(f_div_3()==f_pp())
print(f"{nl}/{a.max()+1} labels, 1000000 rows")
print("pp ", timeit(f_pp, number=10)*100, 'ms')
print("div", timeit(f_div_3, number=10)*100, 'ms')

1000/1000 labels, 10000 rows
op  145.1122640981339 ms
pp  0.7944229000713676 ms
div 2.2905819199513644 ms
5000/10000 labels, 1000000 rows
pp  113.86540920939296 ms
div 417.2476712032221 ms
100000/200000 labels, 1000000 rows
pp  158.23634970001876 ms
div 486.13436080049723 ms

import numpy as np

#pythran export labeled_min(int[:,:], float[:])

def labeled_min(A, vals):
    """Return, for each label in ``A``, the smallest value of any row using it.

    Args:
        A: 2-D array of non-negative integer labels.
        vals: 1-D array with one value per row of ``A``.

    Returns:
        1-D float array of length ``A.max() + 1``; entry ``c`` is the minimum
        of ``vals[i]`` over all rows ``i`` of ``A`` containing label ``c``, or
        ``inf`` when the label never occurs.
    """
    # Every label starts at +inf so any real value replaces it.
    result = np.full(A.max() + 1, np.inf)
    n_rows, n_cols = A.shape
    for row in range(n_rows):
        # Fetch the row's value once instead of once per column.
        row_val = vals[row]
        for col in range(n_cols):
            label = A[row, col]
            if result[label] > row_val:
                result[label] = row_val
    return result

from labeled_min import labeled_min

# Warm-up call so that compilation time is excluded from the timing below.
# NOTE(review): `func1` is not defined in the visible source — presumably a
# wrapper around the numba kernel; confirm.
func1() # do not measure jitting time    
# timeit returns the total seconds for 100 calls; *10 converts that to
# milliseconds per call.
print("nmb ", timeit(func1, number=100)*10, 'ms')
print("pthr", timeit(lambda:labeled_min(a,vals), number=100)*10, 'ms')

nmb  8.41792532010004 ms
pthr 8.104007659712806 ms

pp           114.04887529788539 ms
pp (py only) 147.0821460010484 ms