Python 在数组中寻找最近点-KDTree的逆

Python 在数组中寻找最近点-KDTree的逆,python,arrays,algorithm,numpy,scipy,Python,Arrays,Algorithm,Numpy,Scipy,我有一个非常大的Ndaray a和一个点k的排序列表(一个小列表,大约30个点) 对于A的每个元素,我想确定点k列表中最近的元素以及索引。比如: >>> A = np.asarray([3, 4, 5, 6]) >>> k = np.asarray([4.1, 3]) >>> values, indices [3, 4.1, 4.1, 4.1], [1, 0, 0, 0] 现在,问题是A非常非常大。所以我不能做一些低效的事情,比如给A加一个

我有一个非常大的Ndaray a和一个点k的排序列表(一个小列表,大约30个点)

对于A的每个元素,我想确定点k列表中最近的元素以及索引。比如:

>>> A = np.asarray([3, 4, 5, 6])
>>> k = np.asarray([4.1, 3])
>>> values, indices
[3, 4.1, 4.1, 4.1], [1, 0, 0, 0]
现在,问题是A非常非常大。所以我不能做一些低效的事情,比如给A加一个维度,把abs差取到k,然后取每列的最小值

目前,我一直在使用np.searchsorted,如第二个答案所示:但即使这样也太慢了。这是我使用的代码(修改为使用多个值):

这比搜索排序的解决方案慢得多

另一方面,数组A总是相同的,只有k改变。因此,在a上使用一些辅助结构(如“反向KDTree”),然后在小数组k上查询结果将是有益的

有这样的事吗

编辑

目前,我正在使用np.searchsorted的一个变体,它需要对数组a进行排序。作为预处理步骤,我们可以提前执行此操作,但在计算索引后仍必须恢复原始顺序。这种变体的速度大约是上述变体的两倍

A = np.random.random(3000000)
k = np.random.random(30)

indices_sort = np.argsort(A)
sortedA = A[indices_sort]

inv_indices_sort = np.argsort(indices_sort)
k.sort()


def find_nearest(sortedA, k):
    midpoints = k[:-1] + np.diff(k)/2
    idx_aux = np.searchsorted(sortedA, midpoints)
    idx = []
    count = 0
    final_indices = np.zeros(sortedA.shape, dtype=int)
    old_obj = None
    for obj in idx_aux:
        if obj != old_obj:
            idx.append((obj, count))
            old_obj = obj
        count += 1
    old_idx = 0
    for idx_A, idx_k in idx:
        final_indices[old_idx:idx_A] = idx_k
        old_idx = idx_A
    final_indices[old_idx:] = len(k)-1

    indicesClosest = final_indices[inv_indices_sort] #<- this takes 90% of the time
    return k[indicesClosest], indicesClosest
A=np.random.random(3000000)
k=np.随机。随机(30)
索引\u sort=np.argsort(A)
sortedA=A[索引\排序]
inv\u index\u sort=np.argsort(index\u sort)
k、 排序()
def find_最近(sortedA,k):
中点=k[:-1]+np.diff(k)/2
idx_aux=np.searchsorted(排序数据,中点)
idx=[]
计数=0
最终索引=np.0(sortedA.shape,dtype=int)
old_obj=无
对于idx_aux中的obj:
如果obj!=old_obj:
追加((对象,计数))
old_obj=obj
计数+=1
old_idx=0
对于idx_A,idx中的idx_k:
最终索引[旧索引:idx\U A]=idx\U k
old_idx=idx_A
最终指数[old_idx:]=len(k)-1
IndicatesClosest=最终索引[投资索引排序]\p>更新:

内置函数实际上可以完全满足您的需要。只需要一个小技巧:
digitalize
为垃圾箱赋值。我们可以通过将数组排序并将bin边框精确地设置在相邻元素之间的中间,将<代码> k>代码>转换为bin。

import numpy as np

A = np.asarray([3, 4, 5, 6])
k = np.asarray([4.1, 3, 1])  # added another value to show that sorting/binning works

ki = np.argsort(k)
ks = k[ki]

i = np.digitize(A, (ks[:-1] + ks[1:]) / 2)

indices = ki[i]
values = ks[i]

print(values, indices)
# [ 3.   4.1  4.1  4.1] [1 0 0 0]

旧答案:

我将采用蛮力方法对
k
中的每个元素执行一次向量化传递
a
,并更新当前元素改善近似值的位置

import numpy as np

A = np.asarray([3, 4, 5, 6])
k = np.asarray([4.1, 3])

err = np.zeros_like(A) + np.inf  # keep track of error over passes

values = np.empty_like(A, dtype=k.dtype)
indices = np.empty_like(A, dtype=int)

for i, v in enumerate(k):
    d = np.abs(A - v)
    mask = d < err  # only update where v is closer to A
    values[mask] = v
    indices[mask] = i
    err[mask] = d[mask]

print(values, indices)
# [ 3.   4.1  4.1  4.1] [1 0 0 0]
将numpy导入为np
A=np.asarray([3,4,5,6])
k=np.asarray([4.1,3])
err=np.zero_像(A)+np.inf#跟踪传递过程中的错误
values=np.empty_like(A,dtype=k.dtype)
索引=np.empty_like(A,dtype=int)
对于枚举(k)中的i,v:
d=np.abs(A-v)
mask=d

这种方法需要三个与
A
大小相同的临时变量,因此如果没有足够的内存可用,它将失败。

因此,经过一些工作和scipy邮件列表中的想法,我认为在我的情况下(使用常数A和缓慢变化的k),最好的方法是使用以下实现

class SearchSorted:
    def __init__(self, tensor, use_k_optimization=True):

        '''
        use_k_optimization requires storing 4x the size of the tensor.
        If use_k_optimization is True, the class will assume that successive calls will be made with similar k.
        When this happens, we can cut the running time significantly by storing additional variables. If it won't be
        called with successive k, set the flag to False, as otherwise would just consume more memory for no
        good reason
        '''

        self.indices_sort = np.argsort(tensor)
        self.sorted_tensor = tensor[self.indices_sort]
        self.inv_indices_sort = np.argsort(self.indices_sort)
        self.use_k_optimization = use_k_optimization

        self.previous_indices_results = None
        self.prev_idx_A_k_pair = None

    def query(self, k):
        midpoints = k[:-1] + np.diff(k) / 2
        idx_count = np.searchsorted(self.sorted_tensor, midpoints)
        idx_A_k_pair = []
        count = 0

        old_obj = 0
        for obj in idx_count:
            if obj != old_obj:
                idx_A_k_pair.append((obj, count))
                old_obj = obj
            count += 1

        if not self.use_k_optimization or self.previous_indices_results is None:
            #creates the index matrix in the sorted case
            final_indices = self._create_indices_matrix(idx_A_k_pair, self.sorted_tensor.shape, len(k))
            #and now unsort it to match the original tensor position
            indicesClosest = final_indices[self.inv_indices_sort]
            if self.use_k_optimization:
                self.prev_idx_A_k_pair = idx_A_k_pair
                self.previous_indices_results = indicesClosest
            return indicesClosest

        old_indices_unsorted = self._create_indices_matrix(self.prev_idx_A_k_pair, self.sorted_tensor.shape, len(k))
        new_indices_unsorted = self._create_indices_matrix(idx_A_k_pair, self.sorted_tensor.shape, len(k))
        mask = new_indices_unsorted != old_indices_unsorted

        self.prev_idx_A_k_pair = idx_A_k_pair
        self.previous_indices_results[self.indices_sort[mask]] = new_indices_unsorted[mask]
        indicesClosest = self.previous_indices_results

        return indicesClosest

    @staticmethod
    def _create_indices_matrix(idx_A_k_pair, matrix_shape, len_quant_points):
        old_idx = 0
        final_indices = np.zeros(matrix_shape, dtype=int)
        for idx_A, idx_k in idx_A_k_pair:
            final_indices[old_idx:idx_A] = idx_k
            old_idx = idx_A
        final_indices[old_idx:] = len_quant_points - 1
        return final_indices
其思想是预先对数组A进行排序,然后在k的中点使用A的searchsorted。这给出了与之前相同的信息,因为它准确地告诉我们A的哪些点更接近k的哪些点。方法_create_index_matrix将根据这些信息创建完整的索引数组,然后我们将对其进行反排序以恢复A的原始顺序。为了利用缓慢变化的k,我们保存最后的索引,并确定必须更改哪些索引;然后我们只改变那些。对于缓慢变化的k,这会产生更高的性能(但内存成本会更高)

对于500万个元素的随机矩阵A和30个元素的随机矩阵k,重复实验60次,我们得到

Function search_sorted1; 15.72285795211792s
Function search_sorted2; 13.030786037445068s
Function query; 2.3306031227111816s <- the one with use_k_optimization = True
Function query; 4.81286096572876s   <- with use_k_optimization = False
功能搜索功能1;15.7228579521792s
功能搜索2;13.030786037445068s

功能查询;2.3306031227111816s您有多个
。那么,使用
searchsorted
时是否循环?显示您的搜索尝试?或者您是否使用了此代码-?请比“非常非常大”更具体。给一个典型的
a
@Divakar是的,我用了那个代码:)我会编辑它in@WarrenWeckesser我有大约20个阵列,每个阵列平均有500万个元素。一些大一些,一些小一些。我需要对每个数组A执行此操作。不要认为这是您的尝试,因为它不能处理
k
中的多个值。谢谢您的回答!不幸的是,暴力解决方案太慢了。数字化是个好主意,但我不认为它与np有什么不同,对吗?我们有稍微不同的接口,但它将以相似的速度运行。最有可能改善这一点的唯一方法是利用矩阵A永远不会改变的事实,只有k会改变;因此,以某种方式对A进行预处理,并以一种更容易执行必要操作的格式进行处理computations@Ant我认为你是对的。我不熟悉
searchsorted
,所以这种相似性不知怎的没有被我注意到。然而,无论如何,还是值得尝试一下
digitalize
。有时非常相似的numpy函数在性能上表现出惊人的差异。注释代码看起来是个好主意,包括(是的,我注意到了构造函数),这应该是一个很好的问题。标记性能、numpy和scipy。使用“Python标记”的风险自负。剖面图中还有其他观察结果吗?
使用_k_优化的有效性如何“随增量”变化?(您可以通过为
以前的_索引结果指定
int8
来缓解一些内存压力
class SearchSorted:
    def __init__(self, tensor, use_k_optimization=True):

        '''
        use_k_optimization requires storing 4x the size of the tensor.
        If use_k_optimization is True, the class will assume that successive calls will be made with similar k.
        When this happens, we can cut the running time significantly by storing additional variables. If it won't be
        called with successive k, set the flag to False, as otherwise would just consume more memory for no
        good reason
        '''

        self.indices_sort = np.argsort(tensor)
        self.sorted_tensor = tensor[self.indices_sort]
        self.inv_indices_sort = np.argsort(self.indices_sort)
        self.use_k_optimization = use_k_optimization

        self.previous_indices_results = None
        self.prev_idx_A_k_pair = None

    def query(self, k):
        midpoints = k[:-1] + np.diff(k) / 2
        idx_count = np.searchsorted(self.sorted_tensor, midpoints)
        idx_A_k_pair = []
        count = 0

        old_obj = 0
        for obj in idx_count:
            if obj != old_obj:
                idx_A_k_pair.append((obj, count))
                old_obj = obj
            count += 1

        if not self.use_k_optimization or self.previous_indices_results is None:
            #creates the index matrix in the sorted case
            final_indices = self._create_indices_matrix(idx_A_k_pair, self.sorted_tensor.shape, len(k))
            #and now unsort it to match the original tensor position
            indicesClosest = final_indices[self.inv_indices_sort]
            if self.use_k_optimization:
                self.prev_idx_A_k_pair = idx_A_k_pair
                self.previous_indices_results = indicesClosest
            return indicesClosest

        old_indices_unsorted = self._create_indices_matrix(self.prev_idx_A_k_pair, self.sorted_tensor.shape, len(k))
        new_indices_unsorted = self._create_indices_matrix(idx_A_k_pair, self.sorted_tensor.shape, len(k))
        mask = new_indices_unsorted != old_indices_unsorted

        self.prev_idx_A_k_pair = idx_A_k_pair
        self.previous_indices_results[self.indices_sort[mask]] = new_indices_unsorted[mask]
        indicesClosest = self.previous_indices_results

        return indicesClosest

    @staticmethod
    def _create_indices_matrix(idx_A_k_pair, matrix_shape, len_quant_points):
        old_idx = 0
        final_indices = np.zeros(matrix_shape, dtype=int)
        for idx_A, idx_k in idx_A_k_pair:
            final_indices[old_idx:idx_A] = idx_k
            old_idx = idx_A
        final_indices[old_idx:] = len_quant_points - 1
        return final_indices
Function search_sorted1; 15.72285795211792s
Function search_sorted2; 13.030786037445068s
Function query; 2.3306031227111816s <- the one with use_k_optimization = True
Function query; 4.81286096572876s   <- with use_k_optimization = False
import numpy as np
import scipy
import scipy.spatial
import time


A = np.random.rand(10000*500) #5 million elements
k = np.random.rand(32)
k.sort()

#first attempt, detailed in the answer, too
def search_sorted1(A, k):
    indicesClosest = np.searchsorted(k, A)
    flagToReduce = indicesClosest == k.shape[0]
    modifiedIndicesToAvoidOutOfBoundsException = indicesClosest.copy()
    modifiedIndicesToAvoidOutOfBoundsException[flagToReduce] -= 1

    flagToReduce = np.logical_or(flagToReduce,
                        np.abs(A-k[indicesClosest-1]) <
                        np.abs(A - k[modifiedIndicesToAvoidOutOfBoundsException]))
    flagToReduce = np.logical_and(indicesClosest > 0, flagToReduce)
    indicesClosest[flagToReduce] -= 1

    return indicesClosest

#taken from @Divakar answer linked in the comments under the question
def search_sorted2(A, k):
    indicesClosest = np.searchsorted(k, A, side="left").clip(max=k.size - 1)
    mask = (indicesClosest > 0) & \
           ((indicesClosest == len(k)) | (np.fabs(A - k[indicesClosest - 1]) < np.fabs(A - k[indicesClosest])))
    indicesClosest = indicesClosest - mask

    return indicesClosest
def kdquery1(A, k):
    d = scipy.spatial.cKDTree(k, compact_nodes=False, balanced_tree=False)
    _, indices = d.query(A)
    return indices

#After an indea on scipy mailing list
class SearchSorted:
    def __init__(self, tensor, use_k_optimization=True):

        '''
        Using this requires storing 4x the size of the tensor.
        If use_k_optimization is True, the class will assume that successive calls will be made with similar k.
        When this happens, we can cut the running time significantly by storing additional variables. If it won't be
        called with successive k, set the flag to False, as otherwise would just consume more memory for no
        good reason
        '''

        self.indices_sort = np.argsort(tensor)
        self.sorted_tensor = tensor[self.indices_sort]
        self.inv_indices_sort = np.argsort(self.indices_sort)
        self.use_k_optimization = use_k_optimization

        self.previous_indices_results = None
        self.prev_idx_A_k_pair = None

    def query(self, k):
        midpoints = k[:-1] + np.diff(k) / 2
        idx_count = np.searchsorted(self.sorted_tensor, midpoints)
        idx_A_k_pair = []
        count = 0

        old_obj = 0
        for obj in idx_count:
            if obj != old_obj:
                idx_A_k_pair.append((obj, count))
                old_obj = obj
            count += 1

        if not self.use_k_optimization or self.previous_indices_results is None:
            #creates the index matrix in the sorted case
            final_indices = self._create_indices_matrix(idx_A_k_pair, self.sorted_tensor.shape, len(k))
            #and now unsort it to match the original tensor position
            indicesClosest = final_indices[self.inv_indices_sort]
            if self.use_k_optimization:
                self.prev_idx_A_k_pair = idx_A_k_pair
                self.previous_indices_results = indicesClosest
            return indicesClosest

        old_indices_unsorted = self._create_indices_matrix(self.prev_idx_A_k_pair, self.sorted_tensor.shape, len(k))
        new_indices_unsorted = self._create_indices_matrix(idx_A_k_pair, self.sorted_tensor.shape, len(k))
        mask = new_indices_unsorted != old_indices_unsorted

        self.prev_idx_A_k_pair = idx_A_k_pair
        self.previous_indices_results[self.indices_sort[mask]] = new_indices_unsorted[mask]
        indicesClosest = self.previous_indices_results

        return indicesClosest

    @staticmethod
    def _create_indices_matrix(idx_A_k_pair, matrix_shape, len_quant_points):
        old_idx = 0
        final_indices = np.zeros(matrix_shape, dtype=int)
        for idx_A, idx_k in idx_A_k_pair:
            final_indices[old_idx:idx_A] = idx_k
            old_idx = idx_A
        final_indices[old_idx:] = len_quant_points - 1
        return final_indices

mySearchSorted = SearchSorted(A, use_k_optimization=True)
mySearchSorted2 = SearchSorted(A, use_k_optimization=False)
allFunctions = [search_sorted1, search_sorted2,
                mySearchSorted.query,
                mySearchSorted2.query]

print(np.array_equal(mySearchSorted.query(k), kdquery1(A, k)[1]))
print(np.array_equal(mySearchSorted.query(k), search_sorted2(A, k)[1]))
print(np.array_equal(mySearchSorted2.query(k), search_sorted2(A, k)[1]))

if __name__== '__main__':
    num_to_average = 3
    for func in allFunctions:
        if func.__name__ == 'search_sorted3':
            indices_sort = np.argsort(A)
            sA = A[indices_sort].copy()
            inv_indices_sort = np.argsort(indices_sort)
        else:
            sA = A.copy()
        if func.__name__ != 'query':
            func_to_use = lambda x: func(sA, x)
        else:
            func_to_use = func
        k_to_use = k
        start_time = time.time()
        for idx_average in range(num_to_average):
            for idx_repeat in range(10):
                k_to_use += (2*np.random.rand(*k.shape)-1)/100 #uniform between (-1/100, 1/100)
                k_to_use.sort()
                indices = func_to_use(k_to_use)
                if func.__name__ == 'search_sorted3':
                    indices = indices[inv_indices_sort]
                val = k[indices]

        end_time = time.time()
        total_time = end_time-start_time

        print('Function {}; {}s'.format(func.__name__, total_time))