Python 查找两个值之间的唯一数据_Python_Numpy

Python 查找两个值之间的唯一数据

python numpy

Python 查找两个值之间的唯一数据,python,numpy,Python,Numpy,我希望快速找到两个值之间唯一值（本例中为历元时间）的索引，只返回minVal和maxVal之间的所有值（但不是两次）。下面是一个简化示例： import numpy as np minVal = 198000 maxVal = 230000 uniqueExample = np.arange(300, dtype=float) # this is how it expected to exist # this is how it actually exists, a small repea

我希望快速找到两个值之间唯一值（本例中为历元时间）的索引，只返回minVal和maxVal之间的所有值（但不是两次）。下面是一个简化示例：

import numpy as np 
minVal = 198000  
maxVal = 230000
uniqueExample = np.arange(300, dtype=float) # this is how it expected to exist
# this is how it actually exists, a small repeated values randomly interspersed  
example = np.insert(uniqueExample, 200, np.arange(200,210.))*1000 # *1000 to differentiate from the indices


# now begin process of isolating 
mask = (example < maxVal) & (example > minVal)
idx = np.argwhere(mask).squeeze()

为了改进结果，添加了以下内容这将返回所需的结果当检索索引O（100）时，这可以正常工作，但对于较大的数据集O（100000）+（有时似乎无法删除所有重复项）来说，这很慢，因此我提出了一些似乎仍然很慢的选项，我希望有人能解释这些选项的慢，或者找到更好/更快的方法。速度是个问题

import time
# define testing function for test functions below 
def timing(f, n, a):
    print(f.__name__,)
    r = range(n)
    t1 = time.perf_counter()
    for i in r:
        f(a[0],a[1],a[2]); f(a[0],a[1],a[2]); 
    t2 = time.perf_counter()
    print(round(t2-t1, 3))

def gettimeBase(example, minVal, maxVal):
    # this is target (speed and simplicity), but returns duplicates
    mask = (example >= minVal) & (example  < maxVal)
    idx = np.argwhere(mask).squeeze()
    return idx
## now one's that don't return duplicates
def gettime1(example, minVal, maxVal):
    mask = (example >= minVal) & (example < maxVal)
    idx = np.argwhere(mask).squeeze()
    if np.size(idx) == 0:
        idx = None
    if len(set(example[idx])) !=len(example[idx]):
     ## when there are duplicate times on the server
        times, idxUnique = np.unique(example, return_index=True)
        mask2 = (times >= minVal) & (times < maxVal)
        idx2 = np.argwhere(mask2).squeeze()
        idx = idxUnique[idx2].squeeze()
        assert (sorted(set(example[idx])) == example[idx]).all(), 'Data Still have duplicate times'
     return idx

def gettime2(example, minVal, maxVal):
    if len(set(example)) != len(example):
        ## when there are duplicate times on the server
        times, idxUnique = np.unique(example, return_index=True)
        mask2 = (times >= minVal) & (times < maxVal)
        idx2 = np.argwhere(mask2).squeeze()
        idx = idxUnique[idx2].squeeze()
    else:
        mask = (example >= minVal) & (example < maxVal)
        idx = np.argwhere(mask).squeeze()
    if np.size(idx) == 0:
        return None
    assert (sorted(set(example[idx])) == example[idx]).all(), 'Data Still have duplicate times'
    return idx

testdata = (example, minValue, maxValue)
testfuncs = gettimeBase, gettime1, gettime2
for f in testfuncs:
    timing(f, 100, testdata)

导入时间
#为下面的测试功能定义测试功能
def正时（f、n、a）：
打印（f.\uuuuu名称\uuuuuu，）
r=范围（n）
t1=时间。性能计数器（）
对于r中的i：
f（a[0]，a[1]，a[2]）；f（a[0]，a[1]，a[2]）；
t2=时间。性能计数器（）
打印（圆形（t2-t1，3））
def gettimeBase（例如，minVal、maxVal）：
#这是目标（速度和简单性），但返回重复项
掩码=（示例>=minVal）和（示例=minVal）和（示例=最小值）和（时间<最大值）
idx2=np.argwhere（mask2.squence（）
idx=idxUnique[idx2]。挤压（）
断言（已排序（集合（示例[idx]））==示例[idx]）.all（），“数据仍有重复时间”
返回idx
def gettime2（例如，minVal、maxVal）：
如果len（设置（示例））！=len（示例）：
##当服务器上存在重复时间时
times，idxUnique=np.unique（例如，return\u index=True）
mask2=（时间>=最小值）和（时间<最大值）
idx2=np.argwhere（mask2.squence（）
idx=idxUnique[idx2]。挤压（）
其他：
掩码=（示例>=minVal）和（示例


测试结果如下（python 3）：
gettimeBase
0.127
gettime1
35.103
gettime2
74.953选项1
numpy.unique
此选项速度很快，但它将返回每个副本第一次出现的索引，而在您的问题中，您似乎正在获取副本的最后一个索引。这意味着此方法返回的索引将与所需的输出不匹配，但它们对应的值将相同
vals, indices = np.unique(example[mask], return_index=True)
indices + np.argmax(mask)


下面是我提到的警告：
desired = np.array([199, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
   222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
   235, 236, 237, 238, 239])

np.array_equal(start + idx, desired)
# False

np.array_equal(example[start + idx], example[desired])
# True


选项2
numpy.unique
+numpy.flip

这实际上捕获了最后一次事件，但增加了更多开销：
np.array_equal(final + idx[0], desired)
# True


性能（我包括一些安装成本）
def chris1（arr、mn、mx）：
掩码=（arrmn）
VAL，索引=np.unique（arr[mask]，返回\索引=True）
返回索引+np.argmax（掩码）
def chris2（arr、mn、mx）：
掩码=（arrmn）
f=np.flip（arr[mask]）
VAL，索引=np.unique（f，返回值\索引=True）
最终=f.形状[0]-1-索引
返回最终+np.argmax（掩码）
def sbfrf（arr、mn、mx）：
掩码=（arrmn）
idx=np.argwhere（mask.squence（））
如果len（set（例如[idx]）！=len（示例[idx]）：
dupes=np.array（[x代表n，枚举中的x（示例[idx]），如果示例[idx][：n]]中的x是）。squence（）
idx=np.delete（idx，np.nonzero（np.in1d（例如[idx]，dupes.squence（）[：：2]））
返回idx
在[225]中：%timeit chris1（例如，198_000，230_000）
每个回路29.6µs±133 ns（7次运行的平均值±标准偏差，每个10000个回路）
在[226]中：%timeit chris2（例如，198_000，230_000）
每个回路36.5µs±98.6 ns（7次运行的平均值±标准偏差，每个10000个回路）
在[227]中：%timeit sbfrf（例如，198_000、230_000）
每个回路463µs±7.77µs（7次运行的平均值±标准偏差，每个1000个回路）
我回答时略带警告。您可以返回重复值的第一个索引而不是最后一个索引吗？在本例中，返回的索引可以引用示例中的任何一个重复值，因为它们只是重复数据
vals, indices = np.unique(example[mask], return_index=True)
indices + np.argmax(mask)

array([199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 220, 221,
       222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
       235, 236, 237, 238, 239], dtype=int64)

desired = np.array([199, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
   222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
   235, 236, 237, 238, 239])

np.array_equal(start + idx, desired)
# False

np.array_equal(example[start + idx], example[desired])
# True

f = np.flip(example[mask])
vals, indices = np.unique(f, return_index=True)
final = f.shape[0] - 1 - indices
final + np.argmax(mask)

array([199, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
       222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
       235, 236, 237, 238, 239], dtype=int64)

np.array_equal(final + idx[0], desired)
# True

def chris1(arr, mn, mx):
    mask = (arr < mx) & (arr > mn)
    vals, indices = np.unique(arr[mask], return_index=True)
    return indices + np.argmax(mask)

def chris2(arr, mn, mx):
    mask = (arr < mx) & (arr > mn)
    f = np.flip(arr[mask])
    vals, indices = np.unique(f, return_index=True)
    final = f.shape[0] - 1 - indices
    return final + np.argmax(mask)

def sbfrf(arr, mn, mx):
    mask = (arr < mx) & (arr > mn)
    idx = np.argwhere(mask).squeeze()
    if len(set(example[idx])) != len(example[idx]):
        dupes = np.array([x for n, x in enumerate(example[idx]) if x in example[idx][:n]]).squeeze()
    idx = np.delete(idx, np.nonzero(np.in1d(example[idx], dupes).squeeze()[::2]))
    return idx

In [225]: %timeit chris1(example, 198_000, 230_000)
29.6 µs ± 133 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [226]: %timeit chris2(example, 198_000, 230_000)
36.5 µs ± 98.6 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [227]: %timeit sbfrf(example, 198_000, 230_000)
463 µs ± 7.77 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)