Python 是否有一个numpy内置项来拒绝列表中的异常值_Python_Numpy

Python 是否有一个numpy内置项来拒绝列表中的异常值

python numpy

Python 是否有一个numpy内置项来拒绝列表中的异常值,python,numpy,Python,Numpy,是否有一个numpy内置来执行以下操作？也就是说，获取一个列表d，并返回一个列表filtered\u d，其中根据d中的点的某些假定分布删除了任何外围元素 import numpy as np def reject_outliers(data): m = 2 u = np.mean(data) s = np.std(data) filtered = [e for e in data if (u - 2 * s < e < u + 2 * s)]

是否有一个numpy内置函数来执行类似以下的操作？也就是说，获取一个列表`d`，并返回一个列表`filtered_d`，其中根据`d`中的点的某些假定分布删除了所有离群元素。

import numpy as np

def reject_outliers(data, m=2):
    """Return the elements of ``data`` lying within ``m`` standard deviations of the mean.

    Args:
        data: Iterable of numbers (a list or a 1-D array).
        m: Half-width of the accepted band, in units of ``np.std(data)``.
            Defaults to 2, the threshold that was previously hard-coded.

    Returns:
        A list with the elements ``e`` of ``data`` that satisfy
        ``mean - m * std < e < mean + m * std`` (both bounds exclusive),
        in their original order.
    """
    u = np.mean(data)
    s = np.std(data)
    # The threshold used to be the literal 2 while the local ``m`` was unused;
    # it is now driven by the ``m`` parameter.
    return [e for e in data if u - m * s < e < u + m * s]

>>> d = [2,4,5,1,6,5,40]
>>> filtered_d = reject_outliers(d)
>>> print filtered_d
[2,4,5,1,6,5]

将numpy导入为np
def拒绝_异常值（数据）：
m=2
u=np.平均值（数据）
s=np.std（数据）
过滤=[e表示数据中的e，如果（u-2*s>>d=[2,4,5,1,6,5,40]
>>>过滤的\u d=拒绝异常值（d）
>>>打印过滤的
[2,4,5,1,6,5]

我之所以说“类似”，是因为该函数可能允许选择不同的分布（泊松分布、高斯分布等），以及这些分布内不同的异常值阈值（如我在这里使用的`m`）。

此方法与您的方法几乎相同，只是更多的numpy（也仅适用于numpy数组）：

def reject_outliers(data, m=2):
    return data[abs(data - np.mean(data)) < m * np.std(data)]

处理异常值时，重要的一点是，应尽量使用稳健的估计值。分布的平均值会因异常值而产生偏差，而中位数受到的影响要小得多。

基于eumiro的答案：

def reject_outliers(data, m = 2.):
    """Drop points whose distance from the median is too large.

    Each point's absolute deviation from the median is divided by the median
    of those deviations; points whose ratio is below ``m`` are kept.

    Args:
        data: 1-D array-like of numbers.
        m: Rejection threshold, in units of the median absolute deviation.

    Returns:
        A 1-D ``numpy`` array with the retained values.  When the median
        absolute deviation is zero, every ratio is treated as 0, so nothing
        is rejected for a positive ``m``.
    """
    data = np.asarray(data)
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    # The zero-deviation fallback must be an array: a scalar 0. turns
    # ``s < m`` into a scalar boolean, and indexing with that adds a
    # leading axis (result shape (1, n) instead of (n,)).
    s = d / mdev if mdev else np.zeros_like(d, dtype=float)
    return data[s < m]

def拒绝_异常值（数据，m=2）：
d=np.abs（数据-np.median（数据））
mdev=np.中值（d）
如果mdev为0，则s=d/mdev。
返回数据[s建立在Benjamin的基础上，使用pandas.Series

，并替换：

def拒绝异常值（sr，iq范围=0.5）：
pcnt=（1-智商范围）/2
qlow，中位数，qhigh=sr.dropna（）.分位数（[pcnt，0.50，1-pcnt]）
iqr=qhigh-qlow
返回sr[（sr-中值）.abs（）另一种方法是对标准偏差进行稳健估计（假设高斯统计）。查找在线计算器，我发现90%的百分位对应于1.2815σ，95%的百分位对应于1.645σ（）
举个简单的例子：
import numpy as np

def _print_stats(sample):
    """Print mean, median, extremes, standard deviation and 90th percentile of ``sample``."""
    print("Mean= ", np.mean(sample))
    print("Median= ", np.median(sample))
    print("Max/Min=", sample.max(), " ", sample.min())
    print("StdDev=", np.std(sample))
    print("90th Percentile", np.percentile(sample, 90))


# Draw 1000 samples from a normal distribution (mean 5, sigma 2)
x = np.random.normal(5, 2, 1000)

# Statistics of the clean sample
_print_stats(x)

# Contaminate three entries with very large offsets
x[10] += 1000
x[20] += 2000
x[30] += 1500

# Statistics of the contaminated sample
print()
_print_stats(x)

# Percentile landmarks used to estimate the standard deviation robustly
p90 = np.percentile(x, 90)
p10 = np.percentile(x, 10)
p50 = np.median(x)

# Median to 90th percentile spans 1.2815 sigma
rSig = (p90 - p50) / 1.2815
print("Robust Sigma=", rSig)

# 10th to 90th percentile spans twice that
rSig = (p90 - p10) / (2 * 1.2815)
print("Robust Sigma=", rSig)

我得到的结果是：
Mean=  4.99760520022
Median=  4.95395274981
Max/Min= 11.1226494654   -2.15388472011
Sigma= 1.976629928
90th Percentile 7.52065379649

Mean=  9.64760520022
Median=  4.95667658782
Max/Min= 2205.43861943   -2.15388472011
Sigma= 88.6263902244
90th Percentile 7.60646688694

Robust Sigma= 2.06772555531
Robust Sigma= 1.99878292462

这接近于预期值2
如果我们要删除高于/低于5个标准偏差的点（1000个点，我们希望1个值>3个标准偏差）：
我不知道哪种方法更有效/更稳健
当各点到中位数距离的中位数为0时，Benjamin Bannier的答案会原样返回全部数据（不做任何过滤），因此我发现下面这个修改版本对于示例中给出的情况更有用：
def reject_outliers_2(data, m=2.):
    """Keep the entries of ``data`` whose scaled distance from the median is below ``m``.

    The scale is the median of the absolute deviations from the median; when
    that scale is zero the raw deviations are compared against ``m`` instead
    (the divisor falls back to 1).

    Args:
        data: numpy array of numbers.
        m: Rejection threshold.

    Returns:
        The selection ``data[mask]`` of retained values.
    """
    deviation = np.abs(data - np.median(data))
    scale = np.median(deviation)
    if not scale:
        # Avoid dividing by zero: compare the unscaled deviations.
        scale = 1.
    keep = deviation / scale < m
    return data[keep]

给出：
[[10, 10, 10, 17, 10, 10]]  # 17 is not filtered
[10, 10, 10, 10, 10]  # 17 is filtered (it's distance, 7, is greater than m)

我想做一些类似的事情，除了将数字设置为NaN，而不是从数据中删除它，因为如果删除它，则会更改长度，这可能会导致打印混乱（即，如果只从表中的一列中删除异常值，但需要它与其他列保持相同，以便可以相互打印）
为此，我使用了：
我想在这个答案中提供两种方法，基于“z分数”的解决方案和基于“IQR”的解决方案
此答案中提供的代码既适用于一维numpy数组，也适用于多维numpy数组
让我们先导入一些模块
import collections
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr

基于z分数的方法
此方法将测试数字是否超出三个标准差。根据此规则，如果值为异常值，则该方法将返回true，如果不是，则返回false
def sd_outlier(x, axis = None, bar = 3, side = 'both'):
    """Flag values whose z-score exceeds ``bar``.

    Args:
        x: Array of values.
        axis: Axis passed on to ``scipy.stats.zscore``.
        bar: Z-score threshold.
        side: ``'gt'`` flags scores above ``bar``, ``'lt'`` flags scores below
            ``-bar``, ``'both'`` flags scores whose absolute value exceeds
            ``bar``.

    Returns:
        Boolean array, True where the value is an outlier.
    """
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'

    scores = stat.zscore(x, axis=axis)

    if side == 'both':
        return np.abs(scores) > bar
    if side == 'lt':
        return scores < -bar
    if side == 'gt':
        return scores > bar

最后，如果要过滤掉异常值，请使用numpy
选择器
祝你愉快。
考虑一下，当你的标准偏差由于巨大的异常值而变得非常大时，上述所有方法都会失败
（与此类似，平均值的计算也会失效，应改为计算中位数。不过，平均值“更容易出现stdDv这样的错误”。）
您可以尝试迭代应用算法，或者使用四分位数范围进行过滤：
（此处“因子”与n*西格玛范围相关，但仅当数据服从高斯分布时）
将numpy导入为np
def SORTOUTLIERS（数据输入，系数）：
quant3，quant1=np.百分位（数据输入[75,25]）
iqr=量程3-量程1
iqrSigma=iqr/1.34896
medData=np.中值（数据单位）
dataOut=[x代表数据输入if中的x（（x>medData-因子*iqrSigma）和（x
如果您想获取异常值的索引位置，`idx_list`会返回它们：
def reject_outliers(data, m = 2.):
    """Split ``data`` into retained values and the positions of its outliers.

    Each point's absolute deviation from the median is divided by the median
    of those deviations; a point is an outlier when that ratio is ``>= m``.

    Args:
        data: 1-D array-like of numbers.
        m: Rejection threshold, in units of the median absolute deviation.

    Returns:
        A tuple ``(kept, idx_list)``: the 1-D array of retained values and
        the 1-D array of index positions of the rejected values.  When the
        median absolute deviation is zero, every ratio is treated as 0.
    """
    data = np.asarray(data)
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    # The zero-deviation fallback must be an array: a scalar 0. makes both
    # masks scalar booleans, and indexing with those adds a leading axis to
    # each result.
    s = d / mdev if mdev else np.zeros_like(d, dtype=float)
    data_range = np.arange(len(data))
    idx_list = data_range[s >= m]
    return data[s < m], idx_list


data_points = np.array([8, 10, 35, 17, 73, 77])
print(reject_outliers(data_points))

after rejection: [ 8 10 35 17], index positions of outliers: [4 5]

def拒绝_异常值（数据，m=2）：
d=np.abs（数据-np.median（数据））
mdev=np.中值（d）
如果mdev为0，则s=d/mdev。
数据范围=np.arange（len（数据））
idx\u列表=数据范围[s>=m]
返回一组图像的数据[s（每个图像有3个维度），其中我想拒绝我使用的每个像素的异常值：
# Per-position mean and standard deviation taken along axis 0 of imgs
# (presumably the image index of the stack — TODO confirm).
mean = np.mean(imgs, axis=0)
std = np.std(imgs, axis=0)
# True where a value lies within 0.5 * std + 1 of the mean at its position;
# False marks the value as an outlier.
mask = np.greater(0.5 * std + 1, np.abs(imgs - mean))
# Multiplying by the boolean mask zeroes out the rejected values.
masked = np.multiply(imgs, mask)

然后可以计算平均值：
# Sum of the values along axis 0 of masked, divided by the sum of mask along
# the same axis.
# NOTE(review): if mask is boolean, its sum is the number of kept entries, so
# a position where everything was rejected would divide by zero — confirm
# whether that is acceptable.
masked_mean = np.divide(np.sum(masked, axis=0), np.sum(mask, axis=0))

（我将其用于背景减法）
在这里，我在x
中找到异常值，并用它们周围的一个点窗口（win
）的中位数替换它们（从Benjamin Bannier处获取中位数偏差）
def outlier\u平滑器（x，m=3，win=3，plots=False）：
''发现x中的异常值，点>m*mdev（x）[mdev:中值偏差]
并将其替换为其周围的赢点数中值“
x_corr=np.拷贝（x）
d=np.abs（x-np.median（x））
mdev=np.中值（d）
idxs_异常值=np.非零（d>m*mdev）[0]
对于idxs_中的i异常值：
如果i-win<0：
x_corr[i]=np.median（np.append（x[0:i]，x[i+1:i+win+1]））
如果i+win+1>len（x）：
x_corr[i]=np.median（np.append（x[i-win:i]，x[i+1:len（x）]）
其他：
x_corr[i]=np.中值（np.追加（x[i-win:i]，x[i+1:i+win+1]））
如果绘制：
plt.figure（'异常值更平滑'，clear=True）
plt.绘图（x，label='orig'，lw=5）
plt.plot（idxs_异常值，x[idxs_异常值]，'ro'，label='outliers'）
plt.绘图（x_corr，'-o'，label='corrected'）
plt。
def reject_outliers(data, m=2):
    """Mask the values of ``data`` lying outside ``mean ± m * std``.

    Prints the two bounds before returning.

    Args:
        data: Array-like of numbers.
        m: Half-width of the accepted interval, in standard deviations.

    Returns:
        The result of ``np.ma.masked_outside`` for ``data`` and the two
        bounds.
    """
    center = np.mean(data)
    half_width = np.std(data) * m
    lower, upper = center - half_width, center + half_width
    result = np.ma.masked_outside(data, lower, upper)
    print('Masking values outside of {} and {}'.format(lower, upper))
    return result

import collections
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr

def sd_outlier(x, axis = None, bar = 3, side = 'both'):
    """Return a boolean array marking the values of ``x`` whose z-score is beyond ``bar``.

    Args:
        x: Array of values.
        axis: Axis forwarded to ``scipy.stats.zscore``.
        bar: Z-score threshold.
        side: Which tail to test: ``'gt'`` (score > bar), ``'lt'``
            (score < -bar) or ``'both'`` (|score| > bar).

    Returns:
        Boolean array, True for outliers.
    """
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'

    z = stat.zscore(x, axis=axis)

    if side == 'gt':
        flagged = z > bar
    elif side == 'lt':
        flagged = z < -bar
    elif side == 'both':
        flagged = np.abs(z) > bar
    else:
        # Only reachable when assertions are disabled.
        flagged = None
    return flagged

def q1(x, axis = None):
    """Return the 25th percentile of ``x`` along ``axis``."""
    lower_quartile = np.percentile(x, q=25, axis=axis)
    return lower_quartile

def q3(x, axis = None):
    """Return the 75th percentile of ``x`` along ``axis``."""
    upper_quartile = np.percentile(x, q=75, axis=axis)
    return upper_quartile

def iqr_outlier(x, axis = None, bar = 1.5, side = 'both'):
    """Flag values lying more than ``bar`` interquartile ranges beyond the quartiles.

    Args:
        x: Array-like of values.
        axis: ``None`` (use all values), an int, or an iterable of ints
            naming the axes along which the quartiles are computed.
        bar: Multiple of the interquartile range added above the 75th
            percentile and subtracted below the 25th percentile.
        side: ``'gt'`` tests only the upper bound, ``'lt'`` only the lower
            bound, ``'both'`` tests either.

    Returns:
        Boolean array with the shape of ``x``, True where the value is an
        outlier.

    Raises:
        AssertionError: If ``side`` is not one of the accepted strings.
    """
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'

    x = np.asarray(x)
    if axis is not None and np.ndim(axis):
        # Accept any iterable of axes (list, tuple, ...).
        axis = tuple(axis)

    # keepdims=True leaves the reduced axes with length 1, so the quartiles
    # broadcast against x for every kind of axis, including None.  This
    # replaces the manual reshape, which failed for axis=None and relied on
    # collections.Iterable (removed in Python 3.10).
    d_q1, d_q3 = np.percentile(x, [25, 75], axis=axis, keepdims=True)
    # The interquartile range is the distance between those two percentiles.
    iqr_distance = np.multiply(d_q3 - d_q1, bar)

    if side in ['gt', 'both']:
        upper_outlier = np.greater(x, d_q3 + iqr_distance)
    if side in ['lt', 'both']:
        lower_outlier = np.less(x, d_q1 - iqr_distance)

    if side == 'gt':
        return upper_outlier
    if side == 'lt':
        return lower_outlier
    if side == 'both':
        return np.logical_or(upper_outlier, lower_outlier)

import numpy as np

def sortoutOutliers(dataIn,factor):
    quant3, quant1 = np.percentile(dataIn, [75 ,25])
    iqr = quant3 - quant1
    iqrSigma = iqr/1.34896
    medData = np.median(dataIn)
    dataOut = [ x for x in dataIn if ( (x > medData - factor* iqrSigma) and (x < medData + factor* iqrSigma) ) ] 
    return(dataOut)

def reject_outliers(data, m = 2.):
    """Return the retained values of ``data`` and the index positions of its outliers.

    A point is an outlier when its absolute deviation from the median,
    divided by the median of all such deviations, is ``>= m``.

    Args:
        data: 1-D array-like of numbers.
        m: Rejection threshold, in units of the median absolute deviation.

    Returns:
        A tuple ``(kept, idx_list)`` of two 1-D arrays: the retained values
        and the positions of the rejected ones.  When the median absolute
        deviation is zero, every ratio is treated as 0.
    """
    data = np.asarray(data)
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    # Use an array of zeros, not the scalar 0., for the zero-deviation case:
    # scalar boolean masks would add a leading axis to both results.
    s = d / mdev if mdev else np.zeros_like(d, dtype=float)
    data_range = np.arange(len(data))
    idx_list = data_range[s >= m]
    return data[s < m], idx_list


data_points = np.array([8, 10, 35, 17, 73, 77])
print(reject_outliers(data_points))

after rejection: [ 8 10 35 17], index positions of outliers: [4 5]

# Per-position mean and standard deviation taken along axis 0 of imgs
# (presumably the image index of the stack — TODO confirm).
mean = np.mean(imgs, axis=0)
std = np.std(imgs, axis=0)
# True where a value lies within 0.5 * std + 1 of the mean at its position;
# False marks the value as an outlier.
mask = np.greater(0.5 * std + 1, np.abs(imgs - mean))
# Multiplying by the boolean mask zeroes out the rejected values.
masked = np.multiply(imgs, mask)

# Sum of the kept values divided by the number of kept entries per position.
# NOTE(review): a position where every value was rejected divides 0 by 0 —
# confirm whether that is acceptable.
masked_mean = np.divide(np.sum(masked, axis=0), np.sum(mask, axis=0))

def outlier_smoother(x, m=3, win=3, plots=False):
    """Replace the outliers of ``x`` with the median of their neighbours.

    A point is an outlier when its absolute deviation from the median of
    ``x`` exceeds ``m`` times the median of those deviations.  Each outlier
    is replaced by the median of up to ``win`` points on either side of it,
    the outlier itself excluded.

    Args:
        x: 1-D numpy array.
        m: Outlier threshold, in units of the median deviation.
        win: Number of neighbours taken on each side.
        plots: When true, draw the original data, the outliers and the
            corrected data using ``plt``.

    Returns:
        A copy of ``x`` with the outliers replaced.
    """
    deviation = np.abs(x - np.median(x))
    idxs_outliers = np.nonzero(deviation > m * np.median(deviation))[0]

    x_corr = np.copy(x)
    for i in idxs_outliers:
        # A slice stop past the end is clamped automatically, so only the
        # left edge needs an explicit lower bound of 0.
        before = x[max(i - win, 0):i]
        after = x[i + 1:i + win + 1]
        x_corr[i] = np.median(np.append(before, after))

    if plots:
        plt.figure('outlier_smoother', clear=True)
        plt.plot(x, label='orig.', lw=5)
        plt.plot(idxs_outliers, x[idxs_outliers], 'ro', label='outliers')
        plt.plot(x_corr, '-o', label='corrected')
        plt.legend()

    return x_corr