Python 突出显示CDF图中的异常值区域_Python_Python 2.7_Matplotlib_Ecdf

Python 突出显示CDF图中的异常值区域

python python-2.7 matplotlib

Python 突出显示CDF图中的异常值区域,python,python-2.7,matplotlib,ecdf,Python,Python 2.7,Matplotlib,Ecdf,我试图突出显示CDF中“异常值”在我的可视化中所处的区域（可能是一个浅红色阴影来区分该区域）根据上述定义，您能否帮助对“异常值”点所在的区域进行着色？出于某种原因，当我试图查看异常值定义的作用时，我得到了一个空输出，无论是print（异常值\u iqr（天））还是print（str（异常值\u iqr（天）[1:-1]）。它只打印数组（[]，dtype=int64），这是我当前的代码： import numpy as np import pandas as pd import matplot

我试图突出显示CDF中“异常值”在我的可视化中所处的区域（可能是一个浅红色阴影来区分该区域）

根据上述定义，您能否帮助对“异常值”点所在的区域进行着色？出于某种原因，当我试图查看异常值定义的作用时，我得到了一个空输出，无论是

print(outliers_iqr(days))

还是

print(str(outliers_iqr(days))[1:-1])

。它只打印

array([], dtype=int64)，

这是我当前的代码：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]

#create CDF definition
def ecdf(data):
    """Return the empirical CDF of ``data`` as a pair of arrays ``(x, y)``.

    Parameters
    ----------
    data : array-like
        Sample values. Lists, NumPy arrays and pandas objects are accepted.
        Multi-dimensional input (for example a one-column DataFrame) is
        flattened so that every value belongs to one single sample.

    Returns
    -------
    x : numpy.ndarray
        The sample values, 1-D, sorted in ascending order.
    y : numpy.ndarray
        The cumulative fractions ``1/n, 2/n, ..., 1`` that pair with ``x``.
    """
    # np.sort works along the last axis. For a one-column DataFrame that axis
    # has length 1, so nothing would be reordered; flatten before sorting.
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys, factor=1.5):
    """Locate outliers with the interquartile-range (IQR) fence rule.

    A value counts as an outlier when it lies below ``Q1 - factor * IQR`` or
    above ``Q3 + factor * IQR``.

    Parameters
    ----------
    ys : array-like
        Sample values (list, NumPy array or pandas object).
    factor : float, optional
        Multiple of the IQR used for the fences. Defaults to 1.5.

    Returns
    -------
    tuple of numpy.ndarray
        The indices of the outliers in ``np.where`` form, one index array per
        dimension of ``ys``. The arrays are empty when no value falls outside
        the fences.
    """
    # Coerce first: comparing a plain Python list with a float does not
    # broadcast element-wise.
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * factor)
    upper_bound = quartile_3 + (iqr * factor)
    return np.where((ys > upper_bound) | (ys < lower_bound))

# Question script: plot the ECDF of `a`, mark the mean and the quartiles, and
# (not yet working) shade the region where the IQR outliers lie.
# NOTE: the two fill_between calls below still contain literal `?`
# placeholders, so this listing is not runnable as written.
days = pd.DataFrame({"days" : a})

# x: values returned by ecdf's np.sort call, y: cumulative fractions 1/n .. 1
x, y = ecdf(days)

plt.plot(x, y, marker='.', linestyle='none') 
plt.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) # Vertical dashed line at the mean

# Coordinates of the mean on the ECDF: integer x value and its percentile
# rank rescaled from 0..100 to 0..1.
x_m = int(x.mean())
# NOTE(review): DataFrame.as_matrix() is deprecated/removed in newer pandas
# releases - confirm against the pandas version in use.
y_m = stats.percentileofscore(days.as_matrix(), x.mean())/100.0

ax=plt.gca()
# Label the mean as "(value,percentile)" slightly offset from the point.
ax.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), 
            xytext=(10,-5), textcoords='offset points')

# outliers_iqr returns the np.where index arrays of the values outside the
# 1.5*IQR fences; the arrays are empty when no value lies outside them.
outliers= outliers_iqr(days) 
print(outliers_iqr(days)) # Prints the index arrays (empty arrays when there are no outliers)
print(str(outliers_iqr(days))[1:-1]) # Same output with the first and last character stripped

# Highlight the outlier area in the CDF plot (arguments still to be filled in)
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 0 and 1st quartile
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 3rd quartile and 1

# Quartile markers: x from the data percentiles, y from the percentile levels.
percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0

plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles

# NOTE: this loop rebinds x and y, which held the ECDF arrays until here.
for x,y in zip(x_p, y_p):                                        
    ax.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')

plt.xlabel('Days')
plt.ylabel('ECDF')
# Three labels are supplied: 'Days', "Mean" and 'Quartiles'.
plt.legend(('Days', "Mean", 'Quartiles'), loc='lower right')

plt.show()

将numpy导入为np
作为pd进口熊猫
将matplotlib.pyplot作为plt导入
从scipy导入统计信息
a=[38935013034439251 28309357 64380332105，
50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
118, 105, 92, 133, 77, 54, 72, 34]
#创建CDF定义
def ecdf（数据）：
n=len（数据）
x=np.排序（数据）
y=np.arange（1.0，n+1）/n
返回x，y
#使用+-1.5x IQR方法定义异常值
def异常值_iqr（ys）：
四分位_1，四分位_3=np.百分位（ys，[25,75]）
iqr=四分位_3-四分位_1
下限=四分位数（iqr*1.5）
上限=四分位数3+（iqr*1.5）
返回np.where（（ys>上界）|（ys<下界））
days=pd.DataFrame（{“days”：a}）
x、 y=ecdf（天）
plt.plot（x，y，marker=''，linestyle='none'）
plt.axvline（x.mean（），color='gray'，linestyle='虚线'，linewidth=2）#添加平均值
x_m=int（x.mean（））
y_m=stats.percentileofscore（天数，如矩阵（），x.mean（））/100.0
ax=plt.gca（）
ax.注释（“（%s，%s）”（x_m，int（y_m*100）），xy=（x_m，y_m），
xytext=（10，-5），textcoords='offset points'）
异常值=异常值（天）
打印（异常值_iqr（天））#打印异常值-不打印
打印（str（异常值_iqr（天））[1:-1]）#相同
#突出显示CDF图中的异常值区域
最大填充量在（？，？，式中=？，面色为红色，α=0.3）之间#在0和第一个四分位数之间
第三个四分位数和第1个四分位数之间的最大填充
百分位数=np.数组（[25,50,75]）
x_p=np.百分位数（天，百分位数）
y_p=百分位数/100.0
plt.plot（x_p，y_p，marker='D'，color='red'，linestyle='none'）#叠加四分位数
对于拉链中的x，y（x\u p，y\p）：
注释（“%s”%int（x），xy=（x，y），xytext=（10，-5），textcoords='offset points'）
plt.xlabel（“天”）
plt.ylabel（'ECDF'）
plt.图例（（‘天’、‘平均数’、‘四分位数’），位置为右下角）
plt.show（）

如果异常值数组有时可能为空，则必须使用

if

语句来处理这种情况。此外，由于您只想对绘图区域进行着色处理，因此实际上可以使用

Axes.axvspan

来实现此目的。下面是在原代码基础上稍作修改的示例（所有绘图命令都移入了一个函数，并增加了第二个子图，其数据确实包含异常值）：

将numpy导入为np
作为pd进口熊猫
将matplotlib.pyplot作为plt导入
从scipy导入统计信息
a=[38935013034439251 28309357 64380332105，
50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
118, 105, 92, 133, 77, 54, 72, 34]
#创建CDF定义
def ecdf（数据）：
n=len（数据）
x=np.排序（数据）
y=np.arange（1.0，n+1）/n
返回x，y
#使用+-1.5x IQR方法定义异常值
def异常值_iqr（ys）：
四分位_1，四分位_3=np.百分位（ys，[25,75]）
iqr=四分位_3-四分位_1
下限=四分位数（iqr*1.5）
上限=四分位数3+（iqr*1.5）
返回np.where（（ys<下界）），np.where（（ys>上界））
def生成_图（ax、df）：
x、 y=ecdf（df）
ax.plot（x，y，marker='.'，linestyle='none'）
ax.axvline（x.mean（），color='gray'，linestyle='虚线'，linewidth=2）#添加平均值
x_m=int（x.mean（））
y_m=stats.percentileofscore（df.as_matrix（），x.mean（））/100.0
ax.注释（“（%s，%s）”（x_m，int（y_m*100）），xy=（x_m，y_m），
xytext=（10，-5），textcoords='offset points'）
离群值=离群值（df.值）
#突出显示CDF图中的异常值区域
对于异常值中的outl：
VAL=df.值[outl]
如果VAL.size>0：
ax.axvspan（np.min（vals），np.max（vals），alpha=0.5，颜色为红色）
百分位数=np.数组（[25,50,75]）
x_p=np.百分位数（df，百分位数）
y_p=百分位数/100.0
ax.plot（x_p，y_p，marker='D'，color='red'，linestyle='none'）#叠加四分位数
对于拉链中的x，y（x\u p，y\p）：
注释（“%s”%int（x），xy=（x，y），xytext=（10，-5），textcoords='offset points'）
ax.set\u xlabel（'Days'）
ax.set_ylabel（'ECDF'）
ax.图例（（‘天’、‘平均值’、‘四分位数’），位置为右下角）
图，轴=plt.子批次（nrows=1，ncols=2，figsize=（10,5））
##原始数据
days=pd.DataFrame（{“days”：a}）
生成_图（轴[0]，天）
##带有异常值的伪数据
b=np.concatenate([
np.随机正常（200,50300），
np.随机正常（25,10,20），
np.随机正常（375,10,20），
])
np.random.shuffle（b）
生成_图（轴[1]，pd.DataFrame（{“天”：b}））
##命名子地块
轴[0]。设置标题（“原始数据”）
轴[1]。设置标题（“带异常值的假数据”）
plt.show（）

结果如下所示：

希望这能有所帮助。

什么标准可以定义定义“o”的“更好的方法”

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]


#create CDF definition
def ecdf(data):
    """Compute the empirical cumulative distribution function of a sample.

    Parameters
    ----------
    data : array-like
        The observations. Any shape is accepted; the values are flattened
        into a single sample, so a one-column DataFrame behaves like a plain
        1-D sequence.

    Returns
    -------
    x : numpy.ndarray
        1-D array of the observations in ascending order.
    y : numpy.ndarray
        1-D array ``[1/n, 2/n, ..., 1]`` giving the cumulative fraction of
        observations less than or equal to the matching entry of ``x``.
    """
    # Flatten before sorting: np.sort operates on the last axis, which for an
    # (n, 1) column would leave the values in their original order.
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys, factor=1.5):
    """Find low and high outliers with the interquartile-range (IQR) rule.

    Values below ``Q1 - factor * IQR`` are low outliers and values above
    ``Q3 + factor * IQR`` are high outliers.

    Parameters
    ----------
    ys : array-like
        Sample values (list, NumPy array or pandas object).
    factor : float, optional
        Multiple of the IQR used for the fences. Defaults to 1.5.

    Returns
    -------
    (low, high) : tuple
        Two ``np.where`` results, the first for the low outliers and the
        second for the high outliers. Each is a tuple of index arrays (one
        per dimension of ``ys``) and can be used directly to index the data.
        The index arrays are empty when that side has no outliers.
    """
    # Coerce first: comparing a plain Python list with a float does not
    # broadcast element-wise.
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * factor)
    upper_bound = quartile_3 + (iqr * factor)

    return np.where(ys < lower_bound), np.where(ys > upper_bound)



def generate_plot(ax, df):
    """Draw an annotated ECDF of ``df`` on ``ax`` and shade the outlier ranges.

    The plot contains the ECDF points, a dashed vertical line at the mean
    (annotated with its value and percentile rank), diamond markers at the
    25th/50th/75th percentiles, and one red span per side (low/high) covering
    the range from the smallest to the largest IQR outlier on that side.
    Nothing is shaded for a side that has no outliers.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    df : pandas.DataFrame or array-like
        The sample. All values are treated as one sample.

    Returns
    -------
    None
    """
    # Work on one flat array throughout. DataFrame.as_matrix() no longer
    # exists in current pandas, and an (n, 1) array is neither sorted by
    # np.sort's default last-axis sort nor reduced to a single scalar rank by
    # stats.percentileofscore in current SciPy.
    values = np.asarray(df).ravel()

    x, y = ecdf(values)
    ax.plot(x, y, marker='.', linestyle='none')

    mean = x.mean()
    ax.axvline(mean, color='gray', linestyle='dashed', linewidth=2)  # mean marker

    # Position of the mean on the ECDF: percentile rank rescaled to 0..1.
    x_m = int(mean)
    y_m = stats.percentileofscore(values, mean) / 100.0

    ax.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m),
                xytext=(10, -5), textcoords='offset points')

    # outliers_iqr yields one index set for the low side and one for the
    # high side; shade the value range spanned by each non-empty set.
    for outl in outliers_iqr(values):
        vals = values[outl]
        if vals.size > 0:
            ax.axvspan(np.min(vals), np.max(vals), alpha=0.5, color='red')

    percentiles = np.array([25, 50, 75])
    x_p = np.percentile(values, percentiles)
    y_p = percentiles / 100.0

    ax.plot(x_p, y_p, marker='D', color='red', linestyle='none')  # overlay quartiles

    # Separate loop names so the ECDF arrays x and y are not rebound.
    for q_x, q_y in zip(x_p, y_p):
        ax.annotate('%s' % int(q_x), xy=(q_x, q_y), xytext=(10, -5),
                    textcoords='offset points')

    ax.set_xlabel('Days')
    ax.set_ylabel('ECDF')
    ax.legend(('Days', "Mean", 'Quartiles'), loc='lower right')


# Two panels side by side: the real sample on the left, synthetic data on the
# right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: the original measurements.
days = pd.DataFrame({"days": a})
generate_plot(axes[0], days)
axes[0].set_title('original data')

# Right panel: synthetic data built from three normal clusters given as
# (mean, standard deviation, sample count), drawn in this order and then
# shuffled in place.
clusters = [(200, 50, 300), (25, 10, 20), (375, 10, 20)]
b = np.concatenate([np.random.normal(*spec) for spec in clusters])
np.random.shuffle(b)
generate_plot(axes[1], pd.DataFrame({"days": b}))
axes[1].set_title('fake data with outliers')

plt.show()