Python: Computing a PDF from a Histogram
I have a heavily right-skewed histogram and want to compute the probability (the area under the PDF curve) for a range of lifetime values; for example, the probability of a lifetime value in (0, 0.01). The dataframe consists of LTV values computed as cumulative revenue / cumulative installs:
df['LTV']
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I tried using sklearn's KernelDensity; however, after fitting it to the histogram, it fails to capture the over-represented zeros:
import gc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

def plot_prob_density(df_lunch, field, x_start, x_end):
    plt.figure(figsize=(10, 7))
    unit = 0
    x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]

    # Plot the data using a normalized histogram
    plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)

    # Do kernel density estimation
    kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch)  # 0.00187

    # Plot the estimated density
    kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
    plt.plot(x, kd_vals_lunch, color='orange')
    plt.axvline(x=x_start, color='red', linestyle='dashed')
    plt.axvline(x=x_end, color='red', linestyle='dashed')

    # Show the plots
    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()
    gc.collect()
    return kd_lunch

kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1, 1), 'LTV', x_start=0, x_end=0.01)
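As an aside, the bandwidth above is hardcoded to 0.00187. A common alternative is to pick it by cross-validated grid search over candidate bandwidths; here is a minimal sketch using sklearn's GridSearchCV (the search range np.logspace(-3, -1, 20) is an illustrative assumption, not tuned to this data):

# Minimal sketch: select the KDE bandwidth by cross-validation.
# GridSearchCV scores each candidate by the held-out total log-likelihood.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

def best_bandwidth(values_2d, cv=5):
    grid = GridSearchCV(
        KernelDensity(kernel='gaussian'),
        {'bandwidth': np.logspace(-3, -1, 20)},  # illustrative candidate range
        cv=cv,
    )
    grid.fit(values_2d)
    return grid.best_params_['bandwidth']

# Usage: bw = best_bandwidth(final_df['LTV'].values.reshape(-1, 1))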
Then I find the probability like this:
def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Generate values in the range
    kd_vals = np.exp(kd.score_samples(x))  # Get PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)

print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
      .format(get_probability(start_value=0,
                              end_value=0.01,
                              eval_points=100,
                              kd=kd_lunch)))
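For what it's worth, the Riemann sum above can also be written with the trapezoidal rule via np.trapz, which is usually slightly more accurate for the same number of evaluation points. A minimal sketch, assuming the same fitted kd object:

# Minimal sketch: integrate the KDE's PDF over [start_value, end_value] with np.trapz.
def get_probability_trapz(start_value, end_value, eval_points, kd):
    x = np.linspace(start_value, end_value, eval_points)
    kd_vals = np.exp(kd.score_samples(x[:, np.newaxis]))  # PDF values at each x
    return np.trapz(kd_vals, x).round(4)  # trapezoidal approximation of the integral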
However, this approach does not produce the proper PDF values we are after.
Any suggestions for other approaches would be appreciated.
Plot: (image omitted)
I use a more or less similar script at work; here is my script, which may help you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
    plt.figure(figsize=(4, 3.5))
    unit = 1.5
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]

    # Plot the data using a normalized histogram
    plt.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)

    # Do kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)

    # Plot the estimated density
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth=2)
    plt.axvline(x=9.95, color='green', linestyle='dashed', linewidth=2.0, label='$β_o$')
    plt.axvline(x=1.9, color='black', linestyle='dashed', linewidth=2.0, label='$β_b$')
    plt.axvline(x=x_end, color='red', linestyle='dashed', linewidth=2, label='$β_{95\%}$')

    # Show the plots
    plt.xlabel('Beta', fontsize=10)
    plt.ylabel('Probability Density', fontsize=10)
    plt.title('02 hours window', fontsize=12)
    plt.xlim(-20, 20)
    plt.ylim(0, 0.3)
    plt.yticks([0, 0.1, 0.2, 0.3])
    plt.legend(fontsize=12, loc='upper left', frameon=False)
    plt.show()
    gc.collect()
    return kd_data1

def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Generate values in the range
    kd_vals = np.exp(kd.score_samples(x))  # Get PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)

data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
      .format(get_probability(start_value=-10,
                              end_value=13,
                              eval_points=1000,
                              kd=kd_data1)))
Can you post the code you used for the KDE with sklearn? Also, if possible, share the data you are trying to fit the PDF to, along with the incorrect output plot you got. When asking about code that depends on data, it is important to include a minimal data sample in your question. The easier you make it for us to copy and paste from your question (so we can run your code and test our solutions), the more likely you are to get an answer. Please read the guidance on minimal reproducible examples.
How many rows are in final_df? Are you asking why the sklearn kernel density used in the plot_prob_density function doesn't match the data/histogram, or are you asking whether your approach is valid? You didn't provide plot_prob_density, so we don't know what kd_lunch is and can only guess.
The post has been updated with the full code. It seems you are looking for a mixed random variable: a mixture of a continuous and a discrete component.
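Building on that last comment, one way to handle the over-represented zeros is a zero-inflated (mixed) model: estimate P(LTV = 0) as the empirical share of zeros and fit the KDE to the nonzero values only. A minimal sketch, assuming an illustrative bandwidth of 0.05 (the function name and parameters below are hypothetical, not from the original post):

# Minimal sketch of a zero-inflated ("mixed") model for LTV.
# Discrete part: empirical point mass at exactly 0.
# Continuous part: Gaussian KDE fitted to the nonzero values only.
import numpy as np
from sklearn.neighbors import KernelDensity

def mixed_probability(values, start, end, bandwidth=0.05, eval_points=1000):
    values = np.asarray(values, dtype=float).ravel()
    p_zero = np.mean(values == 0)  # probability mass at exactly 0
    nonzero = values[values > 0][:, np.newaxis]
    kd = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(nonzero)

    # Continuous contribution: integrate the KDE over [start, end],
    # weighted by the probability of being nonzero
    x = np.linspace(start, end, eval_points)
    pdf = np.exp(kd.score_samples(x[:, np.newaxis]))
    cont = np.trapz(pdf, x) * (1 - p_zero)

    # Discrete contribution: add the point mass if 0 lies in [start, end]
    disc = p_zero if start <= 0 <= end else 0.0
    return disc + cont

# Usage: mixed_probability(df['LTV'], start=0, end=0.01)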