如何在python中实现EM-GMM?
我已实施以下措施,以避免使用此帖子失败:如何在python中实现EM-GMM?,python,numpy,machine-learning,scikit-learn,gmm,Python,Numpy,Machine Learning,Scikit Learn,Gmm,我已实施以下措施,以避免使用此帖子失败: import numpy as np def PDF(data, means, variances): return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps))) def EM_GMM(data, k, iterations): weights = np.ones((k, 1
import numpy as np
def PDF(data, means, variances):
return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps)))
def EM_GMM(data, k, iterations):
weights = np.ones((k, 1)) / k # shape=(k, 1)
means = np.random.choice(data, k)[:, np.newaxis] # shape=(k, 1)
variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)
data = np.repeat(data[np.newaxis, :], k, 0) # shape=(k, n)
for step in range(iterations):
# Expectation step
likelihood = PDF(data, means, np.sqrt(variances)) # shape=(k, n)
# Maximization step
b = likelihood * weights # shape=(k, n)
b /= np.sum(b, axis=1)[:, np.newaxis] + eps
# updage means, variances, and weights
means = np.sum(b * data, axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
variances = np.sum(b * np.square(data - means), axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
weights = np.mean(b, axis=1)[:, np.newaxis]
return means, variances
当我在一维时间序列数据集上运行该算法时,对于k等于3,它返回如下输出:
array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
3.05053810e-003, 2.36989898e-025, 2.36989898e-025,
1.32797395e-136, 6.91134950e-031, 5.47347807e-001,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 2.25849208e-064, 0.00000000e+000,
1.61228562e-303, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 3.94387272e-242,
1.13078186e+000, 2.53108878e-001, 5.33548114e-001,
9.14920432e-001, 2.07015697e-013, 4.45250680e-038,
1.43000602e+000, 1.28781615e+000, 1.44821615e+000,
1.18186109e+000, 3.21610659e-002, 3.21610659e-002,
3.21610659e-002, 3.21610659e-002, 3.21610659e-002,
2.47382844e-039, 0.00000000e+000, 2.09150855e-200,
0.00000000e+000, 0.00000000e+000],
[5.93203066e-002, 1.01647068e+000, 5.99299162e-001,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 2.14690238e-010,
2.49337135e-191, 5.10499986e-001, 9.32658804e-001,
1.21148135e+000, 1.13315278e+000, 2.50324069e-237,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 1.73966953e-125, 2.53559290e-275,
1.42960975e-065, 7.57552338e-001],
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
3.05053810e-003, 2.36989898e-025, 2.36989898e-025,
1.32797395e-136, 6.91134950e-031, 5.47347807e-001,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
1.44637007e+000, 2.25849208e-064, 0.00000000e+000,
1.61228562e-303, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 3.94387272e-242,
1.13078186e+000, 2.53108878e-001, 5.33548114e-001,
9.14920432e-001, 2.07015697e-013, 4.45250680e-038,
1.43000602e+000, 1.28781615e+000, 1.44821615e+000,
1.18186109e+000, 3.21610659e-002, 3.21610659e-002,
3.21610659e-002, 3.21610659e-002, 3.21610659e-002,
2.47382844e-039, 0.00000000e+000, 2.09150855e-200,
0.00000000e+000, 0.00000000e+000]])
我认为这是错误的,因为输出是两个向量,其中一个表示值,另一个表示方差
值。让我对实现产生怀疑的模糊点是,它返回了大部分输出的0.00000000e+000
,正如它所看到的,并且它不需要真正地可视化这些输出。顺便说一句,输入数据是时间序列数据。我已经检查了所有东西并跟踪了多次,但没有发现任何bug
以下是我的输入数据:
[25.31 , 24.31 , 24.12 , 43.46 , 41.48666667,
41.48666667, 37.54 , 41.175 , 44.81 , 44.44571429,
44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
44.44571429, 44.44571429, 39.71 , 26.69 , 34.15 ,
24.94 , 24.75 , 24.56 , 24.38 , 35.25 ,
44.62 , 44.94 , 44.815 , 44.69 , 42.31 ,
40.81 , 44.38 , 44.56 , 44.44 , 44.25 ,
43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
40.75 , 32.31 , 36.08 , 30.135 , 24.19 ]
我想知道是否有一种优雅的方法可以通过numpy
或SciKit-learn
实现它。任何帮助都将不胜感激
更新
以下是当前输出和预期输出:
正如我在评论中提到的,我看到的关键点是意味着初始化。在的默认实现之后,我切换到KMeans,而不是随机初始化
将numpy导入为np
导入seaborn作为sns
将matplotlib.pyplot作为plt导入
plt.style.use('seaborn')
eps=1e-8
def PDF(数据、平均值、差异):
返回1/(np.sqrt(2*np.pi*方差)+每股收益)*np.exp(-1/2*(np.sqrt(数据-均值)/(方差+每股收益)))
def EM_GMM(数据,k=3,迭代次数=100,init_strategy='kmeans'):
权重=np.one((k,1))/k#形状=(k,1)
如果init_策略=='kmeans':
从sklearn.cluster导入KMeans
km=KMeans(k).fit(数据[:,无])
平均数=km.聚类中心形状=(k,1)
其他:#初始策略=='random'
平均数=np.random.choice(数据,k)[,np.newaxis]#shape=(k,1)
方差=np.random.random_样本(大小=k)[,np.newaxis]#形状=(k,1)
数据=np.重复(数据[np.newaxis,:],k,0)#形状=(k,n)
对于步进范围(迭代):
#期望步
似然=PDF(数据,均值,np.sqrt(方差))#形状=(k,n)
#最大化步长
b=可能性*权重#形状=(k,n)
b/=np.sum(b,axis=1)[:,np.newaxis]+eps
#升级平均值、方差和权重
平均数=np.sum(b*数据,轴=1)[,np.newaxis]/(np.sum(b,轴=1)[,np.newaxis]+eps)
方差=np.和(b*np.平方(数据-均值),轴=1)[:np.新轴]/(np.和(b,轴=1)[:np.新轴]+eps)
权重=np.平均值(b,轴=1)[:,np.新轴]
返回均值、方差
这似乎可以更一致地产生所需的输出:
s=np.array([25.31,24.31,24.12,43.46,41.486667,
41.48666667, 37.54 , 41.175 , 44.81 , 44.44571429,
44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
44.44571429, 44.44571429, 39.71 , 26.69 , 34.15 ,
24.94 , 24.75 , 24.56 , 24.38 , 35.25 ,
44.62 , 44.94 , 44.815 , 44.69 , 42.31 ,
40.81 , 44.38 , 44.56 , 44.44 , 44.25 ,
43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
40.75 , 32.31 , 36.08 , 30.135 , 24.19 ])
k=3
n_iter=100
均值、方差=EM_GMM(s、k、n_iter)
打印(平均值、差异)
[[44.42596231]
[24.509301 ]
[35.4137508 ]]
[[0.07568723]
[0.10583743]
[0.52125856]]
#绘制结果
颜色=[“绿色”、“红色”、“蓝色”、“黄色”]
垃圾箱=np.linspace(np.min(s)-2,np.max(s)+2100)
plt.图(figsize=(10,7))
plt.xlabel(“$x$”)
plt.ylabel('pdf')
sns.散点图(s,[0.05]*len(s),color='navy',s=40,marker=2,label='Series data')
对于枚举中的i,(m,v)(zip(均值,方差)):
lineplot(bin,PDF(bin,m,v),color=colors[i],label=f'Cluster{i+1}'))
plt.legend()
plt.plot()
最后我们可以看到,纯随机初始化会产生不同的结果;让我们看看结果的意思是:
范围(5)内的uu的:
打印(EM_GMM(s,k,n_iter,init_strategy='random')[0],'\n')
[[44.42596231]
[44.42596231]
[44.42596231]]
[[44.42596231]
[24.509301 ]
[30.1349997 ]]
[[44.42596231]
[35.4137508 ]
[44.42596231]]
[[44.42596231]
[30.1349997 ]
[44.42596231]]
[[44.42596231]
[44.42596231]
[44.42596231]]
我们可以看到这些结果有多大的不同,在某些情况下,结果的平均值是常数,这意味着我们选择了3个相似的值,并且在迭代时没有太大的变化。在EM\u GMM
中添加一些打印语句将澄清这一点
# Expectation step
likelihood = PDF(data, means, np.sqrt(variances))
- 为什么我们要传递
差异的sqrt
?pdf函数接受差异。所以这应该是PDF(数据、均值、方差)
另一个问题,
# Maximization step
b = likelihood * weights # shape=(k, n)
b /= np.sum(b, axis=1)[:, np.newaxis] + eps
- 上面的第二行应该是
b/=np.sum(b,axis=0)[:,np.newaxis]+eps
同样在差异的初始化中
variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)
- 为什么我们要初始化随机变量?我们有
数据
和均值
,为什么不计算当前估计的方差,如vars=np.展开(np.均值(np.平方(数据-均值),轴=1),-1)
有了这些变化,下面是我的实现
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')
eps=1e-8
def pdf(data, means, vars):
denom = np.sqrt(2 * np.pi * vars) + eps
numer = np.exp(-0.5 * np.square(data - means) / (vars + eps))
return numer /denom
def em_gmm(data, k, n_iter, init_strategy='k_means'):
weights = np.ones((k, 1), dtype=np.float32) / k
if init_strategy == 'k_means':
from sklearn.cluster import KMeans
km = KMeans(k).fit(data[:, None])
means = km.cluster_centers_
else:
means = np.random.choice(data, k)[:, np.newaxis]
data = np.repeat(data[np.newaxis, :], k, 0)
vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)
for step in range(n_iter):
p = pdf(data, means, vars)
b = p * weights
denom = np.expand_dims(np.sum(b, axis=0), 0) + eps
b = b / denom
means_n = np.sum(b * data, axis=1)
means_d = np.sum(b, axis=1) + eps
means = np.expand_dims(means_n / means_d, -1)
vars = np.sum(b * np.square(data - means), axis=1) / means_d
vars = np.expand_dims(vars, -1)
weights = np.expand_dims(np.mean(b, axis=1), -1)
return means, vars
def main():
s = np.array([25.31, 24.31, 24.12, 43.46, 41.48666667,
41.48666667, 37.54, 41.175, 44.81, 44.44571429,
44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
44.44571429, 44.44571429, 39.71, 26.69, 34.15,
24.94, 24.75, 24.56, 24.38, 35.25,
44.62, 44.94, 44.815, 44.69, 42.31,
40.81, 44.38, 44.56, 44.44, 44.25,
43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
40.75, 32.31, 36.08, 30.135, 24.19])
k = 3
n_iter = 100
means, vars = em_gmm(s, k, n_iter)
y = 0
colors = ['green', 'red', 'blue', 'yellow']
bins = np.linspace(np.min(s) - 2, np.max(s) + 2, 100)
plt.figure(figsize=(10, 7))
plt.xlabel('$x$')
plt.ylabel('pdf')
sns.scatterplot(s, [0.0] * len(s), color='navy', s=40, marker=2, label='Series data')
for i, (m, v) in enumerate(zip(means, vars)):
sns.lineplot(bins, pdf(bins, m, v), color=colors[i], label=f'Cluster {i + 1}')
plt.legend()
plt.plot()
plt.show()
pass
这是我的结果。
您能分享一下您认为错误的原因吗?可视化可能会有所帮助,但即使没有,似乎也有一些解释missing@dia输出是两个向量,其中一个表示表示值,另一个表示方差
值。使我对实现产生怀疑的模糊点是,对于大多数输出,它返回0.00000000e+000