Python Scipy.cluster.hierarchy.fclusterdata+；距离测量_Python_Cluster Analysis_Scipy

Python scipy.cluster.hierarchy.fclusterdata 与距离度量

python

Python Scipy.cluster.hierarchy.fclusterdata+；距离测量,python,cluster-analysis,scipy,Python,Cluster Analysis,Scipy,1）我正在使用scipy的hcluster模块所以我控制的变量是阈值变量。我如何知道每个阈值的性能？i、 e.在Kmeans中，该性能将是指向其质心的所有点的总和。当然，这必须进行调整，因为更多的簇通常意味着更少的距离对于这一点，我可以使用hcluster进行观察吗 2）我意识到fclusterdata有很多可用的指标。我基于关键术语的tf-idf对文本文档进行聚类。问题是，一些文档比其他文档长，我认为余弦是“规范化”这个长度问题的一个好方法，因为文档越长，如果内容一致，它在n维字段中

1）我正在使用scipy的hcluster模块

所以我能控制的变量是阈值。我如何知道每个阈值下的聚类效果？即，在K-means中，该指标是所有点到其质心的距离之和。当然，这需要进行调整，因为簇越多，距离通常越小。

对此，hcluster是否有类似的指标可供观察？

2）我知道fclusterdata有很多可用的距离度量。我基于关键术语的tf-idf对文本文档进行聚类。问题是，有些文档比其他文档长；我认为余弦距离是“规范化”这一长度问题的好方法，因为只要内容一致，无论文档多长，它在n维空间中的“方向”都应该保持不变。还有其他可以推荐的方法吗？我该如何评估？

Thx

可以像K-均值那样，对簇中的每个点x计算平均距离 |x − 簇中心|。下面的代码用暴力方式实现了这一点。（我原以为scipy.cluster或scipy.spatial.distance中会内置这样的函数，但我也没有找到。）

关于你的问题2，我无法回答。欢迎提供有关层次聚类的优秀教程链接。

#!/usr/bin/env python
""" cluster cities: pdist linkage fcluster plot
    util: clusters() avdist()
"""

from __future__ import division
import sys
import numpy as np
import scipy.cluster.hierarchy as hier  # $scipy/cluster/hierarchy.py
import scipy.spatial.distance as dist
import pylab as pl
from citiesin import citiesin  # 1000 US cities

__date__ = "27may 2010 denis"

def clusterlists(T):
    """ Group observation indices by their flat-cluster label.

        T = hier.fcluster( Z, t ) e.g. [a b a b a c]
        -> [ [0 2 4] [1 3] [5] ] sorted by len, biggest cluster first;
        clusters of equal size keep ascending label order.

        Only labels that actually occur in T produce a list, so the result
        never contains an empty list and never loses a real cluster,
        whether or not the labels start at 1 or are contiguous.
        An empty T gives [].
    """
    groups = {}
    for j, c in enumerate(T):
        groups.setdefault( c, [] ).append( j )
    # start from ascending label order so the stable sort below breaks
    # ties between equally sized clusters by label
    clists = [ groups[label] for label in sorted( groups )]
    clists.sort( key=len, reverse=True )
    return clists

def avdist( X, to=None ):
    """ Mean distance from the row vectors of X to a reference point.

        X:  sequence of vectors, one per row
        to: the reference point; None means the column-wise mean of X
        Distances come from dist.cdist with its default metric.
    """
    centre = np.mean( X, axis=0 ) if to is None else to
    distances = dist.cdist( X, [centre] )  # one column: each row of X vs centre
    return np.mean( distances )

#...............................................................................
# Script parameters; each can be overridden from the command line, see below.
Ndata = 100  # number of cities requested from citiesin()
method = "average"  # linkage method passed to hier.linkage
t = 0  # threshold passed to hier.fcluster; 0 means "use Ndata // 4"
crit = "maxclust"  # criterion passed to hier.fcluster
    # 'maxclust': Finds a minimum threshold `r` so that the cophenetic distance
    # between any two original observations in the same flat cluster
    # is no more than `r` and no more than `t` flat clusters are formed.
    # but t affects cluster sizes only weakly ?
    # t 25: [10, 9, 8, 7, 6
    # t 20: [12, 11, 10, 9, 7
plot = 0  # nonzero: plot the clusters; >= 2: also save the figure as a png
seed = 1  # seed for np.random

# Command-line arguments are executed as Python statements to override the
# parameters above, e.g.  Ndata=200 t=30 plot=1
# NOTE(security): this runs arbitrary code from argv -- only use it with
# trusted input.  exec( ... ) in call form works on both Python 2 and 3.
exec( "\n".join( sys.argv[1:] ))  # Ndata= t= ...
np.random.seed(seed)
np.set_printoptions( 2, threshold=100, edgeitems=10, suppress=True )  # .2f
me = __file__.split('/') [-1]

    # biggest US cities --
cities = np.array( citiesin( n=Ndata )[0] )  # N,2

if t == 0:  t = Ndata // 4

#...............................................................................
# print( single_string ) gives identical output on Python 2 and Python 3
print( "# %s  Ndata=%d  t=%d  method=%s  crit=%s " % (me, Ndata, t, method, crit) )

Y = dist.pdist( cities )  # n*(n-1) / 2
Z = hier.linkage( Y, method )  # n-1
T = hier.fcluster( Z, t, criterion=crit )  # n

clusters = clusterlists(T)
# a list, not map(): on Python 3 map() would print as "<map object ...>"
print( "cluster sizes: %s" % [ len(c) for c in clusters ] )
print( "# average distance to centre in the biggest clusters:" )
for c in clusters:
    # clusters are sorted biggest first: stop once they fall below
    # a third of the biggest cluster's size
    if len(c) < len(clusters[0]) // 3:  break
    cit = cities[c].T  # 2,n: row 0 and row 1 are the two coordinates
    print( "%.2g %s" % (avdist(cit.T), cit) )
    if plot:
        pl.plot( cit[0], cit[1] )

if plot:
    pl.title( "scipy.cluster.hierarchy of %d US cities, %s t=%d" % (
        Ndata, crit, t) )
    pl.grid(False)
    if plot >= 2:
        pl.savefig( "cities-%d-%d.png" % (Ndata, t), dpi=80 )
    pl.show()

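#!/usr/bin/env python
""" cluster cities: pdist linkage fcluster plot
    util: clusters() avdist()
"""

from __future__ import division
import sys
import numpy as np
import scipy.cluster.hierarchy as hier  # $scipy/cluster/hierarchy.py
import scipy.spatial.distance as dist
import pylab as pl
from citiesin import citiesin  # 1000 US cities

__date__ = "27may 2010 denis"

def clusterlists(T):
    """ T = hier.fcluster( Z, t ) e.g. [a b a b a c]
        -> [ [0 2 4] [1 3] [5] ] sorted by len
    """
    clists = [ [] for j in range( max(T) + 1 )]
    for j, c in enumerate(T):
        clists[c].append( j )
    clists.sort( key=len, reverse=True )
    return clists[:-1]  # clip the []

def avdist( X, to=None ):
    """ av dist X vecs to "to", None: mean(X) """
    if to is None:
        to = np.mean( X, axis=0 )
    return np.mean( dist.cdist( X, [to] ))

#...............................................................................
Ndata = 100
method = "average"
t = 0
crit = "maxclust"
    # 'maxclust': Finds a minimum threshold `r` so that the cophenetic distance
    # between any two original observations in the same flat cluster
    # is no more than `r` and no more than `t` flat clusters are formed.
    # but t affects cluster sizes only weakly ?
    # t 25: [10, 9, 8, 7, 6
    # t 20: [12, 11, 10, 9, 7
plot = 0
seed = 1

exec "\n".join( sys.argv[1:] )  # Ndata= t= ...
np.random.seed(seed)
np.set_printoptions( 2, threshold=100, edgeitems=10, suppress=True )  # .2f
me = __file__.split('/') [-1]

    # biggest US cities --
cities = np.array( citiesin( n=Ndata )[0] )  # N,2

if t == 0:  t = Ndata // 4

#...............................................................................
print "# %s  Ndata=%d  t=%d  method=%s  crit=%s " % (me, Ndata, t, method, crit)

Y = dist.pdist( cities )  # n*(n-1) / 2
Z = hier.linkage( Y, method )  # n-1
T = hier.fcluster( Z, t, criterion=crit )  # n

clusters = clusterlists(T)
print "cluster sizes:", map( len, clusters )
print "# average distance to centre in the biggest clusters:"
for c in clusters:
    if len(c) < len(clusters[0]) // 3:  break
    cit = cities[c].T
    print "%.2g %s" % (avdist(cit.T), cit)
    if plot:
        pl.plot( cit[0], cit[1] )

if plot:
    pl.title( "scipy.cluster.hierarchy of %d US cities, %s t=%d" % (
        Ndata, crit, t) )
    pl.grid(False)
    if plot >= 2:
        pl.savefig( "cities-%d-%d.png" % (Ndata, t), dpi=80 )
    pl.show()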