Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/oop/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 自适应DBSCAN实现_Python_Machine Learning_Cluster Analysis_Dbscan - Fatal编程技术网

Python 自适应DBSCAN实现

Python 自适应DBSCAN实现,python,machine-learning,cluster-analysis,dbscan,Python,Machine Learning,Cluster Analysis,Dbscan,我正在用python进行DBSCAN集群。我想通过自我计算eps和Minpts参数,实现一种自适应的方法来返回集群数量。下面是我的代码 import math import copy import numpy as np import pandas as pd from sklearn.cluster import DBSCAN def loadDataSet(fileName, splitChar='\t'): dataSet = [] with open(fileNam

我正在用 Python 进行 DBSCAN 聚类。我想通过自动计算 eps 和 MinPts 参数,实现一种自适应的方法来返回聚类(簇)的数量。下面是我的代码:

import math
import copy
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN


def loadDataSet(fileName, splitChar='\t'):
    """Load a delimited text file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file to read.
        splitChar: Field separator used on every line (tab by default).

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        floats parsed from that line's fields.

    Raises:
        ValueError: If a field on a non-blank line cannot be parsed as a
            float.
    """
    dataSet = []
    with open(fileName) as fr:
        # Iterate the file object directly so the whole file is never held
        # in memory at once.
        for line in fr:
            stripped = line.strip()
            # A blank line (typically a trailing newline at end of file)
            # carries no data; without this guard float('') raises.
            if not stripped:
                continue
            dataSet.append([float(field) for field in stripped.split(splitChar)])
    return dataSet


def dist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    All coordinates are used, so the function works for points of any
    dimensionality rather than only the first two components.

    Args:
        a: Sequence of numeric coordinates.
        b: Sequence of numeric coordinates.

    Returns:
        The Euclidean distance as a float. If the sequences differ in
        length, the extra trailing coordinates of the longer one are ignored.
    """
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))


def returnDk(matrix, k):
    """Return column ``k`` of ``matrix`` as a list.

    Args:
        matrix: Sequence of indexable rows.
        k: Index of the element to take from every row.

    Returns:
        A list holding ``row[k]`` for each row, in row order.
    """
    return [row[k] for row in matrix]


def returnDkAverage(Dk):
    """Return the arithmetic mean of the values in ``Dk``.

    Args:
        Dk: Non-empty sequence of numbers.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If ``Dk`` is empty.
    """
    total = 0
    for value in Dk:
        total = total + value
    return total / len(Dk)


def CalculateDistMatrix(dataset):
    """Build the full pairwise distance matrix of ``dataset``.

    Args:
        dataset: Indexable collection of points accepted by ``dist``.

    Returns:
        A square list of lists in which entry ``[r][c]`` is
        ``dist(dataset[r], dataset[c])``.
    """
    size = len(dataset)
    return [
        [dist(dataset[r], dataset[c]) for c in range(size)]
        for r in range(size)
    ]


def returnEpsCandidate(dataSet):
    """Compute one eps candidate for every neighbour rank ``k``.

    Each row of the pairwise distance matrix is sorted in ascending order;
    for every ``k`` from 1 to ``len(dataSet) - 1`` the candidate is the mean,
    taken over all rows, of the value at position ``k`` of the sorted row.

    Args:
        dataSet: Collection of points accepted by ``CalculateDistMatrix``.

    Returns:
        A list of ``len(dataSet) - 1`` averaged distances, ordered by ``k``.
    """
    # sorted() builds new row lists, so the matrix returned by
    # CalculateDistMatrix is left unmodified.
    sorted_rows = [sorted(row) for row in CalculateDistMatrix(dataSet)]
    return [
        returnDkAverage(returnDk(sorted_rows, k))
        for k in range(1, len(dataSet))
    ]


def returnMinptsCandidate(DistMatrix, EpsCandidate):
    """Compute the MinPts candidate that pairs with each eps candidate.

    For every eps value, all entries of ``DistMatrix`` that are less than or
    equal to eps are counted and the count is divided by the number of
    points (the number of rows of ``DistMatrix``).

    Args:
        DistMatrix: Square pairwise distance matrix, one row per point.
        EpsCandidate: Sequence of eps values to evaluate.

    Returns:
        A list of floats, one per entry of ``EpsCandidate`` and in the same
        order.

    Raises:
        ZeroDivisionError: If ``DistMatrix`` is empty while ``EpsCandidate``
            is not.
    """
    # Take the point count from the matrix itself rather than from a
    # module-level variable, so the function works with any caller.
    n_points = len(DistMatrix)
    MinptsCandidate = []
    for eps in EpsCandidate:
        within_eps = sum(
            1 for row in DistMatrix for distance in row if distance <= eps
        )
        MinptsCandidate.append(within_eps / n_points)
    return MinptsCandidate


def returnClusterNumberList(dataset, EpsCandidate, MinptsCandidate):
    """Run DBSCAN for each (eps, MinPts) pair and count the clusters found.

    Args:
        dataset: Array-like of points, converted with ``np.array``.
        EpsCandidate: Sequence of eps values.
        MinptsCandidate: Sequence of MinPts values paired position-wise with
            ``EpsCandidate``; fractional values are accepted.

    Returns:
        A list with the number of clusters (noise excluded) found for each
        parameter pair, in input order.
    """
    np_dataset = np.array(dataset)
    ClusterNumberList = []
    for eps, min_pts in zip(EpsCandidate, MinptsCandidate):
        # Rounding up keeps the same core-point test as a fractional
        # threshold (count >= 3.4 is the same as count >= 4) while giving
        # DBSCAN the integer it expects; recent scikit-learn releases
        # reject a float min_samples.
        min_samples = max(1, math.ceil(min_pts))
        clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(np_dataset)
        # labels_ are 0-based cluster ids with -1 marking noise, so the
        # cluster count is the number of distinct non-noise labels,
        # not the largest label.
        found_labels = set(clustering.labels_.tolist())
        found_labels.discard(-1)
        ClusterNumberList.append(len(found_labels))
    return ClusterNumberList

# Script entry point: load a feature table, derive candidate (eps, MinPts)
# pairs from it, and print how many clusters DBSCAN reports for each pair.
# Every name assigned below is a module-level global.
if __name__ == '__main__':
    # NOTE(review): hard-coded absolute path; judging by the file name this
    # is presumably a CSV of MFCC features -- confirm before reuse.
    data = pd.read_csv('/Users/Desktop/Mic/recorder_test1/New folder/MFCCresultsforclustering/MFCCresultsforclustering.csv')
    # Keep the first 13 columns as a NumPy array, one row per CSV record.
    dataSet = data.iloc[:,0:13].values
    EpsCandidate = returnEpsCandidate(dataSet)
    # returnEpsCandidate builds its own distance matrix internally and does
    # not return it, so the matrix is computed a second time here.
    DistMatrix = CalculateDistMatrix(dataSet)
    MinptsCandidate = returnMinptsCandidate(DistMatrix,EpsCandidate)
    ClusterNumberList = returnClusterNumberList(dataSet,EpsCandidate,MinptsCandidate)
    print(EpsCandidate)
    print(MinptsCandidate)
    print('cluster number list is')
    print(ClusterNumberList)  
导入数学
导入副本
将numpy作为np导入
作为pd进口熊猫
从sklearn.cluster导入DBSCAN
def loadDataSet(文件名,splitChar='\t'):
数据集=[]
打开(文件名)为fr时:
对于fr.readlines()中的行:
curline=line.strip().split(splitChar)
fltline=列表(贴图(浮动、卷曲线))
dataSet.append(fltline)
返回数据集
def区(a、b):
返回math.sqrt(math.pow(a[0]-b[0,2)+math.pow(a[1]-b[1,2))
def returnDk(矩阵,k):
Dk=[]
对于范围内的i(len(矩阵)):
Dk.追加(矩阵[i][k])
返回Dk
def返回Dk平均值(Dk):
总和=0
对于范围内的i(len(Dk)):
总和=总和+Dk[i]
返回金额/长度(丹麦克朗)
def CalculatedStMatrix(数据集):
DistMatrix=[[0表示范围内的j(len(数据集))]表示范围内的i(len(数据集))]
对于范围内的i(len(数据集)):
对于范围内的j(len(数据集)):
DistMatrix[i][j]=dist(数据集[i],数据集[j])
返回距离矩阵
def RETURNEPS候选者(数据集):
DistMatrix=CalculateDistMatrix(数据集)
tmp_矩阵=copy.deepcopy(DistMatrix)
对于范围内的i(len(tmp_矩阵)):
tmp_矩阵[i].sort()
EpsCandidate=[]
对于范围(1,len(数据集))中的k:
Dk=返回Dk(tmp_矩阵,k)
Dk平均值=返回Dk平均值(Dk)
EpsCandidate.append(DkAverage)
返回候选人
def returnMinptsCandidate(DistMatrix,EPS候选者):
MinptsCandidate=[]
对于范围内的k(len(eps)):
tmp_eps=eps候选[k]
tmp_计数=0
对于范围内的i(len(DistMatrix)):
对于范围内的j(len(DistMatrix[i]):

如果 DistMatrix[i][j] <= tmp_eps:(此处之后的代码在页面抓取时被截断,完整代码见上文英文原版)

评论:

- 我建议您改用 HDBSCAN。训练速度更快,效果也不错。
- @法尔胡德特:对于 HDBSCAN,我认为仍然需要设置 epsilon 和 min_samples 的值。我在上面想实现的是自适应地设置这些值,而不需要预先定义。
- HDBSCAN 对这些值的敏感度不如 DBSCAN。API 参考资料中明确指出:“在不同的 epsilon 值上执行 DBSCAN,并整合结果,以找到在 epsilon 上稳定性最佳的聚类。”