Python 自适应DBSCAN实现_Python_Machine Learning_Cluster Analysis_Dbscan

Python 自适应DBSCAN实现

python machine-learning

Python 自适应DBSCAN实现,python,machine-learning,cluster-analysis,dbscan,Python,Machine Learning,Cluster Analysis,Dbscan,我正在用python进行DBSCAN集群。我想通过自我计算eps和Minpts参数，实现一种自适应的方法来返回集群数量。下面是我的代码 import math import copy import numpy as np import pandas as pd from sklearn.cluster import DBSCAN def loadDataSet(fileName, splitChar='\t'): dataSet = [] with open(fileNam

我正在用 Python 进行 DBSCAN 聚类。我想通过自动计算 eps 和 MinPts 参数，实现一种自适应的方法来返回聚类数量。下面是我的代码：

import math
import copy
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN


def loadDataSet(fileName, splitChar='\t'):
    """Read a delimited text file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file to read.
        splitChar: Field separator used on each line (tab by default).

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        that line's fields converted to ``float``.

    Raises:
        ValueError: If a field of a non-blank line cannot be converted to
            ``float``.
    """
    dataSet = []
    with open(fileName) as fr:
        # Iterate the file object directly so the whole file is never held
        # in memory as a list of lines.
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (typically trailing) line would otherwise reach
                # float('') and abort the whole load with ValueError.
                continue
            dataSet.append([float(field) for field in stripped.split(splitChar)])
    return dataSet


def dist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Every coordinate contributes to the result, so the function works for
    points of any dimensionality (for 2-D points the value is the same as
    the classic ``sqrt(dx**2 + dy**2)``).

    Args:
        a: Sequence of numeric coordinates.
        b: Sequence of numeric coordinates.

    Returns:
        The distance as a ``float``. Coordinates are paired with ``zip``, so
        if the two points differ in length the surplus coordinates of the
        longer one are ignored.
    """
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))


def returnDk(matrix,k):
    """Return column ``k`` of ``matrix`` as a list.

    Args:
        matrix: Sequence of indexable rows.
        k: Index of the entry to take from every row.

    Returns:
        A list holding ``row[k]`` for each row, in row order.
    """
    return [row[k] for row in matrix]


def returnDkAverage(Dk):
    """Return the arithmetic mean of the values in ``Dk``.

    Args:
        Dk: Non-empty sequence of numbers.

    Returns:
        The sum of the values divided by their count.

    Raises:
        ZeroDivisionError: If ``Dk`` is empty.
    """
    # Accumulate left to right, starting from integer zero.
    total = 0
    for value in Dk:
        total += value
    return total / len(Dk)


def CalculateDistMatrix(dataset):
    """Build the full pairwise distance matrix of ``dataset``.

    Args:
        dataset: Sequence of points accepted by ``dist``.

    Returns:
        A list of lists where entry ``[i][j]`` is
        ``dist(dataset[i], dataset[j])``; every ordered pair is evaluated,
        including each point against itself.
    """
    return [[dist(p, q) for q in dataset] for p in dataset]


def returnEpsCandidate(dataSet):
    """Compute the list of candidate eps values for ``dataSet``.

    Each row of the pairwise distance matrix is sorted ascending, so column
    ``k`` of the sorted matrix holds every point's k-th smallest distance.
    The candidate for ``k`` is the average of that column.

    Args:
        dataSet: Sequence of points accepted by ``CalculateDistMatrix``.

    Returns:
        A list with one averaged distance for each ``k`` from 1 up to
        ``len(dataSet) - 1``; column 0 (the smallest entry of every row) is
        skipped.
    """
    sorted_rows = [sorted(row) for row in CalculateDistMatrix(dataSet)]
    return [
        returnDkAverage(returnDk(sorted_rows, k))
        for k in range(1, len(dataSet))
    ]


def returnMinptsCandidate(DistMatrix, EpsCandidate):
    """Compute one MinPts candidate for every candidate eps value.

    For each eps, all entries of ``DistMatrix`` that are ``<= eps`` are
    counted and the count is divided by the number of points (rows of the
    matrix), giving the average number of eps-neighbours per point.

    Args:
        DistMatrix: Square pairwise distance matrix (sequence of rows).
        EpsCandidate: Sequence of eps values to evaluate.

    Returns:
        A list of floats, one per entry of ``EpsCandidate``, in the same
        order.

    Raises:
        ZeroDivisionError: If ``DistMatrix`` is empty while ``EpsCandidate``
            is not.
    """
    # The point count comes from the matrix that was passed in, not from a
    # module-level variable, so the function works wherever it is called.
    n_points = len(DistMatrix)
    MinptsCandidate = []
    for eps in EpsCandidate:
        within_eps = sum(1 for row in DistMatrix for d in row if d <= eps)
        MinptsCandidate.append(within_eps / n_points)
    return MinptsCandidate


def returnClusterNumberList(dataset, EpsCandidate, MinptsCandidate):
    """Run DBSCAN for every (eps, MinPts) pair and count the clusters found.

    Args:
        dataset: Array-like of points; converted with ``np.array``.
        EpsCandidate: Sequence of eps values.
        MinptsCandidate: Sequence of MinPts values paired position by
            position with ``EpsCandidate``; fractional values are allowed.

    Returns:
        A list of ints, one per parameter pair: the number of distinct
        cluster labels DBSCAN assigned, not counting the noise label ``-1``.
    """
    np_dataset = np.array(dataset)
    ClusterNumberList = []
    for eps, min_pts in zip(EpsCandidate, MinptsCandidate):
        # DBSCAN requires an integer min_samples. Rounding up keeps the
        # meaning "a core point has at least min_pts neighbours".
        clustering = DBSCAN(eps=eps, min_samples=math.ceil(min_pts)).fit(np_dataset)
        # Labels are 0..k-1 with -1 marking noise, so max(labels_) would be
        # one less than the cluster count (and -1 when everything is noise).
        found = set(clustering.labels_)
        found.discard(-1)
        ClusterNumberList.append(len(found))
    return ClusterNumberList

if __name__ == '__main__':
    # Load the CSV from a hard-coded local path and keep its first 13
    # columns as the point matrix.
    data = pd.read_csv('/Users/Desktop/Mic/recorder_test1/New folder/MFCCresultsforclustering/MFCCresultsforclustering.csv')
    dataSet = data.iloc[:,0:13].values
    # One candidate eps per k (averaged k-th smallest distance).
    EpsCandidate = returnEpsCandidate(dataSet)
    # returnEpsCandidate builds its own distance matrix internally, so the
    # pairwise distances are computed a second time here.
    DistMatrix = CalculateDistMatrix(dataSet)
    # One candidate MinPts per candidate eps.
    MinptsCandidate = returnMinptsCandidate(DistMatrix,EpsCandidate)
    # Run DBSCAN once per (eps, MinPts) pair.
    ClusterNumberList = returnClusterNumberList(dataSet,EpsCandidate,MinptsCandidate)
    print(EpsCandidate)
    print(MinptsCandidate)
    print('cluster number list is')
    print(ClusterNumberList)

导入数学
导入副本
将numpy作为np导入
作为pd进口熊猫
从sklearn.cluster导入DBSCAN
def loadDataSet（文件名，splitChar='\t'）：
数据集=[]
打开（文件名）为fr时：
对于fr.readlines（）中的行：
curline=line.strip（）.split（splitChar）
fltline=列表（贴图（浮动、卷曲线））
dataSet.append（fltline）
返回数据集
def区（a、b）：
返回math.sqrt（math.pow（a[0]-b[0,2）+math.pow（a[1]-b[1,2））
def returnDk（矩阵，k）：
Dk=[]
对于范围内的i（len（矩阵））：
Dk.追加（矩阵[i][k]）
返回Dk
def返回Dk平均值（Dk）：
总和=0
对于范围内的i（len（Dk））：
总和=总和+Dk[i]
返回金额/长度（丹麦克朗）
def CalculatedStMatrix（数据集）：
DistMatrix=[[0表示范围内的j（len（数据集））]表示范围内的i（len（数据集））]
对于范围内的i（len（数据集））：
对于范围内的j（len（数据集））：
DistMatrix[i][j]=dist（数据集[i]，数据集[j]）
返回距离矩阵
def RETURNEPS候选者（数据集）：
DistMatrix=CalculateDistMatrix（数据集）
tmp_矩阵=copy.deepcopy（DistMatrix）
对于范围内的i（len（tmp_矩阵））：
tmp_矩阵[i].sort（）
EpsCandidate=[]
对于范围（1，len（数据集））中的k：
Dk=返回Dk（tmp_矩阵，k）
Dk平均值=返回Dk平均值（Dk）
EpsCandidate.append（DkAverage）
返回候选人
def returnMinptsCandidate（DistMatrix，EPS候选者）：
MinptsCandidate=[]
对于范围内的k（len（eps））：
tmp_eps=eps候选[k]
tmp_计数=0
对于范围内的i（len（DistMatrix））：
对于范围内的j（len（DistMatrix[i]）：
如果DistMatrix[i][j]……（翻译后的代码在此处被截断）

评论：

我建议您改用 HDBSCAN。训练速度更快，效果也不错。

@法尔胡德特 对于 HDBSCAN，我认为我们仍然需要设置 epsilon 和 MinSample 的值。我在上面试图实现的，是自适应地设置这些值，而不需要预先定义。

HDBSCAN 对这些值的敏感度不如 DBSCAN。API 参考资料中明确指出：“在不同的 epsilon 值上执行 DBSCAN，并整合结果，以找到在 epsilon 上稳定性最佳的聚类。”