Python 基于热图的Kmeans聚类_Python_Pandas_Numpy_Matplotlib_Heatmap

Python 基于热图的Kmeans聚类

python pandas numpy matplotlib

Python 基于热图的Kmeans聚类,python,pandas,numpy,matplotlib,heatmap,Python,Pandas,Numpy,Matplotlib,Heatmap,只是想知道，对于这个数据集，您将如何使用k均值聚类？我被限制使用任何软件包或模块。此数据集正在为此数据集进行培训我已经尝试解决这个问题有一段时间了，尝试了几件事情，但似乎都没有成功。没有代码是必需的，但如果有人能给我一个一般的思考过程来解决这个问题，我将非常感激这是我目前的思维方式。我想把他的数据放到热图上我目前的想法是首先随机选择中心。然后为到每个中心的距离的每个点创建列表列表。找到每个中心每个点的最小距离索引。创建一个与数据集大小相同的数据框，并用点最接近的中心的索引填充

只是想知道，对于这个数据集，您将如何使用k均值聚类？我被限制使用任何软件包或模块。

此数据集正在为此数据集进行培训

我已经尝试解决这个问题有一段时间了，尝试了几件事情，但似乎都没有成功。没有代码是必需的，但如果有人能给我一个一般的思考过程来解决这个问题，我将非常感激

这是我目前的思维方式。我想把他的数据放到热图上我目前的想法是首先随机选择中心。然后为到每个中心的距离的每个点创建列表列表。找到每个中心每个点的最小距离索引。创建一个与数据集大小相同的数据框，并用点最接近的中心的索引填充每个元素的每个索引。通过取具有相同中心索引的点的平均值重新计算中心重复此过程多次，直到索引数据帧不变。创建一个新的数据帧，并在帧中添加具有相同中心点的点。然后创建热图

但这似乎并不奏效。只是想知道，我是在正确的轨道上还是完全偏离了轨道，如果我在正确的轨道上，我需要更改哪些部分来解决问题。如果没有，请你给我指出正确的方向

下面是要查看的代码

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline

def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n

def chooseCenter(data, centers):
    length = data.shape
    cent = []
    while len(cent) < centers :
        x = random.randrange(0,length[0])
        y = random.randrange(0,length[1])
        if data.iloc[x][y] not in cent:
            d = truncate(data.iloc[x][y],2)
            cent.append(d)
    return cent


def distance(val, center):
    return math.sqrt((val- center)**2)


def getDistances(centers, data):
    length = data.shape
    dist = []
    for i in range(length[0]):
        for j in range(length[1]):
            y = []
            for k in range(len(centers)):
                val = distance(data.iloc[i][j], centers[k]) 
                y.append(truncate(val,3))
            dist.append(y)
    return dist

def findClosest(data, dist):
    close = data.copy()
    length = close.shape
    indexes = []
    for i in range(len(dist)):
        pt = min(dist[i])
        idx = dist[i].index(pt)
        indexes.append(idx)
    #print(indexes)
    length = data.shape
    n = np.array(indexes)
    n = pd.DataFrame(np.reshape(n, (length[0],length[1])))
    #reshape this data frame into the same shape as the data
    #keep running the find closest until there is no change
    #try heatmap on this?
    #this should cluster it, but to make sure test it
    #might need to do some tweaking to this

    return n
#    for i in range(length[0]):
#        for j in range(length[1]):
#            print('dist[i]', dist[j])
#            pt = min(dist[j])
#            print(pt)
#            idx = dist[j].index(pt)
#            close.iloc[i][j] = int(idx)
    #return close

def computeNewCenter(data, close):
    d = dict()
    for i in range(len(close)):
        for j in range(len(close[0])):
            d[close.iloc[i][j]] = []

    for i in range(len(data)):
        for j in range(len(data[0])):
            if close.iloc[i][j] in d:
                d[close.iloc[i][j]].append(data.iloc[i][j])
    newCenters = []

    for key, value in d.items():
        m = np.mean(value)
        newCenters.append(truncate(m, 3))

    return newCenters
#    lst = [[] * numcenters]
#    for i in range(len(close)):
#        for j in range(len(close[0])):
#            if close.iloc[i][j]


def main():
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv',  header=None))
    data = data.T
    #print(data)
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]
#    print(df)
#    print(df.iloc[1][9])
#    print(df)
#    print(df.iloc[0][1])
#    heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
#    plt.colorbar(heatmap)
    c = chooseCenter(df, 3)
    print(c)
    #print(len(c))
    dist = getDistances(c, df)
    #print(dist)
    y = findClosest(df, dist)
#    q = []
#    for i in range(len(c)):
#        q.append([])
#    #print(q)
    j = computeNewCenter(df, y)
    #print(j)
    length = df.shape
    oldFrame = pd.DataFrame(np.ndarray((length[0],length[1])))
    oldFrame = oldFrame.fillna(0)
    ct=0
    while y.equals(oldFrame) == False:
        ct+=1
        oldFrame = y.copy()
        c = computeNewCenter(df, oldFrame)
        #print(c)
        dist = getDistances(c, df)
        #print(dist)
        y = findClosest(df, dist)
        #print(y)
    #plt.pcolor(df, cmap=plt.cm.bwr)

    l = []
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 1:
                l.append(df.iloc[i][j])

    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 2:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 0:
                 l.append(df.iloc[i][j])


    l = np.ndarray((length[0],length[1]))
    l = pd.DataFrame(l)
    print(l)
    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)    
#    print(y)
#    print(c)
#    print(ct)
    #plt.pcolor(y, cmap=plt.cm.bwr)


if __name__ == '__main__':
    main()

导入matplotlib.pyplot作为plt
将numpy作为np导入
作为pd进口熊猫
输入数学
随机输入
#%matplotlib内联
def截断（f，n）：
返回数学楼层（f*10**n）/10**n
def chooseCenter（数据中心）：
长度=data.shape
分=[]
而len（cent）<中心：
x=random.randrange（0，长度[0]）
y=random.randrange（0，长度[1]）
如果data.iloc[x][y]的单位不是美分：
d=截断（data.iloc[x][y]，2）
分.追加（d）
回款
def距离（val，中心）：
返回数学sqrt（（值-中心）**2）
def getDistances（中心、数据）：
长度=data.shape
dist=[]
对于范围内的i（长度[0]）：
对于范围内的j（长度[1]）：
y=[]
对于范围内的k（透镜（中心））：
val=距离（data.iloc[i][j]，中心[k]）
y、 追加（截断（val，3））
附加区（y）
返回区
def findClosest（数据，区域）：
close=data.copy（）
长度=close.shape
索引=[]
对于范围内的i（len（dist））：
pt=min（距离[i]）
idx=dist[i]。索引（pt）
index.append（idx）
#打印（索引）
长度=data.shape
n=np.数组（索引）
n=pd.DataFrame（np.reformate（n，（长度[0]，长度[1]））
#将此数据帧重塑为与数据帧相同的形状
#继续运行“查找最近的”，直到没有更改为止
#试试这个热图？
#这应该对其进行集群，但要确保对其进行测试
#可能需要对此进行一些调整
返回n
#对于范围内的i（长度[0]）：
#对于范围内的j（长度[1]）：
#打印（'dist[i]'，dist[j]）
#pt=min（距离[j]）
#打印（pt）
#idx=dist[j]。索引（pt）
#close.iloc[i][j]=int（idx）
#回程结束
def计算中心（数据，关闭）：
d=dict（）
对于范围内的i（len（close））：
对于范围内的j（len（接近[0]）：
d[close.iloc[i][j]=[]
对于范围内的i（len（数据））：
对于范围内的j（len（数据[0]）：
如果在d中关闭.iloc[i][j]：
d[close.iloc[i][j].append（data.iloc[i][j]）
新中心=[]
对于键，d.items（）中的值：
m=np.平均值（值）
追加（截断（m，3））
返回新中心
#lst=[[]*numcenters]
#对于范围内的i（len（close））：
#对于范围内的j（len（接近[0]）：
#如果关闭。iloc[i][j]
def main（）：
数据=np.数组（pd.读取\u csv（'https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv“，页眉=无”）
data=data.T
#打印（数据）
df=pd.DataFrame（数据[1:]，列=data[0]，dtype=float）.T
df=df.iloc[：-1]
#打印（df）
#打印（df.iloc[1][9]）
#打印（df）
#打印（df.iloc[0][1]）
#热图=plt.pcolor（df，cmap=plt.cm.bwr）
#plt.colorbar（热图）
c=选择中心（df，3）
印刷品（c）
#印刷品（透镜（c））
距离=距离（c，df）
#打印（区）
y=findClosest（df，dist）
#q=[]
#对于范围内的i（len（c））：
#q.append（[]）
##打印（q）
j=计算中心（df，y）
#印刷品（j）
长度=df.shape
oldFrame=pd.DataFrame（np.ndarray（（长度[0]，长度[1]））
oldFrame=oldFrame.fillna（0）
ct=0
而y.equals（oldFrame）==False：
ct+=1
oldFrame=y.copy（）
c=计算中心（df，旧帧）
#印刷品（c）
距离=距离（c，df）
#打印（区）
y=findClosest（df，dist）
#打印（y）
#plt.pcolor（df，cmap=plt.cm.bwr）
l=[]
对于范围内的i（len（y））：
对于范围内的j（len（y[0]）：
如果y.iloc[i][j]==1：
l、 追加（df.iloc[i][j]）
对于范围内的i（len（y））：
对于范围内的j（len（y[0]）：
如果y.iloc[i][j]==2：
l、 追加（df.iloc[i][j]）
对于范围内的i（len（y））：
对于范围内的j（len（y[0]）：
如果y.iloc[i][j]==0：
l、 追加（df.iloc[i][j]）
l=np.ndarray（（长度[0]，长度[1]））
l=pd.数据帧（l）
印刷品（l）
hm=plt.pcolor（l，cmap=plt.cm.bwr）
plt.色条（hm）
#打印（y）
#印刷品（c）
#打印（ct）
#plt.pcolor（y，cmap=plt.cm.bwr）
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu'：
main（）

感谢您阅读

您是否在使用

sklearn

或

tensorflow

时受到限制？您的数据集只有6个数据点？您希望找到多少个群集？正如@DanielF所提到的，如果您只想

kmeans

，请使用scikit-learn、tensorflow、scipy或任何其他库，不要再编写已经存在的代码