K表示从头开始的集群(Python)
这是K均值聚类从头开始的代码 我想导出添加了一列的数据帧,该列是cluster,我使用该行tdf['cluster']=classification将名为cluster的新列添加到我的数据帧中,但它只添加了一个cluster'4' 其他群集为0 1 2 3 这个问题有什么解决办法吗K表示从头开始的集群(Python),python,cluster-analysis,Python,Cluster Analysis,这是K均值聚类从头开始的代码 我想导出添加了一列的数据帧,该列是cluster,我使用该行tdf['cluster']=classification将名为cluster的新列添加到我的数据帧中,但它只添加了一个cluster'4' 其他群集为0 1 2 3 这个问题有什么解决办法吗 import numpy as np import matplotlib.pyplot as plt from matplotlib import style import pandas as pd im
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import time
start_time = time.time()
style.use('ggplot')
class K_Means:
def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
self.k = k
self.tolerance = tolerance
self.max_iterations = max_iterations
def fit(self, data):
self.centroids = {}
#initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
for i in range(self.k):
self.centroids[i] = data[i]
#begin iterations
for i in range(self.max_iterations):
self.classes = {}
for i in range(self.k):
self.classes[i] = []
#find the distance between the point and cluster; choose the nearest centroid
for features in data:
distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
self.classes[classification].append(features)
previous = dict(self.centroids)
#average the cluster datapoints to re-calculate the centroids
for classification in self.classes:
self.centroids[classification] = np.average(self.classes[classification], axis = 0)
isOptimal = True
for centroid in self.centroids:
original_centroid = previous[centroid]
curr = self.centroids[centroid]
if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
isOptimal = False
#break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
if isOptimal:
break
def pred(self, data):
distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
return classification
def main():
#df = pd.read_csv(r"ipl.csv")
df = pd.read_csv(r"CustomerData4.csv",nrows=200)
#df = df[['one', 'two']]
df=df[['MRank','FRank','RRank']]
dataset = df.astype(float).values.tolist()
X = df.values
#df
dataset = df.astype(float).values.tolist()
X = df.values #returns a numpy array
km = K_Means(5)
km.fit(X)
#y_kmeansP=km.fit(X)
# Plotting starts here
colors = 10*["r", "g", "c", "b", "k"]
#prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')
for centroid in km.centroids:
plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")
for classification in km.classes:
color = colors[classification]
for features in km.classes[classification]:
print(classification)
df['Cluster'] = classification
plt.scatter(features[0], features[1], color = color,s = 30)
df.to_csv("clusteringfromscrtach.csv")
#plt.show()
print("--- %s seconds ---" % (time.time() - start_time))
if __name__ == "__main__":
main()
显然,您正在覆盖此列k次
相反,将结果合并到一列中
在更大的数据上对代码进行基准测试
df['Cluster'] = classification