R：尽管nstart和iter.max的设置不同，k-means得到的簇仍然相同_R_Cluster Analysis_K Means

R：尽管nstart和iter.max的设置不同，k-means得到的簇仍然相同

R：除了nstart和iter.max的不同设置外，k-means中的簇相同,r,cluster-analysis,k-means,R,Cluster Analysis,K Means,尽管我对kmeans（）的iter.max和nstart使用了（非常）不同的设置，但为什么得到相同的集群 set.seed（1） ff_1随机选择kmeans的初始质心（1）您在所有情况下都选择了相同的随机种子=1（这将强制为所有情况选择完全相同的质心），并且（2）这些簇是非常可分离的，您在这些情况下得到的结果是相同的（在第一次迭代后，收敛速度非常快）下图显示了它 library(grid) library(gridExtra) library(ggplot2) set.seed(1

尽管我为 kmeans() 的 iter.max 和 nstart 使用了（非常）不同的设置，为什么得到的聚类结果仍然相同？

set.seed(1)
ff_1 ……（提问者的代码在此处被截断）

kmeans 的初始质心是随机选择的。由于：
（1）您在所有情况下都使用了相同的随机种子 1（这会使每种情况都选到完全相同的初始质心），并且
（2）这些簇彼此分离得非常清楚，
所以这几种设置得到的结果是相同的（收敛非常快，在第一次迭代之后就已收敛）。
下图展示了这一点：
library(grid)
library(gridExtra)
library(ggplot2)

# Scatter plot of `faithful` coloured by the cluster assignment of `fit`,
# with the fitted cluster centres drawn on top as large black asterisks.
plot_clusters <- function(fit, legend_title, title = "kmeans seed 1\n") {
  centres <- as.data.frame(fit$centers)
  ggplot(faithful, aes(eruptions, waiting, colour = as.factor(fit$cluster))) +
    geom_point() +
    geom_point(
      data = centres,
      aes(eruptions, waiting),
      colour = "black", shape = "*", size = 15
    ) +
    labs(title = title, colour = legend_title)
}

# Three k-means fits (k = 2) that differ only in iter.max / nstart.
# The RNG is reset to seed 1 before every fit, so each call starts from
# the same RNG state.
set.seed(1)
ff_1 <- kmeans(faithful, centers = 2, nstart = 1, iter.max = 1)

set.seed(1)
ff_2 <- kmeans(faithful, centers = 2, nstart = 1, iter.max = 2)

set.seed(1)
ff_300 <- kmeans(faithful, centers = 2, nstart = 300, iter.max = 300)

# Stack the three panels in one figure for visual comparison.
grid.arrange(
  plot_clusters(ff_1, "ff1 cluster\n"),
  plot_clusters(ff_2, "ff2 cluster\n"),
  plot_clusters(ff_300, "ff300 cluster\n")
)

# Compare the complete fitted objects; the original post reports TRUE for both.
identical(ff_1, ff_2) # TRUE
identical(ff_1, ff_300) # TRUE

您写道算法在第二次迭代后收敛，但那样的话为什么 identical(ff_1, ff_2) 为 TRUE？初始聚类中心是在第一次迭代之前还是在第一次迭代期间选择的？如果它们是在第一次迭代期间（随机）选择的，我完全不明白 identical(ff_1, ff_300) 怎么可能为 TRUE。
@joe 它们是相同的，因为随机选择的初始簇质心（在第一次迭代之前选择）对于 ff_1 和 ff_2 都是一样的，因为您使用了相同的种子。
我的观点是，这里的收敛显然发生在第一次迭代之后（而不是像您写的那样在第二次迭代之后），否则 ff_1 和 ff_2 就不会相同了。
@joe 没错，这是正确的，我已经编辑了我的回答，谢谢你指出。
library(grid)
library(gridExtra)
library(ggplot2)

# Reset the RNG to `seed`, then run k-means with two clusters on `faithful`.
# Extra arguments (iter.max, nstart) are forwarded to kmeans().
seeded_kmeans <- function(seed, ...) {
  set.seed(seed)
  kmeans(faithful, 2, ...)
}

# Same seed for all three fits; only iter.max and nstart vary.
ff_1 <- seeded_kmeans(1, iter.max = 1, nstart = 1)
ff_2 <- seeded_kmeans(1, iter.max = 2, nstart = 1)
ff_300 <- seeded_kmeans(1, iter.max = 300, nstart = 300)

# Build one panel per fit: points coloured by cluster, centres as big black
# asterisks. Every panel carries the same title because every fit used seed 1.
panels <- Map(
  function(fit, legend_title) {
    base_plot <- ggplot(
      faithful,
      aes(eruptions, waiting, colour = as.factor(fit$cluster))
    )
    centre_layer <- geom_point(
      data = as.data.frame(fit$centers),
      aes(eruptions, waiting),
      colour = "black", shape = "*", size = 15
    )
    base_plot + geom_point() + centre_layer +
      labs(title = "kmeans seed 1\n", colour = legend_title)
  },
  list(ff_1, ff_2, ff_300),
  c("ff1 cluster\n", "ff2 cluster\n", "ff300 cluster\n")
)

# unname() so the list elements are passed to grid.arrange() positionally.
do.call(grid.arrange, unname(panels))

# Compare the complete fitted objects; the original post reports TRUE for both.
identical(ff_1, ff_2) # TRUE
identical(ff_1, ff_300) # TRUE

# One panel: `faithful` coloured by the clusters of `fit`, centres overlaid as
# large black asterisks. The title records the seed used for that fit and the
# legend title is built from `tag`.
seed_panel <- function(fit, seed, tag) {
  centres <- as.data.frame(fit$centers)
  cluster_points <- ggplot(
    faithful,
    aes(eruptions, waiting, colour = as.factor(fit$cluster))
  ) +
    geom_point()
  cluster_points +
    geom_point(
      data = centres,
      aes(eruptions, waiting),
      colour = "black", shape = "*", size = 15
    ) +
    labs(
      title = paste0("kmeans seed ", seed, "\n"),
      colour = paste0(tag, " cluster\n")
    )
}

# This time every fit gets its own seed (1, 12, 123) in addition to its own
# iter.max / nstart settings.
set.seed(1)
ff_1 <- kmeans(faithful, centers = 2, nstart = 1, iter.max = 1)

set.seed(12)
ff_2 <- kmeans(faithful, centers = 2, nstart = 1, iter.max = 2)

set.seed(123)
ff_300 <- kmeans(faithful, centers = 2, nstart = 300, iter.max = 300)

grid.arrange(
  seed_panel(ff_1, 1, "ff1"),
  seed_panel(ff_2, 12, "ff2"),
  seed_panel(ff_300, 123, "ff300")
)

# Compare the complete fitted objects; the original post reports FALSE for both.
identical(ff_1, ff_2) # FALSE
identical(ff_1, ff_300) # FALSE