R 两种不同聚类方法的编码

R 两种不同聚类方法的编码,r,variable-assignment,R,Variable Assignment,我使用了两种不同的聚类方法来生成两个聚类结果,每个聚类方法包含10个不同的组。但是,它们的编码不同。下面的示例显示了聚类结果: set.seed(1) Df <- data.frame(Var1 = sample(1:6, 100, replace =T), Var2 = sample(1:6,100, replace =T)) table(Df) set.seed(1) Df这可能有点像散弹枪,因为我不知道真实数据中有多少集群。我在这里尝试所有可能的组合: df <- dat

我使用了两种不同的聚类方法来生成两个聚类结果,每个聚类方法包含10个不同的组。但是,它们的编码不同。下面的示例显示了聚类结果:

set.seed(1)

Df <- data.frame(Var1 = sample(1:6, 100, replace =T), Var2 = sample(1:6,100, replace =T))

table(Df)
set.seed(1)

Df这可能有点像散弹枪,因为我不知道真实数据中有多少集群。我在这里尝试所有可能的组合:

df <- data.frame(Cluster1 = c("A","A", "B", "B", "C","C", "C"), 
                 Cluster2 = c("1", "2", "3", "3", "2","1","3"))

require(gtools)
comb <- permutations(n = 3, r = 3, v = 1:3)

#try every combination and count the matches
nmatch <- apply(comb,1,function(x) sum(LETTERS[match(df$Cluster2,x)] == df$Cluster1))

#pick the best performing translation
best <- comb[which.max(nmatch),]
# generate translation table
data.frame(Cluster2 = 1:3, Cluster2new = LETTERS[best])
您的新示例数据:

set.seed(314)
df <- data.frame(Cluster1 = sample(LETTERS[1:6], 100, replace =T), Cluster2 = sample(1:6,100, replace =T))

require(gtools)
comb <- permutations(n = 6, r = 6, v = 1:6)

#try every combination and count the matches
nmatch <- apply(comb,1,function(x) sum(LETTERS[match(df$Cluster2,x)] == df$Cluster1))

#pick the best performing translation
best <- comb[which.max(nmatch),]
# generate translation table
data.frame(Cluster2 = 1:3, Cluster2new = LETTERS[best])

计算排列似乎是限制因素。因此,我有另一种解决方案,随机抽取可能性样本,并计算匹配百分比。这种方法速度要快得多,但可能不包括问题的最佳解决方案

set.seed(314)

c = 10000
n = 10
tries = 1000

df <- data.frame(Cluster1 = sample(LETTERS[1:n], c, replace =T), Cluster2 = sample(1:n,c, replace =T))

#try every combination and count the matches
nmatch <- sapply(1:tries,function(x) {
  set.seed(x)
  comb <- sample(1:n,n)
  sum(LETTERS[match(df$Cluster2,comb)] == df$Cluster1)
  })

#pick the best performing translation
best <- which.max(nmatch)
# generate translation table
set.seed(best)
data.frame(Cluster2 = 1:n, Cluster2new = LETTERS[sample(1:n,n)])

nmatch[best]/c

或一个较慢的迭代过程:

solve <- function(start)
{
  sol <- integer()
  start <- sample(1:n)
  left <- start
  for(i in start){

    nmatch <- sapply(left, function(x) {
      cl <- df[df$Cluster2==x,]
      sum(LETTERS[cl$Cluster2] == cl$Cluster1)
    })
    ix <- which.max(nmatch)
    sol[i] <- left[ix]
    left <- left[-ix]
  }
  sol
}

nmatch <- sapply(1:tries, function(x) {
  set.seed(x)
  sum(LETTERS[match(df$Cluster2,solve(sample(1:n)))] == df$Cluster1)
})

best <- which.max(nmatch)

data.frame(Cluster2 = 1:n, Cluster2new = LETTERS[sample(1:n,n)])

nmatch[best]/c
例如,当您查看每个方法的
nmatch
分布时,第二个随机过程可能会更好地获得一个好的解决方案:


经过思考,我想我找到了一个简单的问题答案。我可以简单地使用一个循环来修剪它并找到匹配项

set.seed (1)
df <- data.frame(Cluster1 = sample(LETTERS[1:n], c, replace =T), Cluster2 = sample(1:n,c, replace =T))
findmatch <- function(df, group1 = "Cluster1", group2 = "Cluster2" ) {
    n <- length(unique(df[, group1]))
    matches <- matrix(NA, n, 2) 
    for(i in 1:n) {
        if(i==1) {
        table1 <- table(df[, group1], df[,group2])
        } else if(i<n) {
        table1 <- table1[-maxs[1],-maxs[2]]  
       } 
       maxs <- which(table1 == max(table1), arr.ind = TRUE)
       if(i < n) {
       matches[i,1:2] <- c(rownames(table1)[maxs[1]], colnames(table1)[maxs[2]])    
       } else {
         matches[i,1:2] <- c(rownames(table1)[-maxs[1]], colnames(table1)[-maxs[2]])    
     }
   }
   return(matches)
 }
findmatch(df=df)


      [,1] [,2]
 [1,] "J"  "5" 
 [2,] "I"  "7" 
 [3,] "A"  "6" 
 [4,] "E"  "3" 
 [5,] "D"  "10"
 [6,] "C"  "8" 
 [7,] "B"  "1" 
 [8,] "F"  "9" 
 [9,] "H"  "2" 
[10,] "G"  "4" 
set.seed(1)

df table(df)/nrow(df)我的目标是通过将A、B、C分配给集群2来最大化百分比一致性,因此集群2中的1、2、3也将变成A、B、C。在这种情况下,3将是B,1将是A,3将是C。我可以使用表(Df)来查找最大匹配的成员资格,但有时会因为多个匹配而变得复杂。Df$Var2这种方法很有前途,适用于少量集群。我的问题是,两种聚类方法中的每一种都有15个聚类,计算时间太长了!我添加了另一种方法,试图使它更快但更不准确。还有另一种方法。不知道有没有比我更具数学背景的人能更好地解决这个问题?
   Cluster2 Cluster2new
1         1           B
2         2           J
3         3           D
4         4           C
5         5           A
6         6           G
7         7           E
8         8           F
9         9           I
10       10           H
> 
  > nmatch[best]/c
[1] 0.1099
solve <- function(start)
{
  sol <- integer()
  start <- sample(1:n)
  left <- start
  for(i in start){

    nmatch <- sapply(left, function(x) {
      cl <- df[df$Cluster2==x,]
      sum(LETTERS[cl$Cluster2] == cl$Cluster1)
    })
    ix <- which.max(nmatch)
    sol[i] <- left[ix]
    left <- left[-ix]
  }
  sol
}

nmatch <- sapply(1:tries, function(x) {
  set.seed(x)
  sum(LETTERS[match(df$Cluster2,solve(sample(1:n)))] == df$Cluster1)
})

best <- which.max(nmatch)

data.frame(Cluster2 = 1:n, Cluster2new = LETTERS[sample(1:n,n)])

nmatch[best]/c
   Cluster2 Cluster2new
1         1           D
2         2           G
3         3           C
4         4           I
5         5           E
6         6           A
7         7           B
8         8           J
9         9           F
10       10           H
>     
  >     nmatch[best]/c
[1] 0.1121
set.seed (1)
df <- data.frame(Cluster1 = sample(LETTERS[1:n], c, replace =T), Cluster2 = sample(1:n,c, replace =T))
findmatch <- function(df, group1 = "Cluster1", group2 = "Cluster2" ) {
    n <- length(unique(df[, group1]))
    matches <- matrix(NA, n, 2) 
    for(i in 1:n) {
        if(i==1) {
        table1 <- table(df[, group1], df[,group2])
        } else if(i<n) {
        table1 <- table1[-maxs[1],-maxs[2]]  
       } 
       maxs <- which(table1 == max(table1), arr.ind = TRUE)
       if(i < n) {
       matches[i,1:2] <- c(rownames(table1)[maxs[1]], colnames(table1)[maxs[2]])    
       } else {
         matches[i,1:2] <- c(rownames(table1)[-maxs[1]], colnames(table1)[-maxs[2]])    
     }
   }
   return(matches)
 }
findmatch(df=df)


      [,1] [,2]
 [1,] "J"  "5" 
 [2,] "I"  "7" 
 [3,] "A"  "6" 
 [4,] "E"  "3" 
 [5,] "D"  "10"
 [6,] "C"  "8" 
 [7,] "B"  "1" 
 [8,] "F"  "9" 
 [9,] "H"  "2" 
[10,] "G"  "4"