在 R 中对用户进行聚类:监控聚类结构的变化,以检测“消失”或“移动”的簇

在R中对用户进行聚类;监控集群结构的变化,以检测;“消失”;或;“移动”;簇,r,for-loop,cluster-analysis,k-means,prediction,R,For Loop,Cluster Analysis,K Means,Prediction,我正在处理一个纵向用户事件生成的数据集,并尝试使用k-means将数据中的用户ID按月-年级别进行聚类。我的想法是,我希望看到用户如何在不同的时间点从不同的集群原型中消失或移动到不同的集群原型中 这里是我到目前为止的代码,它包含一个模拟数据帧和集群过程 library(Pareto) library(uuid) library(ggplot2) library(data.table) library(zoo) #generating the user ID variable set.seed(1

我正在处理一个由用户事件生成的纵向数据集,并尝试使用 k-means 按“月-年”粒度对数据中的用户 ID 进行聚类。我的目的是观察用户在不同时间点如何从某个聚类原型中消失,或者移动到另一个聚类原型中。

下面是我目前的代码,其中包含一个模拟数据框和聚类过程:

library(Pareto)
library(uuid)
library(ggplot2)
library(data.table)
library(zoo)
# Simulates a longitudinal user-event data set and then, for each month,
# clusters users with k-means on their cumulative activity up to that month,
# collecting one (user, cluster label, month) row per user in resultsdf.
# NOTE(review): as written this script stops with an error inside the loop;
# the NOTE(review) comments below mark the lines involved.
#generating the user ID variable
set.seed(1)
n_users <- 1300
n_rows <- 365000

# Per-user weights drawn with rPareto(); they are used as the sampling
# probabilities for the IDs below.
relative_probs <- rPareto(n = n_users, t = 1, alpha = 0.3, truncation = 500) 
unique_ids <- UUIDgenerate(n = n_users)

# One ID per simulated event, sampled with replacement using those weights.
id_sample <- sample(unique_ids, size = n_rows, prob = relative_probs, replace = TRUE)

# A bare name at top level auto-prints the vector.
# NOTE(review): looks like a leftover inspection line - confirm it is wanted.
id_sample


df<-data.frame(id_sample)


#creating the date variable
# One random day between 2015-01-01 and 2017-12-31 per event.
df$Date<-sample(seq(as.Date("2015-01-01"), as.Date("2017-12-31"), by = "1 day"), 
                size = n_rows,replace = T)

#creating a numeric value called Duration; this will be a feature in the clustering
df$Duration<-sample(0:3000, nrow(df), replace = T)
# NOTE(review): `%>%`, arrange(), filter(), group_by(), summarise(), select()
# and mutate() are used from here on, but the library() calls preceding this
# script attach only Pareto, uuid, ggplot2, data.table and zoo; presumably
# library(dplyr) is also required - confirm.
df<-df%>%arrange(Date)
#getting Month-Year
# Each event's date is converted to a yearmon and back to a Date, and the
# distinct values are kept as the loop cut-offs.
df$MonthYear<-as.Date(as.yearmon(df$Date, "%m/%Y"))
MonthYear<-unique(df$MonthYear)

#empty results df
resultsdf<-data.frame()

# One iteration per distinct month. for() iterates over the underlying numeric
# values of a Date vector, so `i` is a day count since 1970-01-01, not a Date.
for (i in MonthYear) {
  #getting variables for clustering. I need to cluster based on the number of  times the User appears in the data i.e. "Count"
  #the second variable is the mean duration for each User ID i.e. "MeanDur" 
  #the third and final variable is the standard deviation of duration i.e. "SDDur"
  
# Cumulative window: every event dated up to and including month `i`.
# Inside filter(), MonthYear refers to the df column, not the global vector.
df_filtered<-df%>%
filter(MonthYear<=i)

  # One row per user with the three clustering features.
  callerData<-df_filtered%>%
    group_by(id_sample)%>%
    summarise(Count=n(),MeanDur=mean(Duration),SDDur=sd(Duration))
 #convert NA to zero's
 # (sd() is NA for a user with a single event in the window)
  callerData$SDDur[is.na(callerData$SDDur)]<-0.0 
  #scale data
  # Columns 2:4 are Count, MeanDur and SDDur.
  scaledData<-scale(callerData[,2:4])
  
  # Seed reset every iteration so each month's k-means start is reproducible.
  set.seed(20)
  clust<-kmeans(scaledData, centers= 5,nstart = 15)
  #pinning cluster number back onto callerData
  callerData$Cluster<-clust$cluster
  #getting cluster means and creating a rank order based on "Count" 
  callerData_centers<-callerData%>%
    group_by(Cluster)%>%
    summarise(Count=mean(Count),MeanDur=mean(MeanDur),SDDur=mean(SDDur))%>%
    arrange(Count)
    
# NOTE(review): `callerDate_centers` is never created in this script; the
# object built just above is `callerData_centers`, so this line presumably
# contains a typo - confirm.
callerDate_centers$Rank<-c(1:5)
  #Once the new ranking variable is created, I then use the code below to consistently name the clusters based on their rank
  # setDT() converts callerData_centers to a data.table by reference so that
  # `:=` can add the ClusName column.
  setDT(callerData_centers)[Rank==1,ClusName:="Cluster 1"]
  callerData_centers[Rank==2,ClusName:="Cluster 2"]
  callerData_centers[Rank==3,ClusName:="Cluster 3"]
  callerData_centers[Rank==4,ClusName:="Cluster 4"]
  callerData_centers[Rank==5,ClusName:="Cluster 5"]
#get the ClusName variable and the Cluster; this is then used to merge the new name back onto callerData
  callerData_vars<-callerData_centers%>%select(Cluster,ClusName)
  callerData<-merge(callerData,callerData_vars, by="Cluster")
 
    # NOTE(review): the columns created above are id_sample, Count, MeanDur,
    # SDDur, Cluster and ClusName; `CallerId` and `ClusterName` are not among
    # them - presumably id_sample and ClusName were intended; confirm.
    newVars<-callerData%>%
    select(CallerId,ClusterName)%>%
    mutate(MonthYear=i)
 
# Appends this month's labels to the results.
resultsdf<-rbind(resultsdf,newVars)
}


head(resultsdf)


library(Pareto)
library(uuid)
library(ggplot2)
library(data.table)
library(zoo)
#generating the user ID variable
set.seed(1)

n_users

我必须稍微修改您的代码才能使其运行:`CallerId` 和 `ClusterName` 并不是 `callerData` 中的列。因此,请先运行以下代码:

library(Pareto)
library(uuid)
library(ggplot2)
library(data.table)
library(zoo)
library(dplyr)
# Simulate a longitudinal user-event log and, for every month, cluster users
# with k-means on their cumulative activity up to and including that month.
#
# Objects left at top level for later code:
#   df        - one row per simulated event (id_sample, Date, Duration,
#               MonthYear)
#   MonthYear - Date vector of the distinct month cut-offs found in df
#   resultsdf - one row per (user, month cut-off) with columns id_sample,
#               ClusName and MonthYear (a Date)

# generating the user ID variable
set.seed(1)
n_users <- 1300
n_rows <- 365000

# Pareto-distributed weights, used below as per-user sampling probabilities.
relative_probs <- rPareto(
  n = n_users,
  t = 1,
  alpha = 0.3,
  truncation = 500
)
unique_ids <- UUIDgenerate(n = n_users)

id_sample <- sample(
  unique_ids,
  size = n_rows,
  prob = relative_probs,
  replace = TRUE
)

# Inspect only the first few IDs; auto-printing the full vector floods the
# console.
head(id_sample)

df <- data.frame(id_sample)

# creating the date variable
df$Date <- sample(
  seq(as.Date("2015-01-01"), as.Date("2017-12-31"), by = "1 day"),
  size = n_rows,
  replace = TRUE
)

# creating a numeric value called Duration; this will be a feature in the
# clustering
df$Duration <- sample(0:3000, nrow(df), replace = TRUE)
df <- df %>% arrange(Date)

# getting Month-Year (as a Date so it can be compared and sorted)
df$MonthYear <- as.Date(as.yearmon(df$Date, "%m/%Y"))
MonthYear <- unique(df$MonthYear)

# Number of k-means clusters; the rank labels below follow from it.
n_clusters <- 5

# Cluster every user seen up to and including `cutoff` (a length-one Date).
# Features per user: event count, mean Duration, sd of Duration.
# Returns a data.frame with columns id_sample, ClusName and MonthYear, where
# ClusName is "Cluster 1" ... "Cluster <n_clusters>" ordered by ascending mean
# event count, so labels are comparable from month to month.
cluster_users_until <- function(cutoff) {
  callerData <- df %>%
    # Inside filter(), MonthYear is the df column, not the global vector.
    filter(MonthYear <= cutoff) %>%
    group_by(id_sample) %>%
    summarise(
      Count = n(),
      MeanDur = mean(Duration),
      SDDur = sd(Duration)
    )
  # sd() is NA for a user with a single event; treat that as zero spread.
  callerData$SDDur[is.na(callerData$SDDur)] <- 0.0

  scaledData <- scale(callerData[, c("Count", "MeanDur", "SDDur")])

  # Reset the seed on every call so each month's k-means run is reproducible.
  set.seed(20)
  clust <- kmeans(scaledData, centers = n_clusters, nstart = 15)
  callerData$Cluster <- clust$cluster

  # k-means cluster numbers are arbitrary, so rank clusters by mean Count and
  # name them by rank.
  callerData_centers <- callerData %>%
    group_by(Cluster) %>%
    summarise(Count = mean(Count)) %>%
    arrange(Count)
  callerData_centers$Rank <- seq_len(nrow(callerData_centers))
  callerData_centers$ClusName <- paste("Cluster", callerData_centers$Rank)

  callerData <- merge(
    callerData,
    callerData_centers[, c("Cluster", "ClusName")],
    by = "Cluster"
  )

  newVars <- callerData[, c("id_sample", "ClusName")]
  newVars$MonthYear <- cutoff
  newVars
}

# Index into MonthYear rather than iterating over it with for(): for() hands
# the loop variable the bare numeric value and drops the Date class, which
# would store MonthYear as day counts. Collecting the pieces in a list and
# binding once also avoids re-copying resultsdf on every iteration.
monthly_results <- lapply(
  seq_along(MonthYear),
  function(k) cluster_users_until(MonthYear[k])
)
resultsdf <- bind_rows(monthly_results)
library(Pareto)
library(uuid)
library(ggplot2)
library(data.table)
library(zoo)
library(dplyr)
#generating the user ID variable
#generating the user ID variable
set.seed(1)

n_users

非常感谢您的回复。看起来很清楚,我马上就试试。我想再问最后一个问题:当 `id_sample` 改变聚类时,`res` 中能否增加一列,说明发生变化时的 MonthYear 或日期?如果不太麻烦的话,能否再增加两列,分别说明用户之前所在的聚类和跳转到的新聚类?当然,前提是不会太麻烦。我认为这将有助于我之后要做的后续分析。到目前为止一切都很棒,谢谢!
`dfwide_subset[, c(2:5)] == dfwide_subset[, 2]` 检查第 2 至 5 列中的聚类是否等于第 2 列中的聚类,并直观地展示检查结果。如果为 `FALSE`,则该月的聚类与第一个月份列(即第 2 列)中的聚类不同。另一种方法是:在 `resultsdf` 中对每个 id 将聚类滞后一期,然后比较它是否等于当前聚类。使用 data.table 应该很容易实现。
# long to wide: one row per user, one column per month cut-off; each cell
# holds the user's cluster label for that month (NA if the user has no row
# for that month in resultsdf).
# resultsdf is converted to a data.table first because data.table::dcast()
# only handles data.frame input through a deprecated reshape2 redirection.
dfwide <- data.table::dcast(
  data.table::as.data.table(resultsdf),
  formula = id_sample ~ MonthYear,
  value.var = "ClusName"
)
# Back to a plain data.frame so base-R indexing such as dfwide[, 2] returns a
# vector.
dfwide <- as.data.frame(dfwide)

# dcast() emits the month columns in ascending order, so label them from the
# sorted cut-offs. The column count is checked rather than hard-coded.
stopifnot(ncol(dfwide) - 1L == length(MonthYear))
colnames(dfwide)[-1] <- paste0("Date_", sort(MonthYear))

> dfwide[1:5, 1:5]
                             id_sample Date_2015-01-01 Date_2015-02-01 Date_2015-03-01 Date_2015-04-01
1 0025a4ba-d620-4ffc-a82d-660354c2b21d            <NA>       Cluster 3       Cluster 3       Cluster 3
2 00403759-46d8-4c60-b298-a57e6299b2ca       Cluster 3       Cluster 3       Cluster 3       Cluster 3
3 005e6e19-8e02-4326-993d-1fffa0b70c67       Cluster 4       Cluster 4       Cluster 4       Cluster 4
4 007c99ef-7e37-42c0-8883-90c275c03eab       Cluster 3       Cluster 3       Cluster 3       Cluster 3
5 007e70f9-d960-4679-8088-a1065ea9835c       Cluster 3       Cluster 3       Cluster 3       Cluster 2
# Check whether users stay in their first-month cluster.
# Only users with a cluster label in every month column are kept.
dfwide_subset <- na.omit(dfwide)

# Logical matrix: TRUE where the label in columns 2:5 equals the label in
# column 2 (the first month column).
matches_first <- dfwide_subset[, 2:5] == dfwide_subset[, 2]

# switches_cluster is the number of months within columns 2:5 whose label
# differs from the first month. It is a difference-from-baseline count, not a
# count of month-to-month transitions.
res <- data.frame(
  id_sample = dfwide_subset$id_sample,
  switches_cluster = rowSums(!matches_first)
)

> head(res)
                             id_sample switches_cluster
2 00403759-46d8-4c60-b298-a57e6299b2ca                0
3 005e6e19-8e02-4326-993d-1fffa0b70c67                0
4 007c99ef-7e37-42c0-8883-90c275c03eab                0
5 007e70f9-d960-4679-8088-a1065ea9835c                1
6 00b90528-5ee1-40f2-a2ba-5b6fc4b3707f                0
8 00d5cf0c-e2b4-4ed6-a69c-776b83ff8697                0