Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/68.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 基于周的多特征聚类_R_Cluster Analysis_K Means_Hierarchical Clustering - Fatal编程技术网

R 基于周的多特征聚类

R 基于周的多特征聚类,r,cluster-analysis,k-means,hierarchical-clustering,R,Cluster Analysis,K Means,Hierarchical Clustering,我每周都有一组数据,包含两个功能-销售和pp。我想根据这些功能对它们进行聚类,理想情况下将全年交易模式最相似的功能进行分组 用我掌握的数据能做到这一点吗。我的理解是,功能是列,行是基于集群分配的标签,但我有几周的时间来考虑,所以我不知道这些应该在行还是列中 library(data.table) dt <- data.table(weeks = rep(seq(1:5),5), store = c(rep("a", 5), rep("b", 5), rep("c"

我有一组按周统计的数据,包含两个特征——销售额(sales)和 pp。我想根据这些特征对门店进行聚类,理想情况下把全年交易模式最相似的门店分到同一组。

用我掌握的数据能做到这一点吗?我的理解是,特征应放在列中,行则是要被分配聚类标签的对象;但我有多周的数据需要考虑,所以不确定"周"应该放在行中还是列中。

library(data.table)

# Toy data in long form: 5 stores x 5 weeks, with two features per row.
# Each feature draws 5 random values that are recycled for every store.
dt <- data.table(
  weeks = rep(seq_len(5), times = 5),
  store = rep(c("a", "b", "c", "e", "d"), each = 5),
  sales = rep(rnorm(5), times = 5),
  pp = rep(rnorm(5), times = 5)
)

# Reshape to wide form: one row per store, one column per feature/week pair.
dt <- dcast.data.table(dt, store ~ weeks, value.var = c("sales", "pp"))
library(data.table)

由于您有1000多家门店,下面的演示可能无法直接套用,但希望能为您指明正确的方向。

您可以按周,或按周的其他倍数(N = 4、8……)来分析门店聚类。

这里,我们以每周的频率查看商店集群:

测试数据

library(dplyr)     #data manipulation
library(ggdendro)  #extracting clusters
library(ggplot2)   #plotting
library(gridExtra) #for arranging ggplot graphs in grids 

# Fix the RNG seed so the simulated features are reproducible.
set.seed(42)

# Long-form test data: 5 stores x 5 weeks, one row per store/week pair,
# with two independent standard-normal features.
DF <- data.frame(
  weeks = rep(seq_len(5), times = 5),
  store = rep(c("A", "B", "C", "E", "D"), each = 5),
  sales = rnorm(25),
  pp = rnorm(25)
)

# Distinct week numbers, in order of first appearance.
weekInput <- unique(DF[["weeks"]])
集群的绘制

绘制各种树状图的良好资源如下: 及

对每一周,我们在数值型数据上计算距离矩阵,进行层次聚类,并使用 ggdendro 包绘图,最终输出一个由绘图对象组成的列表。

# Build one dendrogram plot per week.
#
# For every week `x` in `weekInput`, the rows of `DF` for that week are
# clustered hierarchically on their numeric columns, and the resulting tree
# is drawn with ggdendro/ggplot2 using the store names as leaf labels.
#
# Result: `plotList`, a list of ggplot objects in the order of `weekInput`.
plotList <- lapply(weekInput, function(x) {

  # Rows of the current week only. To pool a range of weeks instead, change
  # the filter to `weeks %in% c(x[1], x[2])`, where x[1] and x[2] are the
  # start and end points taken from weekInput.
  # `weeks` is dropped so that it is not used as a clustering feature.
  subsetWeek <- DF %>%
    filter(weeks == x) %>%
    select(-weeks) %>%
    as.data.frame()

  # Keep only the numeric features. `drop = FALSE` keeps a data frame even
  # when a single numeric column is left.
  isNumericCol <- vapply(subsetWeek, is.numeric, logical(1))
  numericDF <- subsetWeek[, isNumericCol, drop = FALSE]

  # hclust() needs at least two observations; fail with a readable message.
  if (nrow(numericDF) < 2L) {
    stop("Week ", x, " has fewer than 2 stores; cannot cluster.",
         call. = FALSE)
  }

  # Distance matrix on the numeric features, then hierarchical clustering.
  clustDF <- hclust(dist(numericDF))

  # You can choose to limit the clusters to N = n, as per your discretion
  # clustDF <- cutree(clustDF, 4)

  clustDF$labels <- subsetWeek$store

  # Use ggdendro to extract segment and label coordinates for plotting.
  dendroData <- dendro_data(as.dendrogram(clustDF), type = "rectangle")

  # Illustrative "area" groups: leaves are paired off in plotting order
  # (Area1, Area1, Area2, Area2, Area3, ...). Computing the vector from the
  # number of leaves keeps this working for any number of stores; replace it
  # with a real store-to-area lookup when one is available.
  Labels <- label(dendroData)
  Labels$group <- paste0("Area", ceiling(seq_len(nrow(Labels)) / 2))

  # `fontface` is a constant, so it is set outside aes(); `colour` maps to
  # the `group` column of `Labels`. The former `label.size` entry inside
  # aes() was not an aesthetic and had no effect, so it is omitted.
  ggplot(segment(dendroData)) +
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
    geom_label(
      data = Labels,
      aes(label = label, x = x, y = 0, colour = group),
      fontface = "bold"
    ) +
    ggtitle(paste0("Store Clusters for Week:", x)) +
    labs(color = "Area Names\n") +
    theme(legend.title = element_text(face = "bold"))
})
单周:

所有周

由于单列排列,可读性受到轻微影响

您是否认为门店(销售额、pp)的聚类每周都会有所不同?是的,数据中有1000多家门店,其中一些位于旅游胜地,因此天气非常炎热时,这些门店的数据会出现峰值等。因此,我预计门店之间会有明显的分组,可能还可以基于 pp 指标再细分出子集。通常我只取每周销售额/pp 的平均值,用它来代替逐周的数据,但我希望能够捕捉全年各周的变化。
# Build one dendrogram plot per week.
#
# For every week `x` in `weekInput`, the rows of `DF` for that week are
# clustered hierarchically on their numeric columns, and the resulting tree
# is drawn with ggdendro/ggplot2 using the store names as leaf labels.
#
# Result: `plotList`, a list of ggplot objects in the order of `weekInput`.
plotList <- lapply(weekInput, function(x) {

  # Rows of the current week only. To pool a range of weeks instead, change
  # the filter to `weeks %in% c(x[1], x[2])`, where x[1] and x[2] are the
  # start and end points taken from weekInput.
  # `weeks` is dropped so that it is not used as a clustering feature.
  subsetWeek <- DF %>%
    filter(weeks == x) %>%
    select(-weeks) %>%
    as.data.frame()

  # Keep only the numeric features. `drop = FALSE` keeps a data frame even
  # when a single numeric column is left.
  isNumericCol <- vapply(subsetWeek, is.numeric, logical(1))
  numericDF <- subsetWeek[, isNumericCol, drop = FALSE]

  # hclust() needs at least two observations; fail with a readable message.
  if (nrow(numericDF) < 2L) {
    stop("Week ", x, " has fewer than 2 stores; cannot cluster.",
         call. = FALSE)
  }

  # Distance matrix on the numeric features, then hierarchical clustering.
  clustDF <- hclust(dist(numericDF))

  # You can choose to limit the clusters to N = n, as per your discretion
  # clustDF <- cutree(clustDF, 4)

  clustDF$labels <- subsetWeek$store

  # Use ggdendro to extract segment and label coordinates for plotting.
  dendroData <- dendro_data(as.dendrogram(clustDF), type = "rectangle")

  # Illustrative "area" groups: leaves are paired off in plotting order
  # (Area1, Area1, Area2, Area2, Area3, ...). Computing the vector from the
  # number of leaves keeps this working for any number of stores; replace it
  # with a real store-to-area lookup when one is available.
  Labels <- label(dendroData)
  Labels$group <- paste0("Area", ceiling(seq_len(nrow(Labels)) / 2))

  # `fontface` is a constant, so it is set outside aes(); `colour` maps to
  # the `group` column of `Labels`. The former `label.size` entry inside
  # aes() was not an aesthetic and had no effect, so it is omitted.
  ggplot(segment(dendroData)) +
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
    geom_label(
      data = Labels,
      aes(label = label, x = x, y = 0, colour = group),
      fontface = "bold"
    ) +
    ggtitle(paste0("Store Clusters for Week:", x)) +
    labs(color = "Area Names\n") +
    theme(legend.title = element_text(face = "bold"))
})
# Start a fresh page, then draw every weekly plot stacked in a single column:
# each ggplot is converted to a grob and the grobs are row-bound together.
grid::grid.newpage()
grid::grid.draw(
  do.call(rbind, lapply(plotList, ggplotGrob))
)