R 基于列值按组对行进行聚类_R_Dplyr_Seq

R 基于列值按组对行进行聚类

R 基于列值按组对行进行聚类,r,dplyr,seq,R,Dplyr,Seq,我有以下资料： df <- data.frame(ID = c(1,1,1,1,1,1,1,1,1,1,2,2,2), Obs = c(0,1, 1, 0, 1,0,0, 1, 1, 1, 0,0,1)) df这里有一个使用rle的选项： df %>% group_by(ID) %>% mutate(clust = with(rle(Obs), rep(cumsum(values == 1), lengths))) # # A tibbl

我有以下资料：

# Example data: ten observations for ID 1 followed by three for ID 2.
# Obs is a 0/1 indicator column.
df <- data.frame(
  ID  = rep(c(1, 2), times = c(10, 3)),
  Obs = c(0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1)
)

df

这里有一个使用 rle 的方案：
# Within each ID, label every row with the number of runs of 1s that have
# started up to and including that row (rows before the first 1 get 0).
df %>%
  group_by(ID) %>%
  mutate(clust = {
    runs <- rle(Obs)
    rep(cumsum(runs$values == 1), times = runs$lengths)
  })
# # A tibble: 13 x 4
# # Groups:   ID [2]
# ID   Obs Cluster clust
# <dbl> <dbl>   <dbl> <int>
# 1    1.    0.      0.     0
# 2    1.    1.      1.     1
# 3    1.    1.      1.     1
# 4    1.    0.      1.     1
# 5    1.    1.      2.     2
# 6    1.    0.      2.     2
# 7    1.    0.      2.     2
# 8    1.    1.      3.     3
# 9    1.    1.      3.     3
# 10    1.    1.      3.     3
# 11    2.    0.      0.     0
# 12    2.    0.      0.     0
# 13    2.    1.      1.     1

rle(df$Obs) 会告诉您 Obs 列中每一段连续的 1 或 0 的长度（这里暂时忽略 ID 分组）。
我们现在需要的是：累计统计到目前为止出现了多少段连续的 1。要做到这一点，只需对“值为 1”的段做累计求和：
# Running count of runs of 1s: one entry per run in df$Obs (ID is ignored).
cumsum(rle(df$Obs)$values == 1)
# [1] 0 1 1 2 2 3 3 4

到目前为止还不错。现在需要把这些值按各连续段的长度重复，使每一行都得到一个值，因此我们使用 rep 以及 rle 返回的 lengths 信息：
# Expand the per-run counter back to one value per row by repeating each
# entry as many times as its run is long.
local({
  runs <- rle(df$Obs)
  rep(cumsum(runs$values == 1), times = runs$lengths)
})
# [1] 0 1 1 1 2 2 2 3 3 3 3 3 4

最后，我们按 ID 分组（group_by(ID)），在每个 ID 内分别完成上述计算。

如果需要为不同的obs列创建多个群集列，可以按如下方式轻松完成：
# For every column whose name starts with "Obs", add a companion column named
# "<column>_cluster" that numbers, within each ID, the runs of 1s seen so far.
# across() replaces mutate_at() + funs(): funs() is deprecated since
# dplyr 0.8.0 and mutate_at() is superseded. Requires dplyr >= 1.0.0.
# The result stays grouped by ID; append ungroup() if later steps should not
# operate per ID.
df %>%
  group_by(ID) %>%
  mutate(across(
    starts_with("Obs"),
    function(obs) {
      runs <- rle(obs)
      rep(cumsum(runs$values == 1), times = runs$lengths)
    },
    .names = "{.col}_cluster"
  ))

# # A tibble: 13 x 7
# # Groups:   ID [2]
# ID  Obs1  Obs2 ClusterObs1 ClusterObs2 Obs1_cluster Obs2_cluster
# <dbl> <dbl> <dbl>       <dbl>       <dbl>        <int>        <int>
# 1    1.    0.    0.          0.          0.            0            0
# 2    1.    1.    0.          1.          0.            1            0
# 3    1.    1.    0.          1.          0.            1            0
# 4    1.    0.    1.          1.          1.            1            1
# 5    1.    1.    1.          2.          1.            2            1
# 6    1.    0.    1.          2.          1.            2            1
# 7    1.    0.    0.          2.          1.            2            1
# 8    1.    1.    1.          3.          2.            3            2
# 9    1.    1.    0.          3.          2.            3            2
# 10    1.    1.    1.          3.          3.            3            3
# 11    2.    0.    0.          0.          0.            0            0
# 12    2.    0.    0.          0.          0.            0            0
# 13    2.    1.    1.          1.          1.            1            1

df %>% 
  group_by(ID) %>% 
  mutate_at(vars(starts_with("Obs")), 
            funs(cluster= with(rle(.), rep(cumsum(values == 1), lengths))))

# # A tibble: 13 x 7
# # Groups:   ID [2]
# ID  Obs1  Obs2 ClusterObs1 ClusterObs2 Obs1_cluster Obs2_cluster
# <dbl> <dbl> <dbl>       <dbl>       <dbl>        <int>        <int>
# 1    1.    0.    0.          0.          0.            0            0
# 2    1.    1.    0.          1.          0.            1            0
# 3    1.    1.    0.          1.          0.            1            0
# 4    1.    0.    1.          1.          1.            1            1
# 5    1.    1.    1.          2.          1.            2            1
# 6    1.    0.    1.          2.          1.            2            1
# 7    1.    0.    0.          2.          1.            2            1
# 8    1.    1.    1.          3.          2.            3            2
# 9    1.    1.    0.          3.          2.            3            2
# 10    1.    1.    1.          3.          3.            3            3
# 11    2.    0.    0.          0.          0.            0            0
# 12    2.    0.    0.          0.          0.            0            0
# 13    2.    1.    1.          1.          1.            1            1

其中df为：
# Example data with two 0/1 observation columns (Obs1, Obs2) and the expected
# cluster numbering for each (ClusterObs1, ClusterObs2).
df <- data.frame(
  ID          = rep(c(1, 2), times = c(10, 3)),
  Obs1        = c(0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1),
  Obs2        = c(0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1),
  ClusterObs1 = c(0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 0, 0, 1),
  ClusterObs2 = c(0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 0, 0, 1)
)

df

这是一个非常有趣的问题，因此这里给出一个 data.table 解决方案：
# Packages used
library(data.table)
library(magrittr)

# Setup
# Turn df into a data.table (setDT converts by reference, so no reassignment
# is needed) and store Obs as integer. Expects df to have columns ID and Obs.
setDT(df)
df[, Obs := as.integer(Obs)]

# Calculations
# Step 1: per ID, Cluster is the running count of zeros seen so far.
# Step 2: within each run of identical Obs values, subtract the row's position
#         in the run, but only on zero rows (`* !Obs` makes the term 0 where
#         Obs is 1), so all rows of one zero run end up with the same value.
#         NOTE(review): rleid(Obs) is computed over the whole table, not per
#         ID, so a zero run can span an ID boundary -- confirm this is harmless
#         for the data at hand.
# Step 3: per ID, dense-rank the values and subtract 1 so that cluster ids are
#         consecutive integers starting at 0.
# NOTE(review): for an ID whose first Obs is 1 this numbering starts at 0,
# whereas the rle-based cumsum(values == 1) approach starts at 1 -- confirm
# which convention is intended.
df[, Cluster := cumsum(!Obs), by = ID] %>%
  .[, Cluster := Cluster - rowid(Obs) * !Obs, by = rleid(Obs)] %>%
  .[, Cluster := frank(Cluster, ties.method = "dense") - 1L, by = ID]

df
    ID Obs Cluster
 1:  1   0       0
 2:  1   1       1
 3:  1   1       1
 4:  1   0       1
 5:  1   1       2
 6:  1   0       2
 7:  1   0       2
 8:  1   1       3
 9:  1   1       3
10:  1   1       3
11:  2   0       0
12:  2   0       0
13:  2   1       1

嗯，我试着用 rle 得到一个解决方案，但始终没有成功。你能简要解释一下 with(rle(Obs), rep(cumsum(values == 1), lengths)) 中发生了什么吗？那就太好了，我以前也尝试过使用 rle，但没有成功。另外，我该如何把它应用到多个列上？
@LAP，我补充了一些解释，希望现在更清楚了。
我们能不能直接用 group_by(ID) %>% mutate(Cluster = cumsum(Obs == 1 & lag(Obs) == 0))？
@mtoto，也许可以。但是如果第一个 Obs 就是 1 呢？
# For every column whose name starts with "Obs", add a companion column named
# "<column>_cluster" that numbers, within each ID, the runs of 1s seen so far.
# across() replaces mutate_at() + funs(): funs() is deprecated since
# dplyr 0.8.0 and mutate_at() is superseded. Requires dplyr >= 1.0.0.
# The result stays grouped by ID; append ungroup() if later steps should not
# operate per ID.
df %>%
  group_by(ID) %>%
  mutate(across(
    starts_with("Obs"),
    function(obs) {
      runs <- rle(obs)
      rep(cumsum(runs$values == 1), times = runs$lengths)
    },
    .names = "{.col}_cluster"
  ))

# # A tibble: 13 x 7
# # Groups:   ID [2]
# ID  Obs1  Obs2 ClusterObs1 ClusterObs2 Obs1_cluster Obs2_cluster
# <dbl> <dbl> <dbl>       <dbl>       <dbl>        <int>        <int>
# 1    1.    0.    0.          0.          0.            0            0
# 2    1.    1.    0.          1.          0.            1            0
# 3    1.    1.    0.          1.          0.            1            0
# 4    1.    0.    1.          1.          1.            1            1
# 5    1.    1.    1.          2.          1.            2            1
# 6    1.    0.    1.          2.          1.            2            1
# 7    1.    0.    0.          2.          1.            2            1
# 8    1.    1.    1.          3.          2.            3            2
# 9    1.    1.    0.          3.          2.            3            2
# 10    1.    1.    1.          3.          3.            3            3
# 11    2.    0.    0.          0.          0.            0            0
# 12    2.    0.    0.          0.          0.            0            0
# 13    2.    1.    1.          1.          1.            1            1

# Example data with two 0/1 observation columns (Obs1, Obs2) and the expected
# cluster numbering for each (ClusterObs1, ClusterObs2).
df <- data.frame(
  ID          = rep(c(1, 2), times = c(10, 3)),
  Obs1        = c(0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1),
  Obs2        = c(0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1),
  ClusterObs1 = c(0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 0, 0, 1),
  ClusterObs2 = c(0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 0, 0, 1)
)

# Packages used
library(data.table)
library(magrittr)

# Setup
# Turn df into a data.table (setDT converts by reference, so no reassignment
# is needed) and store Obs as integer. Expects df to have columns ID and Obs.
setDT(df)
df[, Obs := as.integer(Obs)]

# Calculations
# Step 1: per ID, Cluster is the running count of zeros seen so far.
# Step 2: within each run of identical Obs values, subtract the row's position
#         in the run, but only on zero rows (`* !Obs` makes the term 0 where
#         Obs is 1), so all rows of one zero run end up with the same value.
#         NOTE(review): rleid(Obs) is computed over the whole table, not per
#         ID, so a zero run can span an ID boundary -- confirm this is harmless
#         for the data at hand.
# Step 3: per ID, dense-rank the values and subtract 1 so that cluster ids are
#         consecutive integers starting at 0.
# NOTE(review): for an ID whose first Obs is 1 this numbering starts at 0,
# whereas the rle-based cumsum(values == 1) approach starts at 1 -- confirm
# which convention is intended.
df[, Cluster := cumsum(!Obs), by = ID] %>%
  .[, Cluster := Cluster - rowid(Obs) * !Obs, by = rleid(Obs)] %>%
  .[, Cluster := frank(Cluster, ties.method = "dense") - 1L, by = ID]

df
    ID Obs Cluster
 1:  1   0       0
 2:  1   1       1
 3:  1   1       1
 4:  1   0       1
 5:  1   1       2
 6:  1   0       2
 7:  1   0       2
 8:  1   1       3
 9:  1   1       3
10:  1   1       3
11:  2   0       0
12:  2   0       0
13:  2   1       1