R：序列数据的整齐聚合和STEP函数的可视化_R_Ggplot2_Dplyr_Ggridges

R：序列数据的整齐聚合和STEP函数的可视化

R：序列数据的整齐聚合和STEP函数的可视化,r,ggplot2,dplyr,ggridges,R,Ggplot2,Dplyr,Ggridges,我有一些患者数据，其中个别患者随时间改变治疗组。我的目标是可视化组变化的顺序，并将这些数据聚合为每个治疗组的“序列配置文件” 对于每一个治疗组，我想展示它通常发生的时间在治疗周期中（更确切地说是在开始或结束时）。为了解释不同的序列长度，我想在NN 0（最开始）和1（结束）之间标准化这些配置文件我想找到一个有效的数据准备和可视化极小示例数据结构库（dplyr）图书馆（purrr）图书馆（GG2） #最小数据 cj_df_原始% 分组依据（id）%>% 变异（位置=行号（）， len=

我有一些患者数据，其中个别患者随时间改变治疗组。我的目标是可视化组变化的顺序，并将这些数据聚合为每个治疗组的“序列配置文件”

对于每一个治疗组，我想展示它通常发生的时间在治疗周期中（更确切地说是在开始或结束时）。为了解释不同的序列长度，我想在NN 0（最开始）和1（结束）之间标准化这些配置文件

我想找到一个有效的数据准备和可视化

极小示例数据结构

库（dplyr）
图书馆（purrr）
图书馆（GG2）
#最小数据
cj_df_原始%
分组依据（id）%>%
变异（位置=行号（），
len=长度（id），
开始=（位置-1）/len，
结束=位置/长度）%>%
过滤器（组==“A”）
#>#tibble:3 x 6
#>#组：id[2]
#>id组pos len起始端
#>        
#>1A 120.5
#>2 A 1 3 0.333
#>3 2 A 3 0.667 1

（因此，Id 1在其序列的前50%在A组中，Id 2在其序列的前33%和后33%在A组中。这意味着，2个Id在序列的0-33%之间，1个在33-50%之间，0个在50-66%之间，1个在66%以上。）

这是我想要实现的结果，我错过了有效转换数据的机会

预期结果

profile\u treatmen\u a%
ggplot（aes（x，y））+
几何步进（方向=“vh”）+
扩展_极限（x=c（0，1），y=0）

（理想情况下，曲线下的区域将被着色）

理想的解决方案：通过GGR 可视化的目标是同时比较多个治疗组的“序列图”。如果我能相应地准备数据，我想使用ggridges软件包对治疗组进行一次引人注目的视觉比较

库（ggridges）
数据帧（组=代表（字母[1:2]，每个=20），
平均值=rep（2个，每个=20））%>%
突变（计数=runif（nrow（%））%%>
ggplot（aes（x=计数，y=组，填充=组））+
geom_山脊线（stat=“binline”，binwidth=0.5，scale=0.9）

您可以构建帮助器间隔，然后只绘制直方图。由于每位患者都属于A组或B组，因此两组的总和为100%。使用这些辅助间隔，您还可以轻松切换到其他

geom

library(tidyverse, warn.conflicts = FALSE)
library(ggplot2)

# create sample data
set.seed(42)

id <- 1:10 %>% map(~ rep(x = .x, times = runif(n = 1, min = 1, max = 6))) %>%
  unlist()
group <- sample(x = c("A", "B"), size = length(id), replace = TRUE) %>%
  as_factor()
df <- tibble(id, group)
glimpse(df)
#> Observations: 37
#> Variables: 2
#> $ id    <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5,...
#> $ group <fct> A, B, B, A, A, B, B, A, A, B, B, A, B, B, A, B, A, B, A,...

# tidy data
df <- df %>%
  group_by(id) %>%
  mutate(from = (row_number() - 1) / n(),
         to = row_number() / n()) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(list = seq(from + 1/60, to, 1/60) %>% list()) %>%
  unnest()

# plot
df %>%
  ggplot(aes(x = list, fill = group)) +
  geom_histogram(binwidth = 1/60) +
  ggthemes::theme_hc()

您可以建立辅助间隔，然后只绘制直方图。由于每位患者都属于A组或B组，因此两组的总和为100%。使用这些辅助间隔，您还可以轻松切换到其他

geom

library(tidyverse, warn.conflicts = FALSE)
library(ggplot2)

# create sample data
set.seed(42)

id <- 1:10 %>% map(~ rep(x = .x, times = runif(n = 1, min = 1, max = 6))) %>%
  unlist()
group <- sample(x = c("A", "B"), size = length(id), replace = TRUE) %>%
  as_factor()
df <- tibble(id, group)
glimpse(df)
#> Observations: 37
#> Variables: 2
#> $ id    <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5,...
#> $ group <fct> A, B, B, A, A, B, B, A, A, B, B, A, B, B, A, B, A, B, A,...

# tidy data
df <- df %>%
  group_by(id) %>%
  mutate(from = (row_number() - 1) / n(),
         to = row_number() / n()) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(list = seq(from + 1/60, to, 1/60) %>% list()) %>%
  unnest()

# plot
df %>%
  ggplot(aes(x = list, fill = group)) +
  geom_histogram(binwidth = 1/60) +
  ggthemes::theme_hc()

我试图回答。。虽然这可能不是最好的/最快的/最有效的方法，但我认为它可能会对你的努力有所帮助

library(data.table)
# compute "intervals" for each person [start, end]
df <- cj_df_raw %>% 
  group_by(id) %>% 
  mutate(pos = row_number(),
         len = length(id),
         from = (pos - 1) / len,
         to = pos / len,
         value = 1)

dt <- as.data.table(df)
setkey(dt, from, to)

#create intervals
dt.interval <- data.table(from = seq( from = 0, by = 0.01, length.out = 100),
                          to = seq( from = 0.01, by = 0.01, length.out = 100))

#perform overlap join on intervals
dt2 <- foverlaps( dt.interval, dt, type = "within", nomatch = NA)[, sum(value), by = c("i.from", "group")]
#some melting ans casting to fill in '0' on empty intervals
dt3 <- melt( dcast(dt2, ... ~ group, fill = 0), id.vars = 1 )

#plot
ggplot( dt3 ) +
  geom_step( aes( x = i.from, y = value, color = variable ) ) + 
  facet_grid( .~variable )

库（data.table）
#计算每个人的“时间间隔”[开始，结束]
df%
分组依据（id）%>%
变异（位置=行号（），
len=长度（id），
from=（位置-1）/len，
to=位置/长度，
值=1）
我试图找到答案。。虽然这可能不是最好的/最快的/最有效的方法，但我认为它可能会对你的努力有所帮助
library(data.table)
# compute "intervals" for each person [start, end]
df <- cj_df_raw %>% 
  group_by(id) %>% 
  mutate(pos = row_number(),
         len = length(id),
         from = (pos - 1) / len,
         to = pos / len,
         value = 1)

dt <- as.data.table(df)
setkey(dt, from, to)

#create intervals
dt.interval <- data.table(from = seq( from = 0, by = 0.01, length.out = 100),
                          to = seq( from = 0.01, by = 0.01, length.out = 100))

#perform overlap join on intervals
dt2 <- foverlaps( dt.interval, dt, type = "within", nomatch = NA)[, sum(value), by = c("i.from", "group")]
#some melting ans casting to fill in '0' on empty intervals
dt3 <- melt( dcast(dt2, ... ~ group, fill = 0), id.vars = 1 )

#plot
ggplot( dt3 ) +
  geom_step( aes( x = i.from, y = value, color = variable ) ) + 
  facet_grid( .~variable ) 

库（data.table）
#计算每个人的“时间间隔”[开始，结束]
df%
分组依据（id）%>%
变异（位置=行号（），
len=长度（id），
from=（位置-1）/len，
to=位置/长度，
值=1）
dt