基于不同大小数据集的值范围的R ifelse语句_R_If Statement_Dataframe_Dplyr

基于不同大小数据集的值范围的R ifelse语句

r if-statement dataframe

基于不同大小数据集的值范围的R ifelse语句,r,if-statement,dataframe,dplyr,R,If Statement,Dataframe,Dplyr,我有两个不同大小的数据集，记录了人们服用两种不同药物的开始和停止时间。我想将它们结合起来，以便明确地包括来自任意一个数据集的每一次，以及两种药物使用的相应变量（0/1）示例数据： library(dplyr) set.seed(100) df <- data.frame (id=c(1,1,1,1,2,2,2,3,3,3), start=c(0,10,16,21,0,13,21,0,6,9),

我有两个大小不同的数据集，分别记录了人们服用两种不同药物的开始和停止时间。我想将它们合并，使结果明确包含任一数据集中出现的每个时间点，并带有表示两种药物使用情况的相应变量（0/1）

示例数据：

library(dplyr)    
# Reproducible example data: the seed fixes the random drugA indicator below.
set.seed(100)

# Drug A periods: one row per (id, start, stop) with a 0/1 usage flag drawn
# from a Bernoulli(0.5) distribution.
df <- data.frame(
  id    = rep(c(1, 2, 3), times = c(4, 3, 3)),
  start = c(0, 10, 16, 21, 0, 13, 21, 0, 6, 9),
  stop  = c(9, 15, 20, 24, 12, 20, 25, 5, 8, 14),
  drugA = rbinom(10, 1, 0.5)
)

# Drug B periods: two rows per id, every recorded period flagged as 1.
df2 <- data.frame(
  id    = rep(c(1, 2, 3), each = 2),
  start = c(12, 20, 2, 12, 17, 22),
  stop  = c(18, 25, 8, 17, 19, 25),
  drugB = rep(1, 6)
)

到目前为止，我试图获得数据集的整体形状是：

# Build a skeleton of consecutive start/stop intervals for every id, using
# every time point that appears as a start or stop in either data set.

# All distinct time points, ascending.
# NOTE: the name `t` masks base::t() in this script; it is kept unchanged so
# that any later code referring to `t` keeps working.
t <- sort(unique(c(df$start, df$stop, df2$start, df2$stop)))

# One row per (id, time point); ids are taken from df only.
finaldf <- data.frame(id = rep(unique(df$id), each = length(t)))

# Repeat the whole time vector once per id. The previous
# `rep(t, each = length(finaldf))` only worked because length() of a data
# frame is its column count (1) and `$<-` then silently recycled the vector.
finaldf$stop <- rep(t, times = length(unique(df$id)))

finaldf <- finaldf %>%
  group_by(id) %>%
  # Each interval starts where the previous one (within the same id) stopped.
  mutate(start = lag(stop)) %>%
  ungroup() %>%
  # The first row of every id has no predecessor, so lag() yields NA there;
  # drop exactly those rows instead of relying on `start >= 0` being NA.
  filter(!is.na(start))

任何帮助都将不胜感激。谢谢

最简单的方法可能是首先将所有内容转换为更简单的长格式。具体来说，我会将所有内容转换为每单位时间有一行（即，时间1的状态条目，时间2的状态条目，等等）

为此，我首先将 data.frame 按 id 拆分（以便稍后填补空白），然后再按行拆分（以展开每个时段，使每个单位时间都有一个条目）。然后，对于每种药物，我使用

tidyr

中的

complete

补全所有缺失的时间点（假设这些时间点未在用药）。您这里的数据设计意味着每个个体在研究中的时长相同；如果实际并非如此，只需在

lapply

函数中为每个个体重新定义

allTimes

# Convert both start/stop data sets into one long table with a row per id and
# unit of time, carrying a drugA and a drugB indicator.

# Latest stop time recorded in either data set; the shared time grid runs
# from 0 up to this value for every id.
maxTime <- max(c(df$stop, df2$stop))

allTimes <- 0:maxTime

# Every id that occurs in at least one of the two data sets, ascending.
allIds <- sort(unique(c(df$id, df2$id)))

# Expand the start/stop periods of a single id into one row per unit of time.
#
# periods: data frame with columns id, start, stop and the column named by
#          `drugCol` (df or df2 in this script).
# thisID:  the id whose periods are expanded.
# drugCol: name of the drug indicator column, e.g. "drugA".
# times:   the full time grid; time points not covered by any period are
#          added with the indicator set to 0.
#
# Returns a tibble with columns time, id and <drugCol>, one row per element
# of `times`.
# NOTE(review): assumes the periods of one id do not overlap within a data
# set; overlapping periods would produce duplicate time rows - confirm.
expandPeriods <- function(periods, thisID, drugCol, times) {
  idPeriods <- filter(periods, id == thisID)

  if (nrow(idPeriods) == 0) {
    # This id has no record in this data set. The previous
    # `split(1:nrow(.))` failed here because 1:0 is c(1, 0); treat the id as
    # off the drug for the whole grid instead.
    expanded <- tibble(time = times, id = thisID, drug = 0)
  } else {
    expanded <-
      lapply(seq_len(nrow(idPeriods)), function(i) {
        # tibble() replaces the deprecated data_frame().
        tibble(
          id = thisID,
          time = idPeriods$start[i]:idPeriods$stop[i],
          drug = idPeriods[[drugCol]][i]
        )
      }) %>%
      bind_rows() %>%
      # Namespaced call so the block does not depend on tidyr being attached.
      tidyr::complete(time = times, fill = list(id = thisID, drug = 0))
  }

  # Give the generic indicator column its drug-specific name.
  names(expanded)[names(expanded) == "drug"] <- drugCol
  expanded
}

fullData <-
  lapply(allIds, function(thisID) {
    tempA <- expandPeriods(df, thisID, "drugA", allTimes)
    tempB <- expandPeriods(df2, thisID, "drugB", allTimes)

    # Both tables cover the same time grid; spell out the join keys rather
    # than relying on the natural join (and its message).
    left_join(tempA, tempB, by = c("time", "id"))
  }) %>%
  bind_rows()

    time    id drugA drugB
   <int> <dbl> <dbl> <dbl>
 1     0     1     0     0
 2     4     1     0     0
 3     9     1     0     0
 4    14     1     0     1
 5     0     2     0     0
 6     4     2     0     1
 7     9     2     0     0
 8    14     2     0     1
 9     0     3     0     0
10     4     3     0     0
11     9     3     0     0
12    14     3     0     0

但是，如果您真的想恢复您的开始-停止格式，您可以确定药物状态变化的点，然后为每个个体总结该时期：

# Collapse the per-time-unit table back into start/stop periods: a new period
# begins whenever drugA or drugB differs from the previous row of the same id.
reformatted <-
  fullData %>%
  group_by(id) %>%
  mutate(
    # default = -1 never equals a 0/1 indicator, so the first row of every id
    # always opens period 1.
    period = cumsum(
      (drugA != lag(drugA, default = -1)) |
        (drugB != lag(drugB, default = -1))
    )
  ) %>%
  group_by(id, period, drugA, drugB) %>%
  # First and last time unit covered by each period.
  summarise(
    start = min(time),
    stop = max(time)
  ) %>%
  ungroup()

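reformatted <-
  fullData %>%
  group_by(id) %>%
  mutate(
    drugChange =
      (drugA != lag(drugA, default = -1)) |
      (drugB != lag(drugB, default = -1))
    , period = cumsum(drugChange)
  ) %>%
  select(-drugChange) %>%
  group_by(id, period, drugA, drugB) %>%
  summarise(start = min(time)
            , stop = max(time)) %>%
  ungroup()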

      id period drugA drugB start  stop
   <dbl>  <int> <dbl> <dbl> <dbl> <dbl>
 1     1      1     0     0     0    11
 2     1      2     0     1    12    15
 3     1      3     1     1    16    18
 4     1      4     1     0    19    19
 5     1      5     1     1    20    20
 6     1      6     0     1    21    25
 7     2      1     0     0     0     1
 8     2      2     0     1     2     8
 9     2      3     0     0     9    11
10     2      4     0     1    12    17
11     2      5     0     0    18    20
12     2      6     1     0    21    25
13     3      1     0     0     0     5
14     3      2     1     0     6     8
15     3      3     0     0     9    16
16     3      4     0     1    17    19
17     3      5     0     0    20    21
18     3      6     0     1    22    25

      id period drugA drugB start  stop
1     1      1     0     0     0    11
2     1      2     0     1    12    15
3     1      3     1     1    16    18
4     1      4     1     0    19    19
5     1      5     1     1    20    20
6     1      6     0     1    21    25
7     2      1     0     0     0     1
8     2      2     0     1     2     8
9     2      3     0     0     9    11
10     2      4     0     1    12    17
11     2      5     0     0    18    20
12     2      6     1     0    21    25
13     3      1     0     0     0     5
14     3      2     1     0     6     8
15     3      3     0     0     9    16
16     3      4     0     1    17    19
17     3      5     0     0    20    21
18     3      6     0     1    22    25

您最后想要什么？因此，请包含您的预期输出。我已经在代码中给出了我想要的最终数据集，请参见

structure。我想为 drugA 和 drugB 构建时变变量，因为我正在运行一个边际结构模型。非常感谢！我确实需要用于时变分析的开始-停止格式。很高兴它对你有用。如果是这样，你可以接受这个答案，让其他人知道它解决了你的问题（issue）。如果没有，请告诉我它还缺少什么，我也许可以很容易地扩展一下答案。它运行得很好，我只是想在接受答案之前等等看是否还有其他解决方案，比如使用 ifelse 语句的基础 R 解决方案。也许存在更高效的方法（当前运行时间约 25 分钟），不过我知道任何方法都需要时间，因为我们正在创建一个大型数据集。如果没有，我将接受您的方法！再次感谢。
    time    id drugA drugB
   <int> <dbl> <dbl> <dbl>
 1     0     1     0     0
 2     4     1     0     0
 3     9     1     0     0
 4    14     1     0     1
 5     0     2     0     0
 6     4     2     0     1
 7     9     2     0     0
 8    14     2     0     1
 9     0     3     0     0
10     4     3     0     0
11     9     3     0     0
12    14     3     0     0

# Tile plot of drug exposure: one tile per id and time unit, filled by the
# combined "drugA-drugB" state. Tiles are 0.9 high, and the fill colours are
# four "Set1" Brewer colours taken in the order 3, 1, 2, 4.
ggplot(
  mutate(fullData, drugState = paste(drugA, drugB, sep = "-")),
  aes(x = time, y = id, fill = drugState)
) +
  geom_tile(height = 0.9) +
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(4, "Set1")[c(3, 1, 2, 4)]
  )

# Collapse the per-time-unit table back into start/stop periods: a new period
# begins whenever drugA or drugB differs from the previous row of the same id.
reformatted <-
  fullData %>%
  group_by(id) %>%
  mutate(
    # default = -1 never equals a 0/1 indicator, so the first row of every id
    # always opens period 1.
    period = cumsum(
      (drugA != lag(drugA, default = -1)) |
        (drugB != lag(drugB, default = -1))
    )
  ) %>%
  group_by(id, period, drugA, drugB) %>%
  # First and last time unit covered by each period.
  summarise(
    start = min(time),
    stop = max(time)
  ) %>%
  ungroup()

      id period drugA drugB start  stop
   <dbl>  <int> <dbl> <dbl> <dbl> <dbl>
 1     1      1     0     0     0    11
 2     1      2     0     1    12    15
 3     1      3     1     1    16    18
 4     1      4     1     0    19    19
 5     1      5     1     1    20    20
 6     1      6     0     1    21    25
 7     2      1     0     0     0     1
 8     2      2     0     1     2     8
 9     2      3     0     0     9    11
10     2      4     0     1    12    17
11     2      5     0     0    18    20
12     2      6     1     0    21    25
13     3      1     0     0     0     5
14     3      2     1     0     6     8
15     3      3     0     0     9    16
16     3      4     0     1    17    19
17     3      5     0     0    20    21
18     3      6     0     1    22    25