R 如果两个字段跨年，则基于行重复_R_Dplyr

R 如果两个字段跨年，则基于行重复

R 如果两个字段跨年，则基于行重复,r,dplyr,R,Dplyr,我有以下数据结构： df <- data.frame('unique_ref' = c("a_2016","a_2016","a_2016"), 'trans_type' = c('NB','MTA','MTA'), 'incept_dt' = c('01/07/2016','01/07/2016','01/07/2016'), 'exp_dt' = c('30/06/2017','30/

我有以下数据结构：

library(dplyr)

# Example data: three rows sharing one `unique_ref`. Every `*_dt` column is
# supplied as a "dd/mm/yyyy" string.
df <- data.frame(
  unique_ref = c("a_2016", "a_2016", "a_2016"),
  trans_type = c("NB", "MTA", "MTA"),
  incept_dt = c("01/07/2016", "01/07/2016", "01/07/2016"),
  exp_dt = c("30/06/2017", "30/06/2017", "30/06/2017"),
  trans_dt = c("01/07/2016", "01/10/2016", "01/02/2017"),
  trans_end_dt = c("30/09/2016", "31/01/2017", "30/06/2017")
)

# Parse every column whose name ends in "_dt" into a Date. `across()` is the
# current replacement for the superseded `mutate_at()`.
df <- df %>%
  mutate(across(ends_with("_dt"), ~ as.Date(.x, format = "%d/%m/%Y")))

> df
  unique_ref trans_type  incept_dt     exp_dt   trans_dt trans_end_dt
1     a_2016         NB 2016-07-01 2017-06-30 2016-07-01   2016-09-30
2     a_2016        MTA 2016-07-01 2017-06-30 2016-10-01   2017-01-31
3     a_2016        MTA 2016-07-01 2017-06-30 2017-02-01   2017-06-30

第2行的 trans_dt 和 trans_end_dt 跨越了两个年份。因此，需要把第2行的 trans_end_dt 更新为2016年的最后一天，然后新增一行：该行与原第2行完全相同，只是 trans_dt 改为2017年的第一天，trans_end_dt 保持原第2行的 trans_end_dt

原来的第3行的 trans_dt 和 trans_end_dt 在同一年内，没有跨年，因此没有问题，不需要任何处理

我的实际数据集中有许多不同的 unique_ref，因此如果可能，这需要以自动化的方式完成

最好是 dplyr 解决方案，但任何方法我都可以接受，因为我自己实在想不出该怎么做

一种可能的做法是同时使用 tidyr：

library(dplyr)
library(tidyr)

# Split every row whose trans_dt and trans_end_dt fall in different calendar
# years into two rows:
#   * copy 1 keeps trans_dt and ends on 31 December of the trans_dt year;
#   * copy 2 starts on 1 January of the trans_end_dt year and keeps
#     trans_end_dt.
# Rows that stay inside one calendar year pass through unchanged.
df %>%
  # The original pipeline returned a tibble (via group_by()); keep that.
  as_tibble() %>%
  # Select the two date columns by name rather than by position (5:6) so the
  # code does not depend on column order.
  mutate(
    across(c(trans_dt, trans_end_dt), ~ as.Date(.x, format = "%d/%m/%Y"))
  ) %>%
  mutate(
    spans_years = format(trans_dt, "%Y") != format(trans_end_dt, "%Y")
  ) %>%
  # TRUE + 1L == 2 copies for year-spanning rows, FALSE + 1L == 1 otherwise.
  # `.id` numbers the copies of each input row (1, 2), which removes the need
  # for a per-row group_by() just to call row_number().
  uncount(spans_years + 1L, .id = "part") %>%
  mutate(
    trans_end_dt = if_else(
      spans_years & part == 1L,
      as.Date(paste0(format(trans_dt, "%Y"), "-12-31")),
      trans_end_dt
    ),
    # Copy 2 still carries the original trans_end_dt at this point, so its
    # year is the year the new row has to start in.
    trans_dt = if_else(
      spans_years & part == 2L,
      as.Date(paste0(format(trans_end_dt, "%Y"), "-01-01")),
      trans_dt
    )
  ) %>%
  select(-spans_years, -part)

  unique_ref trans_type incept_dt  exp_dt     trans_dt   trans_end_dt
  <chr>      <chr>      <chr>      <chr>      <date>     <date>      
1 a_2016     NB         01/07/2016 30/06/2017 2016-07-01 2016-09-30  
2 a_2016     MTA        01/07/2016 30/06/2017 2016-10-01 2016-12-31  
3 a_2016     MTA        01/07/2016 30/06/2017 2017-01-01 2017-01-31  
4 a_2016     MTA        01/07/2016 30/06/2017 2017-02-01 2017-06-30

df %>%
 mutate_at(vars(5:6), ~ as.Date(., format = "%d/%m/%Y")) %>%
 rowid_to_column() %>%
 uncount((format(trans_dt, "%Y") != format(trans_end_dt, "%Y")) + 1) %>%
 group_by(rowid) %>%
 mutate(trans_end_dt = if_else(row_number() == 1 & n() != 1, 
                       as.Date(paste0(format(trans_dt, "%Y"),"-12-31")), 
                       trans_end_dt),
        trans_dt = if_else(row_number() == 2 & n() != 1, 
                           as.Date(paste0(format(trans_end_dt, "%Y"),"-01-01")), 
                           trans_dt)) %>%
 ungroup() %>%
 select(-rowid)

  unique_ref trans_type incept_dt  exp_dt     trans_dt   trans_end_dt
1 a_2016     NB         01/07/2016 30/06/2017 2016-07-01 2016-09-30  
2 a_2016     MTA        01/07/2016 30/06/2017 2016-10-01 2016-12-31  
3 a_2016     MTA        01/07/2016 30/06/2017 2017-01-01 2017-01-31  
4 a_2016     MTA        01/07/2016 30/06/2017 2017-02-01 2017-06-30

这只使用了base R，但我认为这样更容易

# Split every row whose trans_dt and trans_end_dt fall in different calendar
# years into two rows, using base R only. The second-year halves stay in place
# in `df`; the first-year halves are appended at the bottom by rbind().

# Find the rows where the years are different for trans_dt and trans_end_dt
inds <- which(format(df$trans_dt, "%Y") != format(df$trans_end_dt, "%Y"))
# Keep an unmodified copy of those rows; it becomes the first-year half
subset_df <- df[inds, ]
# In df, move trans_dt of those rows to 1 January of the trans_end_dt year.
# sprintf() yields a zero-length vector when `inds` is empty, whereas paste0()
# would yield the single string "-01-01" and make the code fail when no row
# crosses a year boundary.
df$trans_dt[inds] <- as.Date(
  sprintf("%s-01-01", format(df$trans_end_dt[inds], "%Y")),
  format = "%Y-%m-%d"
)
# In the copy, move trans_end_dt to 31 December of the trans_dt year
subset_df$trans_end_dt <- as.Date(
  sprintf("%s-12-31", format(subset_df$trans_dt, "%Y")),
  format = "%Y-%m-%d"
)
# rbind both data sets to get the duplicated rows
rbind(df, subset_df)

#  unique_ref trans_type  incept_dt     exp_dt   trans_dt  trans_end_dt
#1      a_2016         NB 2016-07-01 2017-06-30 2016-07-01   2016-09-30
#2      a_2016        MTA 2016-07-01 2017-06-30 2017-01-01   2017-01-31
#3      a_2016        MTA 2016-07-01 2017-06-30 2017-02-01   2017-06-30
#21     a_2016        MTA 2016-07-01 2017-06-30 2016-10-01   2016-12-31

#查找trans_dt和trans_end_dt年份不同的行
我不认为这是有效的，因为结果中仍然存在 trans_dt 和 trans_end_dt 跨年份的行；新行的 trans_dt 也需要相应更新。
@user33484 好的，我漏掉了这一点，现已更新。
# Split every row whose trans_dt and trans_end_dt fall in different calendar
# years into two rows, using base R only. The second-year halves stay in place
# in `df`; the first-year halves are appended at the bottom by rbind().

# Find the rows where the years are different for trans_dt and trans_end_dt
inds <- which(format(df$trans_dt, "%Y") != format(df$trans_end_dt, "%Y"))
# Keep an unmodified copy of those rows; it becomes the first-year half
subset_df <- df[inds, ]
# In df, move trans_dt of those rows to 1 January of the trans_end_dt year.
# sprintf() yields a zero-length vector when `inds` is empty, whereas paste0()
# would yield the single string "-01-01" and make the code fail when no row
# crosses a year boundary.
df$trans_dt[inds] <- as.Date(
  sprintf("%s-01-01", format(df$trans_end_dt[inds], "%Y")),
  format = "%Y-%m-%d"
)
# In the copy, move trans_end_dt to 31 December of the trans_dt year
subset_df$trans_end_dt <- as.Date(
  sprintf("%s-12-31", format(subset_df$trans_dt, "%Y")),
  format = "%Y-%m-%d"
)
# rbind both data sets to get the duplicated rows
rbind(df, subset_df)

#  unique_ref trans_type  incept_dt     exp_dt   trans_dt  trans_end_dt
#1      a_2016         NB 2016-07-01 2017-06-30 2016-07-01   2016-09-30
#2      a_2016        MTA 2016-07-01 2017-06-30 2017-01-01   2017-01-31
#3      a_2016        MTA 2016-07-01 2017-06-30 2017-02-01   2017-06-30
#21     a_2016        MTA 2016-07-01 2017-06-30 2016-10-01   2016-12-31