R 如果两个字段跨年,则基于行重复

R 如果两个字段跨年,则基于行重复,r,dplyr,R,Dplyr,我有以下数据结构: df <- data.frame('unique_ref' = c("a_2016","a_2016","a_2016"), 'trans_type' = c('NB','MTA','MTA'), 'incept_dt' = c('01/07/2016','01/07/2016','01/07/2016'), 'exp_dt' = c('30/06/2017','30/

我有以下数据结构:

# Example data: one policy reference ("a_2016") with three transactions.
# Every *_dt column is typed in as dd/mm/yyyy text.
df <- data.frame(
  unique_ref = c("a_2016", "a_2016", "a_2016"),
  trans_type = c("NB", "MTA", "MTA"),
  incept_dt = c("01/07/2016", "01/07/2016", "01/07/2016"),
  exp_dt = c("30/06/2017", "30/06/2017", "30/06/2017"),
  trans_dt = c("01/07/2016", "01/10/2016", "01/02/2017"),
  trans_end_dt = c("30/09/2016", "31/01/2017", "30/06/2017")
)

# Convert every column whose name ends in "_dt" from text to Date.
# Base R is used on purpose so the example runs without loading any package
# (the previous mutate_at() call is superseded and needed dplyr attached).
date_cols <- endsWith(names(df), "_dt")
df[date_cols] <- lapply(df[date_cols], as.Date, format = "%d/%m/%Y")

> df
  unique_ref trans_type  incept_dt     exp_dt   trans_dt trans_end_dt
1     a_2016         NB 2016-07-01 2017-06-30 2016-07-01   2016-09-30
2     a_2016        MTA 2016-07-01 2017-06-30 2016-10-01   2017-01-31
3     a_2016        MTA 2016-07-01 2017-06-30 2017-02-01   2017-06-30
第2行的trans_dt和trans_end_dt跨了年。因此,需要做的是:将第2行的trans_end_dt更新为2016年的最后一天(2016-12-31),然后新建一行,其余字段与原第2行完全相同,唯一的变化是trans_dt为2017年的第一天(2017-01-01),而trans_end_dt保持原第2行的trans_end_dt(2017-01-31)

原来的第3行的trans_dt和trans_end_dt在同一年内,没有跨年,因此没有问题,无需任何处理

我的实际数据集有许多“唯一的参考”,因此如果可能,这将需要以自动方式完成


最好是dplyr解决方案,但任何方案我都接受,因为我自己实在想不出来

一种可行的方案(同时用到tidyr):

# Split every row whose trans_dt and trans_end_dt fall in different calendar
# years into one row per calendar year. Works for any number of year
# boundaries (e.g. 2016 -> 2018 yields three rows), not just one.
# Needs dplyr (>= 1.0.0, for across()) and tidyr attached.
df %>%
  # Parse the two transaction dates by name rather than by column position.
  mutate(
    across(c(trans_dt, trans_end_dt), ~ as.Date(.x, format = "%d/%m/%Y"))
  ) %>%
  # Number of calendar years each row touches. Rows with a missing date or
  # with trans_end_dt before trans_dt count as 1 so they are kept once.
  mutate(
    n_years = coalesce(
      pmax(
        as.integer(format(trans_end_dt, "%Y")) -
          as.integer(format(trans_dt, "%Y")) + 1L,
        1L
      ),
      1L
    )
  ) %>%
  # One copy per year; `part` numbers the copies of each original row 1..n.
  uncount(n_years, .id = "part") %>%
  mutate(
    # Calendar year covered by this copy. Must be computed before trans_dt
    # is overwritten below.
    yr = as.integer(format(trans_dt, "%Y")) + part - 1L,
    # Clamp each copy to its own year; na.rm keeps an existing date when the
    # year bound is NA.
    trans_dt = pmax(
      trans_dt,
      as.Date(sprintf("%d-01-01", yr), format = "%Y-%m-%d"),
      na.rm = TRUE
    ),
    trans_end_dt = pmin(
      trans_end_dt,
      as.Date(sprintf("%d-12-31", yr), format = "%Y-%m-%d"),
      na.rm = TRUE
    )
  ) %>%
  select(-part, -yr) %>%
  as_tibble()

  unique_ref trans_type incept_dt  exp_dt     trans_dt   trans_end_dt
  <chr>      <chr>      <chr>      <chr>      <date>     <date>      
1 a_2016     NB         01/07/2016 30/06/2017 2016-07-01 2016-09-30  
2 a_2016     MTA        01/07/2016 30/06/2017 2016-10-01 2016-12-31  
3 a_2016     MTA        01/07/2016 30/06/2017 2017-01-01 2017-01-31  
4 a_2016     MTA        01/07/2016 30/06/2017 2017-02-01 2017-06-30  
df %>%
 mutate_at(vars(5:6), ~ as.Date(., format = "%d/%m/%Y")) %>%
 rowid_to_column() %>%
 uncount((format(trans_dt, "%Y") != format(trans_end_dt, "%Y")) + 1) %>%
 group_by(rowid) %>%
 mutate(trans_end_dt = if_else(row_number() == 1 & n() != 1, 
                       as.Date(paste0(format(trans_dt, "%Y"),"-12-31")), 
                       trans_end_dt),
        trans_dt = if_else(row_number() == 2 & n() != 1, 
                           as.Date(paste0(format(trans_end_dt, "%Y"),"-01-01")), 
                           trans_dt)) %>%
 ungroup() %>%
 select(-rowid)

  unique_ref trans_type incept_dt  exp_dt     trans_dt   trans_end_dt
  <chr>      <chr>      <chr>      <chr>      <date>     <date>      
1 a_2016     NB         01/07/2016 30/06/2017 2016-07-01 2016-09-30  
2 a_2016     MTA        01/07/2016 30/06/2017 2016-10-01 2016-12-31  
3 a_2016     MTA        01/07/2016 30/06/2017 2017-01-01 2017-01-31  
4 a_2016     MTA        01/07/2016 30/06/2017 2017-02-01 2017-06-30  

这只使用了base R,但我认为这样更容易

# Split every transaction that crosses a calendar-year boundary into one row
# per calendar year, so that no returned row has trans_dt and trans_end_dt in
# different years. Rows already inside a single year come back unchanged and
# the original row order is kept.
#
# df must contain Date columns trans_dt and trans_end_dt. df itself is not
# modified (so the snippet can be re-run safely); a new data frame with
# default row names is returned.
split_rows_by_year <- function(df) {
  start_year <- as.integer(format(df$trans_dt, "%Y"))
  end_year <- as.integer(format(df$trans_end_dt, "%Y"))

  # Number of calendar years each row touches. Rows with a missing date, or
  # with trans_end_dt before trans_dt, are kept as a single untouched row.
  n_years <- end_year - start_year + 1L
  n_years[is.na(n_years) | n_years < 1L] <- 1L

  # Repeat each row once per year; copies of a row stay next to each other.
  row_idx <- rep(seq_len(nrow(df)), times = n_years)
  out <- df[row_idx, , drop = FALSE]
  rownames(out) <- NULL

  # Calendar year covered by each copy: start year, start year + 1, ...
  # sprintf() returns character(0) for zero rows, unlike paste0(), which
  # would produce the unparseable string "-01-01".
  piece_year <- start_year[row_idx] + sequence(n_years) - 1L
  year_first <- as.Date(sprintf("%d-01-01", piece_year), format = "%Y-%m-%d")
  year_last <- as.Date(sprintf("%d-12-31", piece_year), format = "%Y-%m-%d")

  # Clamp each copy to its own year. na.rm = TRUE keeps an existing date
  # when the year bound is NA (row with a missing trans_dt).
  out$trans_dt <- pmax(out$trans_dt, year_first, na.rm = TRUE)
  out$trans_end_dt <- pmin(out$trans_end_dt, year_last, na.rm = TRUE)
  out
}

split_rows_by_year(df)

#  unique_ref trans_type  incept_dt     exp_dt   trans_dt trans_end_dt
#1     a_2016         NB 2016-07-01 2017-06-30 2016-07-01   2016-09-30
#2     a_2016        MTA 2016-07-01 2017-06-30 2016-10-01   2016-12-31
#3     a_2016        MTA 2016-07-01 2017-06-30 2017-01-01   2017-01-31
#4     a_2016        MTA 2016-07-01 2017-06-30 2017-02-01   2017-06-30
#查找trans_dt和trans_end_dt年份不同的行

评论:我不认为这是有效的,因为结果中仍然有trans_dt和trans_end_dt跨年份的行;新增行的trans_dt也需要相应更新。
@user33484 好的,我漏掉了这一点,现已更新。
# Split every transaction that crosses a calendar-year boundary into one row
# per calendar year, so that no returned row has trans_dt and trans_end_dt in
# different years. Rows already inside a single year come back unchanged and
# the original row order is kept.
#
# df must contain Date columns trans_dt and trans_end_dt. df itself is not
# modified (so the snippet can be re-run safely); a new data frame with
# default row names is returned.
split_rows_by_year <- function(df) {
  start_year <- as.integer(format(df$trans_dt, "%Y"))
  end_year <- as.integer(format(df$trans_end_dt, "%Y"))

  # Number of calendar years each row touches. Rows with a missing date, or
  # with trans_end_dt before trans_dt, are kept as a single untouched row.
  n_years <- end_year - start_year + 1L
  n_years[is.na(n_years) | n_years < 1L] <- 1L

  # Repeat each row once per year; copies of a row stay next to each other.
  row_idx <- rep(seq_len(nrow(df)), times = n_years)
  out <- df[row_idx, , drop = FALSE]
  rownames(out) <- NULL

  # Calendar year covered by each copy: start year, start year + 1, ...
  # sprintf() returns character(0) for zero rows, unlike paste0(), which
  # would produce the unparseable string "-01-01".
  piece_year <- start_year[row_idx] + sequence(n_years) - 1L
  year_first <- as.Date(sprintf("%d-01-01", piece_year), format = "%Y-%m-%d")
  year_last <- as.Date(sprintf("%d-12-31", piece_year), format = "%Y-%m-%d")

  # Clamp each copy to its own year. na.rm = TRUE keeps an existing date
  # when the year bound is NA (row with a missing trans_dt).
  out$trans_dt <- pmax(out$trans_dt, year_first, na.rm = TRUE)
  out$trans_end_dt <- pmin(out$trans_end_dt, year_last, na.rm = TRUE)
  out
}

split_rows_by_year(df)

#  unique_ref trans_type  incept_dt     exp_dt   trans_dt trans_end_dt
#1     a_2016         NB 2016-07-01 2017-06-30 2016-07-01   2016-09-30
#2     a_2016        MTA 2016-07-01 2017-06-30 2016-10-01   2016-12-31
#3     a_2016        MTA 2016-07-01 2017-06-30 2017-01-01   2017-01-31
#4     a_2016        MTA 2016-07-01 2017-06-30 2017-02-01   2017-06-30