R 如何在给定时间段内基于多个条件删除重复值
标签:r、date、dataframe
我有如下数据:
library(dplyr)
# Example data: one row per observation, identified by Date, Person and Type.
# Built with tibble() because data_frame() is deprecated.
dat <- tibble(
  Date = as.Date(c(
    "2012-08-06", "2012-08-06", "2016-01-01", "2016-12-20", "2017-02-01",
    "2015-02-03", "2014-12-28", "2017-06-06", "2017-08-04", "2017-10-28"
  )),
  Person = c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4),
  Type = c("A", "B", "A", "A", "A", "C", "C", "A", "B", "C")
)
# A tibble: 10 x 4
Date Person Type
<date> <dbl> <chr>
1 2012-08-06 1 A
2 2012-08-06 1 B
3 2016-01-01 2 A
4 2016-12-20 2 A
5 2017-02-01 2 A
6 2015-02-03 3 C
7 2014-12-28 3 C
8 2017-06-06 4 A
9 2017-08-04 4 B
10 2017-10-28 4 C
# A tibble: 10 x 4
Date Person Type Original
<date> <dbl> <chr> <dbl>
1 2012-08-06 1 A 1
2 2012-08-06 1 B 1
3 2016-01-01 2 A 1
4 2016-12-20 2 A 0
5 2017-02-01 2 A 1
6 2015-02-03 3 C 0
7 2014-12-28 3 C 1
8 2017-06-06 4 A 1
9 2017-08-04 4 B 1
10 2017-10-28 4 C 1
更新 我添加了一个检测周期重置的过程,还修改了创建年份索引的过程,使其自动检测您的年份范围,这样您就不必手动设置
library(dplyr)
library(data.table)
library(tis)
# Build the example data.table: one row per observation (Date, Person, Type).
dat <- data.table(Date=as.Date(c("2012-08-06","2012-08-06","2016-01-01","2016-12-20","2017-02-01","2017-03-01","2015-02-03","2014-12-28","2017-06-06","2017-08-04","2017-10-28")),Person=c(1,1,2,2,2,2,3,3,4,4,4),Type=c("A","B","A","A","A","A","C","C","A","B","C"))
# Add a year column (first four characters of the date's text form) to join
# against the year table.
dat$year <- substring(dat$Date,1,4)
# Build a year table covering every year from the earliest to the latest one in
# the data; `number` is the 1-based position of each year in that range.
year_table <- data.table(year=as.character(min(dat$year):max(dat$year)),number=1:length(min(dat$year):max(dat$year)))
# Join the two tables on `year` so every observation carries its year index.
dat <- year_table[dat,on = .(year)]
# Turn `number` into a fractional year position: year index plus the share of
# the year elapsed (day of year divided by 365, or 366 in a leap year).
# NOTE(review): yday() and isLeapYear() are not defined in this snippet -
# presumably they come from data.table and tis; confirm.
dat$number <- dat$number + yday(dat$Date)/(365 + isLeapYear(as.numeric(dat$year)))
# Create the `min` column per Person and Type. Both ifelse() branches evaluate
# to the group's min(number), so every row gets the group minimum.
# The trailing [] makes the updated table print.
dat[,min:=ifelse(number==min(number),number,min(number)),by=list(Person,Type)][]
# Distance, in fractional years, from the group's first occurrence.
dat$diff <- dat$number - dat$min
# round_diff is the number of whole years since the group's first occurrence.
dat[,round_diff := trunc(diff)][] # added in the update
# temp_Original: 1 when diff == 0 (first occurrence), 0 when 0 < diff < 1,
# and 1 otherwise (diff >= 1).
dat[,temp_Original := ifelse(diff==0,1,ifelse(diff>0&diff<1,0,1)),by=list(Person,Type)][]
# temp_Original2: within each (round_diff, Type, Person) group, 0 for rows with
# diff > 1 that are not the group's smallest diff, 1 for everything else.
dat[,temp_Original2 := ifelse(diff>1 & diff>min(diff),0,1),by=list(round_diff,Type,Person)][] # added in the update
# A row is Original only when both intermediate flags are 1.
dat[,Original := temp_Original*temp_Original2] # added in the update
# Show only the columns of interest.
dat[,c("Date","Person","Type","Original")]
library(dplyr)
library(data.table)
library(tis)
# 创建我们的 data.table
更新
我添加了一个检测周期重置的过程,还修改了创建年份索引的过程,使其自动检测您的年份范围,这样您就不必手动设置
library(dplyr)
library(data.table)
library(tis)
# Build the example data.table: one row per observation (Date, Person, Type).
dat <- data.table(Date=as.Date(c("2012-08-06","2012-08-06","2016-01-01","2016-12-20","2017-02-01","2017-03-01","2015-02-03","2014-12-28","2017-06-06","2017-08-04","2017-10-28")),Person=c(1,1,2,2,2,2,3,3,4,4,4),Type=c("A","B","A","A","A","A","C","C","A","B","C"))
# Add a year column (first four characters of the date's text form) to join
# against the year table.
dat$year <- substring(dat$Date,1,4)
# Build a year table covering every year from the earliest to the latest one in
# the data; `number` is the 1-based position of each year in that range.
year_table <- data.table(year=as.character(min(dat$year):max(dat$year)),number=1:length(min(dat$year):max(dat$year)))
# Join the two tables on `year` so every observation carries its year index.
dat <- year_table[dat,on = .(year)]
# Turn `number` into a fractional year position: year index plus the share of
# the year elapsed (day of year divided by 365, or 366 in a leap year).
# NOTE(review): yday() and isLeapYear() are not defined in this snippet -
# presumably they come from data.table and tis; confirm.
dat$number <- dat$number + yday(dat$Date)/(365 + isLeapYear(as.numeric(dat$year)))
# Create the `min` column per Person and Type. Both ifelse() branches evaluate
# to the group's min(number), so every row gets the group minimum.
# The trailing [] makes the updated table print.
dat[,min:=ifelse(number==min(number),number,min(number)),by=list(Person,Type)][]
# Distance, in fractional years, from the group's first occurrence.
dat$diff <- dat$number - dat$min
# round_diff is the number of whole years since the group's first occurrence.
dat[,round_diff := trunc(diff)][] # added in the update
# temp_Original: 1 when diff == 0 (first occurrence), 0 when 0 < diff < 1,
# and 1 otherwise (diff >= 1).
dat[,temp_Original := ifelse(diff==0,1,ifelse(diff>0&diff<1,0,1)),by=list(Person,Type)][]
# temp_Original2: within each (round_diff, Type, Person) group, 0 for rows with
# diff > 1 that are not the group's smallest diff, 1 for everything else.
dat[,temp_Original2 := ifelse(diff>1 & diff>min(diff),0,1),by=list(round_diff,Type,Person)][] # added in the update
# A row is Original only when both intermediate flags are 1.
dat[,Original := temp_Original*temp_Original2] # added in the update
# Show only the columns of interest.
dat[,c("Date","Person","Type","Original")]
library(dplyr)
library(data.table)
library(tis)
# 创建我们的 data.table
这里有一个 dplyr 解决方案。
我们首先定义一个自定义函数:它接收 YYYYMMDD 格式的数值,当某一行与当前基准日期的间隔达到或超过 10000(即满一个日历年,因此闰年也能正确处理)时,将该行标记为新的原始记录,并把它设为新的基准。
然后我们在相关分组(Person 和 Type)中使用它。
library(dplyr)
# Flag the observations that open a new one-year window.
#
# x: numeric vector of dates encoded as YYYYMMDD numbers. The first element is
#    the initial base date. NOTE(review): the scan only makes sense when x is
#    in ascending order; the caller is expected to sort beforehand.
#
# Returns a numeric vector the same length as x: 1 where the element starts a
# new window (the first element, and any element at least 10000 - i.e. one
# calendar year in YYYYMMDD arithmetic - after the current base), 0 otherwise.
# Each element flagged 1 becomes the base for the elements that follow it.
new_year <- function(x) {
  n <- length(x)
  # An empty input yields an empty result, so the output always lines up with
  # the input length.
  if (n == 0) {
    return(numeric(0))
  }
  # Preallocate with 0 ("not new") instead of growing the vector with c().
  is_new <- numeric(n)
  is_new[1] <- 1
  base <- x[1]
  for (i in seq_len(n)[-1]) {
    if (x[i] - base >= 10000) {
      is_new[i] <- 1
      base <- x[i]
    }
  }
  is_new
}
# Flag, for every (Person, Type) combination, the rows that open a new
# one-year window, then join the flags back onto the original data.
dat %>%
  # new_year() scans the dates in order, so sort chronologically first.
  arrange(Date) %>%
  group_by(Person, Type) %>%
  # Encode each Date as the number YYYYMMDD with base format(), which needs no
  # date helper package; a grouped mutate() runs new_year() once per group.
  mutate(Original = new_year(as.numeric(format(Date, "%Y%m%d")))) %>%
  # Natural join on the shared columns (Date, Person, Type).
  right_join(dat)
# Joining, by = c("Date", "Person", "Type")
# # A tibble: 10 x 4
# # Groups: Person, Type [?]
# Date Person Type Original
# <date> <dbl> <chr> <dbl>
# 1 2012-08-06 1 A 1
# 2 2012-08-07 1 B 1
# 3 2016-01-01 2 A 1
# 4 2016-12-20 2 A 0
# 5 2017-02-01 2 A 1
# 6 2015-02-03 3 C 0
# 7 2014-12-28 3 C 1
# 8 2017-06-06 4 A 1
# 9 2017-08-04 4 B 1
# 10 2017-10-28 4 C 1
library(dplyr)
这里有一个 dplyr 解决方案。
我们首先定义一个自定义函数:它接收 YYYYMMDD 格式的数值,当某一行与当前基准日期的间隔达到或超过 10000(即满一个日历年,因此闰年也能正确处理)时,将该行标记为新的原始记录,并把它设为新的基准。
然后我们在相关分组(Person 和 Type)中使用它。
library(dplyr)
# Flag the observations that open a new one-year window.
#
# x: numeric vector of dates encoded as YYYYMMDD numbers. The first element is
#    the initial base date. NOTE(review): the scan only makes sense when x is
#    in ascending order; the caller is expected to sort beforehand.
#
# Returns a numeric vector the same length as x: 1 where the element starts a
# new window (the first element, and any element at least 10000 - i.e. one
# calendar year in YYYYMMDD arithmetic - after the current base), 0 otherwise.
# Each element flagged 1 becomes the base for the elements that follow it.
new_year <- function(x) {
  n <- length(x)
  # An empty input yields an empty result, so the output always lines up with
  # the input length.
  if (n == 0) {
    return(numeric(0))
  }
  # Preallocate with 0 ("not new") instead of growing the vector with c().
  is_new <- numeric(n)
  is_new[1] <- 1
  base <- x[1]
  for (i in seq_len(n)[-1]) {
    if (x[i] - base >= 10000) {
      is_new[i] <- 1
      base <- x[i]
    }
  }
  is_new
}
# Flag, for every (Person, Type) combination, the rows that open a new
# one-year window, then join the flags back onto the original data.
dat %>%
  # new_year() scans the dates in order, so sort chronologically first.
  arrange(Date) %>%
  group_by(Person, Type) %>%
  # Encode each Date as the number YYYYMMDD with base format(), which needs no
  # date helper package; a grouped mutate() runs new_year() once per group.
  mutate(Original = new_year(as.numeric(format(Date, "%Y%m%d")))) %>%
  # Natural join on the shared columns (Date, Person, Type).
  right_join(dat)
# Joining, by = c("Date", "Person", "Type")
# # A tibble: 10 x 4
# # Groups: Person, Type [?]
# Date Person Type Original
# <date> <dbl> <chr> <dbl>
# 1 2012-08-06 1 A 1
# 2 2012-08-07 1 B 1
# 3 2016-01-01 2 A 1
# 4 2016-12-20 2 A 0
# 5 2017-02-01 2 A 1
# 6 2015-02-03 3 C 0
# 7 2014-12-28 3 C 1
# 8 2017-06-06 4 A 1
# 9 2017-08-04 4 B 1
# 10 2017-10-28 4 C 1
library(dplyr)
一个使用 data.table 包的可能替代方案:
# load the 'data.table' package
library(data.table)
# Convert 'dat' to a 'data.table' by reference and key it on 'Person' & 'Date'
# (which reorders the data according to 'Person' & 'Date').
setDT(dat, key = c('Person','Date'))
# The chain below runs three steps on 'dat':
#  1. Per Person, outer(Date, Date, '-') builds the matrix of pairwise date
#     differences; rowSums(abs(.) > 365) counts, for each row, how many of
#     that person's dates lie more than 365 days away, and the running total
#     (cumsum) of those counts is stored as the helper column 'newgroup'.
#  2. Per Person & newgroup, 'original' is 1 for the first occurrence of each
#     Type and 0 for repeats; the unary + turns the logical into an integer.
#  3. The helper column is dropped; the trailing [] prints the result.
# NOTE(review): 'newgroup' only advances on rows that have at least one date
# more than 365 days away - verify that this matches the intended rolling
# one-year window on data beyond the example.
dat[, newgroup := cumsum(rowSums(abs(outer(Date, Date, '-')) > 365)), by = Person
][, original := +(!duplicated(Type)), by = .(Person, newgroup)
][, newgroup := NULL][]
其中:
一个使用 data.table 包的可能替代方案:
# load the 'data.table' package
library(data.table)
# Convert 'dat' to a 'data.table' by reference and key it on 'Person' & 'Date'
# (which reorders the data according to 'Person' & 'Date').
setDT(dat, key = c('Person','Date'))
# The chain below runs three steps on 'dat':
#  1. Per Person, outer(Date, Date, '-') builds the matrix of pairwise date
#     differences; rowSums(abs(.) > 365) counts, for each row, how many of
#     that person's dates lie more than 365 days away, and the running total
#     (cumsum) of those counts is stored as the helper column 'newgroup'.
#  2. Per Person & newgroup, 'original' is 1 for the first occurrence of each
#     Type and 0 for repeats; the unary + turns the logical into an integer.
#  3. The helper column is dropped; the trailing [] prints the result.
# NOTE(review): 'newgroup' only advances on rows that have at least one date
# more than 365 days away - verify that this matches the intended rolling
# one-year window on data beyond the example.
dat[, newgroup := cumsum(rowSums(abs(outer(Date, Date, '-')) > 365)), by = Person
][, original := +(!duplicated(Type)), by = .(Person, newgroup)
][, newgroup := NULL][]
其中:
这里是另一个只使用dplyr的解决方案。我按个人和类型获取原始日期,将其加入现有数据集,然后检查日期是否在原始日期的一年内,而不是等于其本身
更新-我更改了dat3步骤以回答下面的评论。我为重置日期添加了一列,并更改了datediff变量的逻辑
library(dplyr)
# Example data: one row per observation, identified by Date, Person and Type.
# Built with tibble() because data_frame() is deprecated.
dat <- tibble(
  Date = as.Date(c(
    "2012-08-06", "2012-08-06", "2016-01-01", "2016-12-20",
    "2017-02-01", "2015-02-03", "2014-12-28",
    "2017-06-06", "2017-08-04", "2017-10-28"
  )),
  Person = c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4),
  Type = c("A", "B", "A", "A", "A", "C", "C", "A", "B", "C")
)
# dat2: one row per (Person, Type) holding the date of its first occurrence.
# Rows are sorted by Date (then Person), so the first row kept in each
# (Person, Type) group is the earliest one; its Date is copied to FirstDate
# and the Date column is then dropped.
dat2 <- dat %>%
arrange(Date, Person) %>%
group_by(Person, Type) %>%
filter(row_number() ==1) %>%
mutate(FirstDate = Date) %>%
select(-Date)
# dat3: attach FirstDate to every observation (join on Person and Type), then
# derive three columns:
#   Original  - 1 when Date equals FirstDate; otherwise 0 when
#               Date - FirstDate <= 365 and 1 when it is larger.
#   datediff  - 0 when Date - FirstDate >= 365, otherwise that difference
#               (ifelse() returns it as a plain number).
#   ResetDate - Date when the row is more than 365 days after FirstDate and
#               Original == 1, otherwise FirstDate; ifelse() drops the Date
#               class, hence the as.Date(..., origin = '1970-01-01') wrapper.
# NOTE(review): Original treats exactly 365 days as inside the window (<= 365)
# while datediff resets at exactly 365 days (>= 365) - confirm which boundary
# is intended.
dat3 <- left_join(dat, dat2, by = c("Person"= "Person", "Type" = "Type")) %>%
mutate(Original = ifelse(Date == FirstDate, 1,
ifelse(Date - FirstDate <= 365, 0 ,1)),
datediff = ifelse(Date - FirstDate >= 365, 0, Date - FirstDate),
ResetDate = as.Date(ifelse(Date - FirstDate > 365 & Original == 1, Date,
FirstDate), origin = '1970-01-01'))
库(dplyr)
dat%
分组依据(个人、类型)%>%
过滤器(行号()==1)%>%
变异(FirstDate=Date)%>%
选择(-Date)
dat3%
变异(原始=ifelse(日期=FirstDate,1,
ifelse(Date-FirstDate=365,0,Date-FirstDate),
ResetDate=as.Date(如果其他(Date-FirstDate>365&原始==1,日期,
第一个日期),原产地='1970-01-01'))
这里是另一个仅使用dplyr的解决方案。我按个人和类型获取原始日期,将其加入现有数据集,然后检查该日期是否在原始日期的一年之内,而不是等于其本身
更新-我更改了dat3步骤以回答下面的注释。我在重置日期中添加了一列,并更改了datediff变量的逻辑
library(dplyr)
# Example data: one row per observation, identified by Date, Person and Type.
# Built with tibble() because data_frame() is deprecated.
dat <- tibble(
  Date = as.Date(c(
    "2012-08-06", "2012-08-06", "2016-01-01", "2016-12-20",
    "2017-02-01", "2015-02-03", "2014-12-28",
    "2017-06-06", "2017-08-04", "2017-10-28"
  )),
  Person = c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4),
  Type = c("A", "B", "A", "A", "A", "C", "C", "A", "B", "C")
)
# dat2: one row per (Person, Type) holding the date of its first occurrence.
# Rows are sorted by Date (then Person), so the first row kept in each
# (Person, Type) group is the earliest one; its Date is copied to FirstDate
# and the Date column is then dropped.
dat2 <- dat %>%
arrange(Date, Person) %>%
group_by(Person, Type) %>%
filter(row_number() ==1) %>%
mutate(FirstDate = Date) %>%
select(-Date)
# dat3: attach FirstDate to every observation (join on Person and Type), then
# derive three columns:
#   Original  - 1 when Date equals FirstDate; otherwise 0 when
#               Date - FirstDate <= 365 and 1 when it is larger.
#   datediff  - 0 when Date - FirstDate >= 365, otherwise that difference
#               (ifelse() returns it as a plain number).
#   ResetDate - Date when the row is more than 365 days after FirstDate and
#               Original == 1, otherwise FirstDate; ifelse() drops the Date
#               class, hence the as.Date(..., origin = '1970-01-01') wrapper.
# NOTE(review): Original treats exactly 365 days as inside the window (<= 365)
# while datediff resets at exactly 365 days (>= 365) - confirm which boundary
# is intended.
dat3 <- left_join(dat, dat2, by = c("Person"= "Person", "Type" = "Type")) %>%
mutate(Original = ifelse(Date == FirstDate, 1,
ifelse(Date - FirstDate <= 365, 0 ,1)),
datediff = ifelse(Date - FirstDate >= 365, 0, Date - FirstDate),
ResetDate = as.Date(ifelse(Date - FirstDate > 365 & Original == 1, Date,
FirstDate), origin = '1970-01-01'))
库(dplyr)
dat%
分组依据(个人、类型)%>%
过滤器(行号()==1)%>%
变异(FirstDate=Date)%>%
选择(-Date)
dat3%
变异(原始=ifelse(日期=FirstDate,1,
ifelse(Date-FirstDate=365,0,Date-FirstDate),
ResetDate=as.Date(如果其他(Date-FirstDate>365&原始==1,日期,
第一个日期),原产地='1970-01-01'))
这不是正确的解决方案,因为它没有考虑时间。再说明一次:如果一条观测在 `Person` 和 `Type` 组合的原始实例之后的一年内,按 `Person` 和 `Type` 是唯一的,它才被视为 `Original`。如果需要进一步澄清,请告诉我。
我非常感谢您的努力,但这仍然不能满足我的需要,因为它只适用于这个特定场景。理想情况下,我不必手动指定年份,因为年份可能会变化——不过这一点相对次要,因为我可以把年份设为一个足够大的合理范围。更大的问题是,这种方法假设在最初的 365 天周期重置之后,`Person` 和 `Type` 组合不会再次重复。例如,如果 Person 2 在日期 2017-03-01 有 Type A,您的方法会把它标记为原始实例,但它不应该是,因为新周期从 2017-02-01 开始。
@costebk08 你能试试这个吗?如果它还不行,我就无法再帮您了。
这个可行,谢谢您付出的所有努力。我发布了一个赏金,看看是否有更高效的解决方案。
这不是正确的解决方案,因为它没有考虑时间。再说明一次:如果一条观测在 `Person` 和 `Type` 组合的原始实例之后的一年内,按 `Person` 和 `Type` 是唯一的,它才被视为 `Original`。如果需要进一步澄清,请告诉我。
我非常感谢您的努力,但这仍然不能满足我的需要,因为它只适用于这个特定场景。理想情况下,我不必手动指定年份,因为年份可能会变化——不过这一点相对次要,因为我可以把年份设为一个足够大的合理范围。更大的问题是,这种方法假设在最初的 365 天周期重置之后,`Person` 和 `Type` 组合不会再次重复。例如,如果 Person 2 在日期 2017-03-01 有 Type A,您的方法会把它标记为原始实例,但它不应该是,因为新周期从 2017-02-01 开始。
@costebk08 您可以试一下这个吗?如果它还不行,很抱歉,我无法再帮您了。
这个可行,谢谢您付出的所有努力。我正在发布一份赏金,看看是否有更高效的解决方案。
您是想在日期列上检查,还是按日期所在的年份检查?
您能告诉我们您的原始数据框是否很大吗?
@OrhanYazar 数据是否很大与这个问题无关。