R:根据日期和 ID 按多个条件合并 2 个数据帧

R:使用日期和日期按多个条件合并2个数据帧;身份证件,r,merge,dataframe,lookup,R,Merge,Dataframe,Lookup,我正在尝试使用多个条件合并2个数据帧,并使用了merge命令,但无法获得成功的输出 #Data Frame df1# ID<- c("A1", "A2","A3", "A4") Location <- c("012A","234B","012A","238C" ) startdate <- as.Date(c("2014-11-01","2014-01-01","2015-10-01", "2015-01-01")) enddate <- as.Date(c("2014-

我正在尝试使用多个条件合并2个数据帧,并使用了merge命令,但无法获得成功的输出

# ---- df1: four records, each with an ID, a Location and a date window ----
ID        <- c("A1", "A2", "A3", "A4")
Location  <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01"))
enddate   <- as.Date(c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31"))
df1 <- data.frame(
  ID = ID,
  Location = Location,
  startdate = startdate,
  enddate = enddate
)

# ---- df2: three records keyed by ID ----
# Note: this reassigns the ID vector that was used to build df1 above.
ID        <- c("A1", "A1", "A4")
N         <- c(2, 1, 2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt       <- c("2200", "1000", "500")
df2 <- data.frame(
  ID = ID,
  N = N,
  Loss_Date = Loss_Date,
  Amt = Amt
)
我使用 `dplyr` 包完成了合并,它非常快速且易于使用。

您应该在创建数据帧时加上参数 `stringsAsFactors=F`:

 # Build both frames with stringsAsFactors = FALSE so the string columns stay
 # character vectors. FALSE is spelled out because the shorthand F is an
 # ordinary variable that can be reassigned.
 df1 <- data.frame(ID, Location, startdate, enddate, stringsAsFactors = FALSE)
 df2 <- data.frame(ID, N, Loss_Date, Amt, stringsAsFactors = FALSE)
另外,正如评论中指出的,如果要保留不满足条件的行,则应改用另一个连接函数:

library(dplyr)

# Left-join df2 onto df1 by ID, keeping only the df2 values whose Loss_Date
# lies inside [startdate, enddate] (both ends inclusive).
# Every df1 record survives: when none of its df2 rows fall inside the window
# (or it has no df2 rows at all), exactly one row is kept for it with
# N = 0, Loss_Date = NA and Amt = 0.
output2 <- df1 %>%
  # tag each df1 record before the join can fan it out into several rows
  mutate(row_id = row_number()) %>%
  left_join(df2, by = "ID") %>%
  mutate(
    # %in% TRUE maps the NA produced by a missing date to FALSE
    condition = (Loss_Date >= startdate & Loss_Date <= enddate) %in% TRUE,
    N = replace(N, !condition, 0),
    # replace() keeps the Date class, so no as.Date(..., origin =) round-trip
    Loss_Date = replace(Loss_Date, !condition, NA),
    Amt = replace(Amt, !condition, 0)
  ) %>%
  group_by(row_id) %>%
  # keep in-window rows; otherwise keep one placeholder row for the record
  filter(condition | (!any(condition) & row_number() == 1L)) %>%
  ungroup() %>%
  select(-condition, -row_id) %>%
  # return a plain data.frame, as the ungrouped original pipeline did
  as.data.frame()

我刚刚在@VincentBoned的回答中添加了一点额外的代码

# Create the 1st data frame: one row per ID with a Location and a
# startdate/enddate window.
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31"))

# FALSE is spelled out: the shorthand F is a reassignable variable.
df1 <- data.frame(ID, Location, startdate, enddate, stringsAsFactors = FALSE)


# Create the 2nd data frame: rows keyed by ID with N, Loss_Date and Amt.
# Note: this reassigns the ID vector that was used to build df1 above.
ID <- c("A1", "A1", "A4")
N <- c(2, 1, 2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt <- c("2200", "1000", "500")

df2 <- data.frame(ID, N, Loss_Date, Amt, stringsAsFactors = FALSE)


library(dplyr)

# Attach df2 to df1 by ID, keeping df2 values only where Loss_Date lies inside
# [startdate, enddate]. A left join is used so that IDs present only in df2 do
# not add rows. Each df1 record is tagged first so that a record whose df2 rows
# are all outside the window still yields one row (N = 0, Loss_Date = NA,
# Amt = 0) instead of disappearing.
df1 %>%
  mutate(row_id = row_number()) %>%             # tag each df1 record
  left_join(df2, by = "ID") %>%
  # %in% TRUE maps the NA produced by a missing date to FALSE
  mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate) %in% TRUE) %>%
  mutate(N = replace(N, !condition, 0)) %>%
  mutate(Loss_Date = replace(Loss_Date, !condition, NA)) %>%  # keeps Date class
  mutate(Amt = replace(Amt, !condition, 0)) %>%
  group_by(row_id) %>%                          # for each df1 record
  # keep in-window rows; otherwise keep one placeholder row for the record
  filter(condition | (!any(condition) & row_number() == 1L)) %>%
  ungroup() %>%
  select(-condition, -row_id)

#   ID Location  startdate    enddate N  Loss_Date  Amt 
# 1 A1     012A 2014-11-01 2014-12-31 2 2014-11-15 2200 
# 2 A2     234B 2014-01-01 2014-08-31 0       <NA>    0 
# 3 A3     012A 2015-10-01 2015-12-31 0       <NA>    0 
# 4 A4     238C 2015-01-01 2015-12-31 2 2015-11-30  500 
#创建第一个数据帧
ID 1&is.na(丢失日期))%>%#筛选出ID超过1行且这些行不匹配的行
选择(-Nrows)
#ID位置开始日期结束日期N损失日期金额
#A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
#2 A2 234B 2014-01-01 2014-08-31 0 0
#3 A3 012A 2015-10-01 2015-12-31 0 0
#4 A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500
如果您了解上述代码的工作原理(一步一步),则可以使用更紧凑的版本,返回相同的结果:

# Compact form of the conditional join: df2 values are attached to df1 by ID
# only where Loss_Date lies inside [startdate, enddate]. Each df1 record keeps
# at least one row (N = 0, Loss_Date = NA, Amt = 0 when nothing matches), and
# the result is ungrouped before it is returned.
df1 %>%
  mutate(row_id = row_number()) %>%   # tag each df1 record before the join
  left_join(df2, by = "ID") %>%
  mutate(
    # %in% TRUE maps the NA produced by a missing date to FALSE
    condition = (Loss_Date >= startdate & Loss_Date <= enddate) %in% TRUE,
    N = replace(N, !condition, 0),
    Loss_Date = replace(Loss_Date, !condition, NA),   # keeps the Date class
    Amt = replace(Amt, !condition, 0)
  ) %>%
  group_by(row_id) %>%
  # keep in-window rows; otherwise keep one placeholder row for the record
  filter(condition | (!any(condition) & row_number() == 1L)) %>%
  ungroup() %>%
  select(-c(condition, row_id))
full_join(df1,df2,by=“ID”)%>%
变异(条件=(损失日期>=开始日期和损失日期%)
分组依据(ID)%>%
突变(Nrows=n())%>%
过滤器(!(Nrows>1和is.na(丢失日期))%>%
选择(-c(条件,Nrows))

sqldf非常健壮且易于阅读。请查看以下代码:

# Conditional left join written in SQL: every df1 row (alias L) is kept, and
# N / Loss_Date / Amt from df2 (alias r) are attached only where the IDs are
# equal and r.Loss_Date lies between L.startdate and L.enddate (BETWEEN is
# inclusive). df1 rows without such a match get missing values in the r.*
# columns. The result is ordered by ID.
library(sqldf)
Output<-sqldf("
           SELECT L.*, r.N, r.Loss_Date, r.Amt
           FROM df1 as L
           LEFT JOIN df2 as r
           ON 
           L.ID=r.ID AND
              r.Loss_Date BETWEEN L.startdate AND L.enddate
           ORDER BY L.ID")
库(sqldf)

在 data.table 当前的开发版本(v1.9.7)中,已经实现了非等值连接(non-equi join)。利用这一点,我们可以:

# Non-equi join: for each row of df1, look up the df2 rows with the same ID
# whose Loss_Date lies in [startdate, enddate]. df1 rows without a match are
# kept, with NA in N, x.Loss_Date and Amt (see the output below).
# library() is used instead of require() so a missing package stops the script
# instead of returning FALSE and failing later.
library(data.table) # v1.9.7+
# setDT() converts df2 to a data.table by reference, i.e. df2 itself changes.
# The x. prefix selects Loss_Date from df2 (the table being subset) rather
# than the join-key value.
setDT(df2)[df1, .(ID, Location, startdate, enddate, N, x.Loss_Date, Amt),
           on = .(ID, Loss_Date >= startdate, Loss_Date <= enddate)]
#    ID Location  startdate    enddate  N x.Loss_Date  Amt
# 1: A1     012A 2014-11-01 2014-12-31  2  2014-11-15 2200
# 2: A2     234B 2014-01-01 2014-08-31 NA        <NA>   NA
# 3: A3     012A 2015-10-01 2015-12-31 NA        <NA>   NA
# 4: A4     238C 2015-01-01 2015-12-31  2  2015-11-30  500
require(data.table)#v1.9.7+
setDT(df2)[df1,(ID,位置,起始日期,结束日期,N,x.损失日期,金额),

on=.(ID, Loss_Date>=startdate, Loss_Date<=enddate)]

评论:

您的第一个问题是:`df1` 中的 `Location` 列与 `df2` 中的 `ID` 列相匹配。这是笔误,还是您拿到的数据本来就是这样?

谢谢您指出这一点,我已经修正了这个问题。

也许可以用 `mutate` 代替 `filter`,返回 `NA`、`0`,或根据条件返回实际值,以匹配 @Adi 期望的输出?

当然,如果这是他期望的输出,您是对的,应该使用 `mutate` 而不是 `filter`。我已经对答案做了编辑,但我不知道有没有更高效(且更简洁)的方法。

我想你返回的值正好反了。

不,他并不真的需要它。是的,他已经把列改成了 ID。我现在正在做一个新版本。

你可以看到第二行返回了 `N` 和 `Amt` 的值。如果你想得到与我的建议相反的值,就必须这样写:
mutate(N=ifelse(condition&!is.na(condition),N,0))

在 `select(-condition)` 之前,我只是加了下面这条命令,看起来效果很好。感谢您的回答,它帮助我真正理解了这个问题。
filter(condition==TRUE | is.na(condition)==TRUE) %>%

虽然这些解决方案有效,但我的原始数据集仍然存在一个问题,很遗憾我无法分享该数据。我来解释一下:我希望同时应用这两个条件,因为如果不这样做,仅按 ID 与 df1 匹配时,df2 中的一行可能会被多次匹配;而随后再用值为 FALSE 的指示列进行过滤时,又会把 df1 中的原始行也删掉。我只想把 df2 中同时满足 ID 和日期这两个条件的行映射到 df1,其余结果可以是 NA 或 0。

您能否创建一个示例,让我对其应用这一过程并重现该问题?我看不出这个过程会如何影响原始数据帧。

因此,如果没有匹配项,就不应在 df1 中产生重复记录。
# Create the 1st data frame: one row per ID with a Location and a
# startdate/enddate window.
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31"))

# FALSE is spelled out: the shorthand F is a reassignable variable.
df1 <- data.frame(ID, Location, startdate, enddate, stringsAsFactors = FALSE)


# Create the 2nd data frame: rows keyed by ID with N, Loss_Date and Amt.
# Note: this reassigns the ID vector that was used to build df1 above.
ID <- c("A1", "A1", "A4")
N <- c(2, 1, 2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt <- c("2200", "1000", "500")

df2 <- data.frame(ID, N, Loss_Date, Amt, stringsAsFactors = FALSE)


library(dplyr)

# Attach df2 to df1 by ID, keeping df2 values only where Loss_Date lies inside
# [startdate, enddate]. A left join is used so that IDs present only in df2 do
# not add rows. Each df1 record is tagged first so that a record whose df2 rows
# are all outside the window still yields one row (N = 0, Loss_Date = NA,
# Amt = 0) instead of disappearing.
df1 %>%
  mutate(row_id = row_number()) %>%             # tag each df1 record
  left_join(df2, by = "ID") %>%
  # %in% TRUE maps the NA produced by a missing date to FALSE
  mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate) %in% TRUE) %>%
  mutate(N = replace(N, !condition, 0)) %>%
  mutate(Loss_Date = replace(Loss_Date, !condition, NA)) %>%  # keeps Date class
  mutate(Amt = replace(Amt, !condition, 0)) %>%
  group_by(row_id) %>%                          # for each df1 record
  # keep in-window rows; otherwise keep one placeholder row for the record
  filter(condition | (!any(condition) & row_number() == 1L)) %>%
  ungroup() %>%
  select(-condition, -row_id)

#   ID Location  startdate    enddate N  Loss_Date  Amt 
# 1 A1     012A 2014-11-01 2014-12-31 2 2014-11-15 2200 
# 2 A2     234B 2014-01-01 2014-08-31 0       <NA>    0 
# 3 A3     012A 2015-10-01 2015-12-31 0       <NA>    0 
# 4 A4     238C 2015-01-01 2015-12-31 2 2015-11-30  500 
# Compact form of the conditional join: df2 values are attached to df1 by ID
# only where Loss_Date lies inside [startdate, enddate]. Each df1 record keeps
# at least one row (N = 0, Loss_Date = NA, Amt = 0 when nothing matches), and
# the result is ungrouped before it is returned.
df1 %>%
  mutate(row_id = row_number()) %>%   # tag each df1 record before the join
  left_join(df2, by = "ID") %>%
  mutate(
    # %in% TRUE maps the NA produced by a missing date to FALSE
    condition = (Loss_Date >= startdate & Loss_Date <= enddate) %in% TRUE,
    N = replace(N, !condition, 0),
    Loss_Date = replace(Loss_Date, !condition, NA),   # keeps the Date class
    Amt = replace(Amt, !condition, 0)
  ) %>%
  group_by(row_id) %>%
  # keep in-window rows; otherwise keep one placeholder row for the record
  filter(condition | (!any(condition) & row_number() == 1L)) %>%
  ungroup() %>%
  select(-c(condition, row_id))
# Conditional left join written in SQL: every df1 row (alias L) is kept, and
# N / Loss_Date / Amt from df2 (alias r) are attached only where the IDs are
# equal and r.Loss_Date lies between L.startdate and L.enddate (BETWEEN is
# inclusive). df1 rows without such a match get missing values in the r.*
# columns. The result is ordered by ID.
library(sqldf)
Output<-sqldf("
           SELECT L.*, r.N, r.Loss_Date, r.Amt
           FROM df1 as L
           LEFT JOIN df2 as r
           ON 
           L.ID=r.ID AND
              r.Loss_Date BETWEEN L.startdate AND L.enddate
           ORDER BY L.ID")
# Non-equi join: for each row of df1, look up the df2 rows with the same ID
# whose Loss_Date lies in [startdate, enddate]. df1 rows without a match are
# kept, with NA in N, x.Loss_Date and Amt (see the output below).
# library() is used instead of require() so a missing package stops the script
# instead of returning FALSE and failing later.
library(data.table) # v1.9.7+
# setDT() converts df2 to a data.table by reference, i.e. df2 itself changes.
# The x. prefix selects Loss_Date from df2 (the table being subset) rather
# than the join-key value.
setDT(df2)[df1, .(ID, Location, startdate, enddate, N, x.Loss_Date, Amt),
           on = .(ID, Loss_Date >= startdate, Loss_Date <= enddate)]
#    ID Location  startdate    enddate  N x.Loss_Date  Amt
# 1: A1     012A 2014-11-01 2014-12-31  2  2014-11-15 2200
# 2: A2     234B 2014-01-01 2014-08-31 NA        <NA>   NA
# 3: A3     012A 2015-10-01 2015-12-31 NA        <NA>   NA
# 4: A4     238C 2015-01-01 2015-12-31  2  2015-11-30  500