R 匹配两个数据帧中的两列并提供不同的列
我想根据索引列匹配两个不同维度的数据帧df1和df2。然后,根据匹配将两列从df2 shift&shiftdate添加到df1。但我有很多规则需要遵守R 匹配两个数据帧中的两列并提供不同的列,r,R,我想根据索引列匹配两个不同维度的数据帧df1和df2。然后,根据匹配将两列从df2 shift&shiftdate添加到df1。但我有很多规则需要遵守 df1 <- data.frame("Index" = c("Adams10-1", "Adams10-1", "Adams10-2", "Adams10-2", "Ball10-1", "Ball10-2", "Cash10-1", "Cash10-2", "David10-1", "David10-2"),
df1 <- data.frame("Index" = c("Adams10-1", "Adams10-1", "Adams10-2", "Adams10-2", "Ball10-1", "Ball10-2", "Cash10-1", "Cash10-2", "David10-1", "David10-2"),
"CaseDate" = c("2005-10-01", "2005-10-01", "2005-10-02", "2005-10-02", "2005-10-01", "2005-10-02", "2005-10-01", "2005-10-02", "2005-10-01", "2005-10-02"),
"Type" = c("heart", "local", "knee", "nose", "heart", "foot", "shin", "foot", "spine", "delivery"),
"StartTime" = c(1640, 1755, 0112, 0300, 2145, 0233, 2123, 0326, 858, 1024))
df2 <- data.frame("Index" = c("Adams10-1", "Adams10-1", "Ball10-1", "Cash10-1", "David10-1", "David10-1", "David10-3"),
"ShiftDate" = c("2005-10-01", "2005-10-01", "2005-10-01", "2005-10-01", "2005-10-01", "2005-10-01", "2005-10-03"),
"Shift" = c("OB", "CV", "Night", "Super", "OB", "Day", "OB"),
"Multiple" = c("yes", "yes", "no", "no", "yes", "yes", "no"))
规则:
如果df1$Index和df2$Index之间存在匹配,并且:
如果df2$Multiple==否,则将df2$Shift和df2$ShiftDate添加到df1
如果df2$Multiple==yes,则给出NA,除非df1$Type==heart&df2$Shift==CV在本例中,将CV Shift和shiftdate从df2添加到df1
如果df1$Index和df2$Index之间不匹配,请给出NA
除非df1$StartTime>0000和,否则我将根据df1和df2的结构以及问题中提供的目标数据集的内容在这里做出一些假设 索引只是此处人员标识符、姓名和班次或案例日期的组合。所以我们真的很想在人和日期上加入。 df1中的Multiple只标识一个人是否在一天内有多个班次。我假设第一张David10-1记录的“否”是一个打字错误。所以规则1实际上是关于一个人是否在一天内有多个班次。 如果这两个是正确的,我们可以执行以下操作。这段代码在很多地方都是多余的;它可以收紧很多。但它非常明确地显示了规则的逻辑
library(dplyr)
library(lubridate)
# First, let's do make two changes: (1) convert the dates to real dates, and
# (2) replace Index with Name.
df1 = df1 %>%
mutate(CaseDate = ymd(CaseDate),
Name = gsub("[^A-Za-z]", "", Index)) %>%
select(Name, CaseDate, Type, StartTime)
df2 = df2 %>%
mutate(ShiftDate = ymd(ShiftDate),
Name = gsub("[^A-Za-z]", "", Index)) %>%
select(Name, ShiftDate, Shift)
# Start with df1.
df3 = df1 %>%
# Bring in matching records in df2. Filter df2 to records that are either
# (1) the only record for that person, or (2) CV shifts.
left_join(df2 %>%
group_by(Name, ShiftDate) %>%
mutate(num.shifts = n()) %>%
filter(num.shifts == 1 | Shift == "CV"),
by = c("Name", "CaseDate" = "ShiftDate")) %>%
# We want to keep Shift and ShiftDate for records from df2 that are either
# (1) the only record for that person, or (2) CV shifts that join to a
# "heart" type in df1.
mutate(Shift = case_when(num.shifts == 1 ~ Shift,
Type == "heart" & Shift == "CV" ~ Shift,
T ~ NA_character_),
ShiftDate = case_when(num.shifts == 1 ~ CaseDate,
Type == "heart" & Shift == "CV" ~ CaseDate)) %>%
select(Name, CaseDate, Type, StartTime, Shift, ShiftDate) %>%
# Bring in records in df2 that match on person and whose shift date is the
# day before the case date.
left_join(df2 %>%
group_by(Name, ShiftDate) %>%
filter(n() == 1) %>%
mutate(ShiftDateOneDayLater = ShiftDate + 1),
by = c("Name", "CaseDate" = "ShiftDateOneDayLater")) %>%
# Keep Shift and ShiftDate only if StartTime is between 0000 and 0700.
mutate(Shift = case_when(!is.na(Shift.x) ~ Shift.x,
StartTime > 0 & StartTime < 700 ~ Shift.y),
ShiftDate = case_when(!is.na(ShiftDate.x) ~ ShiftDate.x,
StartTime > 0 & StartTime < 700 ~ ShiftDate.y)) %>%
select(Name, CaseDate, Type, StartTime, Shift, ShiftDate) %>%
# Bring in records in df2 that match on person and whose shift date is the
# day after the case date.
left_join(df2 %>%
group_by(Name, ShiftDate) %>%
filter(n() == 1) %>%
mutate(ShiftDateOneDayBefore = ShiftDate - 1),
by = c("Name", "CaseDate" = "ShiftDateOneDayBefore")) %>%
# Keep Shift and ShiftDate only if this is a "delivery" case and an "OB"
# shift.
mutate(Shift = case_when(!is.na(Shift.x) ~ Shift.x,
Type == "delivery" & Shift.y == "OB" ~ Shift.y),
ShiftDate = case_when(!is.na(Shift.x) ~ ShiftDate.x,
Type == "delivery" & Shift.y == "OB" ~ ShiftDate.y)) %>%
select(Name, CaseDate, Type, StartTime, Shift, ShiftDate)
基于df1和df2的结构以及问题中提供的目标数据集的内容,我将在这里做一些假设 索引只是此处人员标识符、姓名和班次或案例日期的组合。所以我们真的很想在人和日期上加入。 df1中的Multiple只标识一个人是否在一天内有多个班次。我假设第一张David10-1记录的“否”是一个打字错误。所以规则1实际上是关于一个人是否在一天内有多个班次。 如果这两个是正确的,我们可以执行以下操作。这段代码在很多地方都是多余的;它可以收紧很多。但它非常明确地显示了规则的逻辑
library(dplyr)
library(lubridate)
# First, let's do make two changes: (1) convert the dates to real dates, and
# (2) replace Index with Name.
df1 = df1 %>%
mutate(CaseDate = ymd(CaseDate),
Name = gsub("[^A-Za-z]", "", Index)) %>%
select(Name, CaseDate, Type, StartTime)
df2 = df2 %>%
mutate(ShiftDate = ymd(ShiftDate),
Name = gsub("[^A-Za-z]", "", Index)) %>%
select(Name, ShiftDate, Shift)
# Start with df1.
df3 = df1 %>%
# Bring in matching records in df2. Filter df2 to records that are either
# (1) the only record for that person, or (2) CV shifts.
left_join(df2 %>%
group_by(Name, ShiftDate) %>%
mutate(num.shifts = n()) %>%
filter(num.shifts == 1 | Shift == "CV"),
by = c("Name", "CaseDate" = "ShiftDate")) %>%
# We want to keep Shift and ShiftDate for records from df2 that are either
# (1) the only record for that person, or (2) CV shifts that join to a
# "heart" type in df1.
mutate(Shift = case_when(num.shifts == 1 ~ Shift,
Type == "heart" & Shift == "CV" ~ Shift,
T ~ NA_character_),
ShiftDate = case_when(num.shifts == 1 ~ CaseDate,
Type == "heart" & Shift == "CV" ~ CaseDate)) %>%
select(Name, CaseDate, Type, StartTime, Shift, ShiftDate) %>%
# Bring in records in df2 that match on person and whose shift date is the
# day before the case date.
left_join(df2 %>%
group_by(Name, ShiftDate) %>%
filter(n() == 1) %>%
mutate(ShiftDateOneDayLater = ShiftDate + 1),
by = c("Name", "CaseDate" = "ShiftDateOneDayLater")) %>%
# Keep Shift and ShiftDate only if StartTime is between 0000 and 0700.
mutate(Shift = case_when(!is.na(Shift.x) ~ Shift.x,
StartTime > 0 & StartTime < 700 ~ Shift.y),
ShiftDate = case_when(!is.na(ShiftDate.x) ~ ShiftDate.x,
StartTime > 0 & StartTime < 700 ~ ShiftDate.y)) %>%
select(Name, CaseDate, Type, StartTime, Shift, ShiftDate) %>%
# Bring in records in df2 that match on person and whose shift date is the
# day after the case date.
left_join(df2 %>%
group_by(Name, ShiftDate) %>%
filter(n() == 1) %>%
mutate(ShiftDateOneDayBefore = ShiftDate - 1),
by = c("Name", "CaseDate" = "ShiftDateOneDayBefore")) %>%
# Keep Shift and ShiftDate only if this is a "delivery" case and an "OB"
# shift.
mutate(Shift = case_when(!is.na(Shift.x) ~ Shift.x,
Type == "delivery" & Shift.y == "OB" ~ Shift.y),
ShiftDate = case_when(!is.na(Shift.x) ~ ShiftDate.x,
Type == "delivery" & Shift.y == "OB" ~ ShiftDate.y)) %>%
select(Name, CaseDate, Type, StartTime, Shift, ShiftDate)
我想你可以很容易地从mergedf1,df2,all=TRUE开始,然后从那里直接进行列计算。如果df1在df2中有两个匹配项,一个有Multiple=yes,另一个有Multiple=no,那么规则1的哪个子部分适用?示例df3表明这是第二部分,df1从df2中匹配的非多个记录中什么也得不到。我认为您可以很容易地从mergedf1开始,df2,all=TRUE,然后从那里直接进行列计算。如果df1在df2中有两个匹配项,其中一个匹配项为multiple=yes,另一个匹配项为multiple=no,规则1的哪个子部分适用?示例df3表明这是第二部分,而df1从df2中匹配的非多个记录中什么也得不到。这很有效,非常感谢!是的,关于假设和我的错误,你是正确的。我在原来的帖子里把它修好了。我确实从您的代码中得到了这个错误:mutate_impl.data中的错误,dots:求值错误:必须是整数类型,而不是字符。我将NA_字符改为NA_整数,它成功了。非常感谢。啊哈,当然-我忘了提到我在测试东西的时候把所有东西都改成了因子而不是字符串。所以这是有道理的。干杯嗨,我在改进了一些匹配条件后遇到了一个问题。你能帮我更正一下这里的代码吗?[这很有效,非常感谢!是的,关于假设和我的错误,你是正确的。我在最初的帖子中修复了它。我确实从你的代码中得到了这个错误:mutate_impl.data中的错误,圆点:求值错误:必须是整型,而不是字符。我将NA_character改为NA_integer_uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu当我测试的时候,我把所有的东西都改成了因子而不是字符串。所以这是有意义的。干杯!嗨,我在改进了一些匹配条件后遇到了一个问题。你能帮我更正一下这里的代码吗[