在R中查找匹配项和分配金额的棘手问题
这是一个需要(希望)在R中解决的棘手的小问题。它涉及到一些规则,以决定如何将未分类的信贷分摊到一类支出中。 假设我有这个数据集(作为一个简单的例子): 有人能提出一个更简洁的方法吗?任何基于在R中查找匹配项和分配金额的棘手问题,r,loops,dplyr,vectorization,R,Loops,Dplyr,Vectorization,这是一个需要(希望)在R中解决的棘手的小问题。它涉及到一些规则,以决定如何将未分类的信贷分摊到一类支出中。 假设我有这个数据集(作为一个简单的例子): 有人能提出一个更简洁的方法吗?任何基于dplyr的解决方案都是受欢迎的(我对这些东西相当熟悉)。执行速度应该不是什么大问题,因为对于我的“真实”数据来说,数量相当有限。 谢谢。我建议首先使用连接查找匹配项。类似于 z = filter(df, type == "Z") %>% mutate(amount = -amount); notz = filter(df, type != "Z");
dplyr
的解决方案都是受欢迎的(我对这些东西相当熟悉)。执行速度应该不是什么大问题,因为对于我的“真实”数据来说,数量相当有限
谢谢。我建议首先使用连接查找匹配项。类似于
z = filter(df, type == "Z") %>% mutate(amount = -amount); notz = filter(df, type != "Z"); inner_join(z, notz, by = c("id", "amount"))
。。。不过,如果有多个可能的匹配项,那么您需要处理该案例。不过,总体而言,如果您的代码能够工作,并且能够足够快地处理您的数据,那么您做得很好。谢谢。是的,我需要在我的真实数据上测试代码,以确保它正在做我需要它做的事情……我很有希望。有趣的是,“reprex”原则——强迫自己将问题简化为最基本的问题——经常帮助我意外地找到解决方案。
# Example ledger from the question: one row per (id, counter) pair.
# `type` holds the category code of each row and `amount` its signed value;
# in this sample every type "Z" row carries a negative amount.
# `type_2` starts out as an all-NA character column.
df <- data.frame(
  id = rep(c(1, 2, 3, 4, 5), times = c(4, 5, 3, 4, 3)),
  # counter restarts at 1 within each id
  counter = as.numeric(sequence(c(4, 5, 3, 4, 3))),
  type = c(
    "A", "B", "C", "Z",
    "B", "A", "C", "Z", "B",
    "A", "B", "Z",
    "A", "Z", "Z", "B",
    "A", "B", "B"
  ),
  amount = c(
    100, 200, 300, -100,
    300, 200, 400, -300, 500,
    100, 200, -250,
    200, -200, -50, 100,
    100, 200, 100
  ),
  type_2 = NA_character_,
  # keep `type` as character on every R version (factor was the pre-4.0 default)
  stringsAsFactors = FALSE
)
# Example ledger: one row per (id, counter) pair. `type` is the category code
# of each row and `amount` its signed value; type "Z" rows are the credits
# that the stages below try to assign to an A/B/C row.
df <- data.frame(
  id = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5),
  counter = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3),
  type = c("A", "B", "C", "Z", "B", "A", "C", "Z", "B", "A", "B", "Z",
           "A", "Z", "Z", "B", "A", "B", "B"),
  amount = c(100, 200, 300, -100, 300, 200, 400, -300, 300, 100, 200, -250,
             200, -200, -50, 100, 100, 200, 100),
  stringsAsFactors = FALSE # keep `type` as character on every R version
)
# Working copies, so the original type / amount columns are never overwritten.
df$type_2 <- df$type
df$amount_2 <- df$amount

# Stage 1: pair each type "Z" row with an A/B/C row of the same id whose
# amount is exactly equal and opposite.
#
# Args:
#   ledger: data frame with columns id, type, amount, type_2 and amount_2.
#
# Returns:
#   `ledger` with, for every matched pair, the Z row's type_2 set to the
#   partner's type and amount_2 set to 0 on both rows. Unmatched rows are
#   returned untouched.
#
# Z rows are handled in row order and take the first eligible partner in row
# order. Each A/B/C row can be used by at most one Z row: without that
# restriction two equal credits would both cancel against the same expense
# and sum(amount_2) would no longer equal sum(amount).
offset_exact_matches <- function(ledger) {
  consumed <- rep(FALSE, nrow(ledger)) # A/B/C rows already paired with a Z row

  for (my_id in sort(unique(ledger$id))) {
    id_rows <- which(ledger$id == my_id)
    credit_rows <- id_rows[which(ledger$type[id_rows] == "Z")]

    for (credit_row in credit_rows) {
      # Exact comparison is deliberate: negation is exact in floating point,
      # so this tests whether the two recorded figures are the same number.
      is_candidate <- ledger$type[id_rows] %in% c("A", "B", "C") &
        !consumed[id_rows] &
        ledger$amount[id_rows] == -ledger$amount[credit_row]
      candidates <- id_rows[which(is_candidate)] # which() drops NA comparisons

      if (length(candidates) > 0) {
        expense_row <- candidates[1] # only ever one partner per credit
        ledger$type_2[credit_row] <- ledger$type[expense_row]
        # Zeroing both sides keeps the total unchanged: the pair summed to 0.
        ledger$amount_2[c(credit_row, expense_row)] <- 0
        consumed[expense_row] <- TRUE
      }
    }
  }

  ledger
}

# list of ids (sorted, unique) - reused by the apportioning stage further down
id_list <- sort(unique(df$id))

df <- offset_exact_matches(df)
# Put the columns in a fixed order: originals first, working copies last.
df <- select(df, id, counter, type, amount, type_2, amount_2)
# Show the grand totals of the original and the working amounts side by side
# (the result is not stored; at top level it is simply printed).
summarise(df, amount = sum(amount), amount_2 = sum(amount_2))
# Collapse the ledger to one row per (id, type_2), then number the rows
# within each id to get a fresh counter.
df_grouped <- df %>%
  group_by(id, type_2) %>%
  # deliberately sums the working column amount_2 into a column named amount
  summarise(amount = sum(amount_2)) %>%
  ungroup() %>%
  # from here on the reassigned type is simply "the type"
  rename(type = type_2) %>%
  group_by(id) %>%
  mutate(counter = row_number()) %>%
  ungroup() %>%
  arrange(id, type) %>%
  select(id, counter, type, amount) %>%
  # working copies, so the grouped type / amount are never overwritten
  mutate(type_2 = type, amount_2 = amount)
# Stage 2: for every id, spread whatever is left under type "Z" across the
# A, B and C totals, in that order, until it is used up.
for (my_id in id_list) {
  id_rows <- which(df_grouped$id == my_id)
  id_types <- df_grouped$type[id_rows]
  id_amounts <- df_grouped$amount[id_rows]

  # sum() over an empty selection is 0, so a missing type needs no special case
  credit <- sum(id_amounts[id_types %in% "Z"])

  # a zero credit leaves the id exactly as it is
  if (credit != 0) {
    remaining <- c(
      A = sum(id_amounts[id_types %in% "A"]),
      B = sum(id_amounts[id_types %in% "B"]),
      C = sum(id_amounts[id_types %in% "C"])
    )

    # Each type gives up the smaller of its own total and the outstanding
    # credit (credit is stored with its sign, hence the negation).
    for (code in names(remaining)) {
      offset <- pmin(remaining[[code]], -credit)
      remaining[[code]] <- remaining[[code]] - offset
      credit <- credit + offset
    }

    # write the adjusted figures back to the working column
    for (code in names(remaining)) {
      df_grouped$amount_2[
        df_grouped$id == my_id & df_grouped$type == code
      ] <- remaining[[code]]
    }
    df_grouped$amount_2[
      df_grouped$id == my_id & df_grouped$type == "Z"
    ] <- credit
  }
}

# restore id / counter order
df_grouped <- arrange(df_grouped, id, counter)