在R中查找匹配项和分配金额的棘手问题_R_Loops_Dplyr_Vectorization

在R中查找匹配项和分配金额的棘手问题

r loops

在R中查找匹配项和分配金额的棘手问题,r,loops,dplyr,vectorization,R,Loops,Dplyr,Vectorization,这是一个需要（希望）在R中解决的棘手的小问题。它涉及到一些规则，以决定如何将未分类的信贷分摊到一类支出中假设我有这个数据集（作为一个简单的例子）：有人能提出一个更简洁的方法吗？任何基于dplyr的解决方案都是受欢迎的（我对这些东西相当熟悉）。执行速度应该不是什么大问题，因为对于我的“真实”数据来说，数量相当有限谢谢。我建议首先使用连接查找匹配项。类似于z=filter（df，type==“z”）%%>%mutate（amount=-amount）；notz=过滤器（df，类型！=“Z”）；

这是一个需要（希望）在R中解决的棘手的小问题。它涉及到一些规则，以决定如何将未分类的信贷分摊到一类支出中

假设我有这个数据集（作为一个简单的例子）：

有人能提出一个更简洁的方法吗？任何基于 dplyr 的解决方案都是受欢迎的（我对这些东西相当熟悉）。执行速度应该不是什么大问题，因为对于我的“真实”数据来说，数据量相当有限。

谢谢。

我建议首先使用连接查找匹配项。类似于

z = filter(df, type == "Z") %>% mutate(amount = -amount); notz = filter(df, type != "Z"); inner_join(z, notz, by = c("id", "amount"))

……不过，如果存在多个可能的匹配项，您还需要处理这种情况。总体而言，只要您的代码能够正常工作，并且处理数据的速度足够快，那就已经很好了。谢谢。是的，我还需要在真实数据上测试这段代码，以确保它确实完成了我需要的工作……我对此抱有希望。有趣的是，“reprex”原则——强迫自己把问题简化到最基本的形式——常常让我在不经意间自己找到解决方案。

# Sample data from the question: one row per (id, counter) record.
# `type` is the expense category; the "Z" rows carry the negative amounts.
# `type_2` starts out as an all-NA character column.
df <- data.frame(
  id      = rep(c(1, 2, 3, 4, 5), times = c(4, 5, 3, 4, 3)),
  counter = as.numeric(sequence(c(4, 5, 3, 4, 3))), # restarts at 1 for each id
  type    = c("A", "B", "C", "Z",
              "B", "A", "C", "Z", "B",
              "A", "B", "Z",
              "A", "Z", "Z", "B",
              "A", "B", "B"),
  amount  = c(100, 200, 300, -100,
              300, 200, 400, -300, 500,
              100, 200, -250,
              200, -200, -50, 100,
              100, 200, 100),
  type_2  = NA_character_,  # recycled to one NA per row
  stringsAsFactors = FALSE  # keep `type` as character on every R version
)

# Worked-example data: one row per (id, counter) record. `type` is the expense
# category and the "Z" rows carry the negative amounts.
df <- data.frame(
  id      = rep(c(1, 2, 3, 4, 5), times = c(4, 5, 3, 4, 3)),
  counter = as.numeric(sequence(c(4, 5, 3, 4, 3))), # restarts at 1 for each id
  type    = c("A", "B", "C", "Z",
              "B", "A", "C", "Z", "B",
              "A", "B", "Z",
              "A", "Z", "Z", "B",
              "A", "B", "B"),
  amount  = c(100, 200, 300, -100,
              300, 200, 400, -300, 300,
              100, 200, -250,
              200, -200, -50, 100,
              100, 200, 100),
  stringsAsFactors = FALSE # keep `type` as character on every R version
)

# Working copies: later steps modify type_2 / amount_2 and leave the original
# type / amount columns untouched.
df[["type_2"]]   <- df[["type"]]
df[["amount_2"]] <- df[["amount"]]

# distinct ids, in ascending order (an NA id, if any, is kept and sorted last)

id_list <- sort(unique(df$id), na.last = TRUE)

# Stage 1 - exact offsets.
#
# For every type "Z" record, look within the same id for the first record (in
# row order) of type A, B or C whose amount is equal and opposite. When one is
# found:
#   * the Z record's type_2 is set to the matched record's type, and
#   * amount_2 is set to zero on both records.
#
# Each A/B/C record can be consumed by at most one Z record. Without that
# guard, two Z credits of the same size would both offset the same expense row
# and the second credit would vanish from the amount_2 total.
#
# Reads:  df$id, df$type, df$amount, id_list
# Writes: df$type_2, df$amount_2
#
# NOTE(review): records are processed in row order, which matches counter order
# as long as (id, counter) pairs are unique and df is sorted - confirm for real
# data. Amounts are compared with exact `==`, as before.

already_offset <- rep(FALSE, nrow(df)) # A/B/C rows used up by an earlier match

for (my_id in id_list) {

  id_rows <- which(df$id == my_id)
  z_rows  <- id_rows[df$type[id_rows] %in% "Z"]

  for (z_row in z_rows) {

    # same id, offsettable type, equal-and-opposite amount, not yet consumed
    is_candidate <- df$type[id_rows] %in% c("A", "B", "C") &
      df$amount[id_rows] == -df$amount[z_row] &
      !already_offset[id_rows]

    # take only the first candidate; indexing an empty vector with [1] gives NA
    match_row <- id_rows[which(is_candidate)][1]

    if (!is.na(match_row)) {

      # relabel the Z record as A/B/C as appropriate
      df$type_2[z_row] <- df$type[match_row]

      # zero both sides (not strictly necessary, since the later grouping by
      # type_2 would cancel them out anyway)
      df$amount_2[z_row]     <- 0
      df$amount_2[match_row] <- 0

      already_offset[match_row] <- TRUE

    }

  }

}

# put the columns in a fixed order

df <- df[, c("id", "counter", "type", "amount", "type_2", "amount_2")]

# print the grand totals of the original and the adjusted amounts side by side

data.frame(amount = sum(df$amount), amount_2 = sum(df$amount_2))

# Collapse to one row per (id, type_2), then number the rows within each id.
# NOTE: this pipeline relies on dplyr being attached; no library(dplyr) call
# is visible in this part of the file.

df_grouped <- df %>%
  group_by(id, type_2) %>%
  summarise(amount = sum(amount_2)) %>% # deliberately sums the adjusted amount_2
  group_by(id) %>%                      # regroup by id only for the counter
  mutate(counter = row_number()) %>%
  ungroup() %>%
  select(id, counter, type = type_2, amount) %>%
  arrange(id, type) %>%
  mutate(
    type_2   = type,  # working copies, so the grouped values stay untouched
    amount_2 = amount
  )

# Stage 2 - apportion the remaining type "Z" amount.
#
# For each id (same list as the first stage), take whatever is left under type
# "Z" and set it against types A, B and C in that order until it runs out.
# Each type absorbs min(its own amount, the outstanding credit).
#
# Reads:  df_grouped$id, df_grouped$type, df_grouped$amount, id_list
# Writes: df_grouped$amount_2 (the `amount` column is left as it was)

for (my_id in id_list) {

  in_id  <- df_grouped$id == my_id
  z_rows <- which(in_id & df_grouped$type == "Z")

  # sum() over no rows is 0, so an id without a Z row is skipped below
  Z_amount <- sum(df_grouped$amount[z_rows])

  # occasional zero recoveries appear and we can leave those alone
  if (Z_amount != 0) {

    for (my_type in c("A", "B", "C")) { # priority order for absorbing the credit

      type_rows   <- which(in_id & df_grouped$type == my_type)
      type_amount <- sum(df_grouped$amount[type_rows]) # 0 if the type is absent

      # Z_amount is negative for a credit, so -Z_amount is what is still left
      type_offset <- min(type_amount, -Z_amount)
      Z_amount    <- Z_amount + type_offset

      # assigning to an empty selection is a no-op when the type is absent
      df_grouped$amount_2[type_rows] <- type_amount - type_offset

    }

    # whatever could not be apportioned stays on the Z row
    df_grouped$amount_2[z_rows] <- Z_amount

  }

}

# restore id / counter order for the final result

df_grouped <- arrange(df_grouped, id, counter)