计算R中三重求和的最快方法

计算R中三重求和的最快方法,r,matrix,R,Matrix,我的目标是计算以下三重求和: $V = \( \frac{1}{n1n2n3} \) \sum_{i=1}^{n1}\sum_{j=1}^{n2}\sum_{k=1}^{n3} I(Y_{1i},Y_{2j},Y_{3k})$ 其中I(Y1,Y2,Y3)定义为: I(Y1,Y2,Y3) = 1 if Y[1] < Y[2] < Y[3] I(Y1,Y2,Y3) = 1/2 if Y[1] = Y[2] < Y[3] I(Y1,Y2,Y3) = 1/6 if Y[1] =

我的目标是计算以下三重求和:

$V = \( \frac{1}{n1n2n3} \) \sum_{i=1}^{n1}\sum_{j=1}^{n2}\sum_{k=1}^{n3} I(Y_{1i},Y_{2j},Y_{3k})$
其中I(Y1,Y2,Y3)定义为:

I(Y1,Y2,Y3) = 1 if Y[1] < Y[2] < Y[3]
 I(Y1,Y2,Y3) = 1/2 if Y[1] = Y[2] < Y[3]
 I(Y1,Y2,Y3) = 1/6 if Y[1] = Y[2] = Y[3]
 I(Y1,Y2,Y3) = 0 Otherwise.

最初在我的计算机上执行
结果花费了399秒%
当(Y1%
总结(平均值=平均值)

这似乎也是代数上可以解的,但我想这是通过模拟来解的

如果我们有三组独立的300个16位数的数字,每个数字都是用rnorm绘制的,那么任何一个数字相互匹配的可能性都是微乎其微的。因此,我们可以忽略第二和第三种情况,这两种情况在建议的
set.seed
中不会发生,一次可能需要数十亿次运行


现在,Y[1]set.seed(123),在27000000个案例中,有22379120个案例(82.9%)出现上升情景。

最初的案例在我的计算机上花费了399秒来执行
结果%
当(Y1%
总结(平均值=平均值)

这似乎也是代数上可以解的,但我想这是通过模拟来解的

如果我们有三组独立的300个16位数的数字,每个数字都是用rnorm绘制的,那么任何一个数字相互匹配的可能性都是微乎其微的。因此,我们可以忽略第二和第三种情况,这两种情况在建议的
set.seed
中不会发生,一次可能需要数十亿次运行


现在,Y[1]set.seed(123),在27000000个案例中,上升情景出现在22379120个案例中(82.9%)。

tl;dr-data.table使用非等联接可以在
tidyr
完成数据生成的相同时间内解决此问题。尽管如此,
tidyr
/
dplyr
解决方案看起来更好

data.table(c0
)[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
  ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
    ][c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                      ifelse(c0 == c1 & c1 < c2, 1/2, 1/6
                                      )))
      ] / (length(c0) * length(c1) * length(c2))
另一种方法是使用非等联接或预过滤数据。我们知道,如果
c0>c1
c1>c2
,求和结果将为0。通过这种方式,我们可以过滤掉我们知道不需要存储到内存中的组合,从而更快地创建组合

虽然这两种方法都比
data.table::CJ()
慢,但它们为三重求和提供了更好的条件

# 'data.table_CJ_filter' = CJ(c0,c1,c2)[c0 <= c1 & c1 <= c2, ]
#'tidyr_cross_filter' =  crossing(c0, c1) %>% filter(c0 <= c1) %>% crossing(c2) %>% filter(c1 <= c2)

#Creating dataset with future calcs in mind
Unit: milliseconds
                 expr    min     lq   mean median      uq     max neval
  data.table_non_equi 358.41 360.35 373.95 374.57  383.62  400.42    10
 data.table_CJ_filter 515.50 517.99 605.06 527.63  661.54  856.43    10
   tidyr_cross_filter 776.91 783.35 980.19 928.25 1178.47 1287.91    10
把它放在一起 开始时提供的最终解决方案不到一秒钟。小于2秒的
dplyr
版本。如果。。。else语句

Unit: milliseconds
      expr     min      lq    mean  median      uq    max neval
    dt_res  589.83  608.26  736.34  642.46  760.18 1091.1    10
 dt_CJ_res  750.07  764.78  905.12  893.73 1040.21 1140.5    10
 dplyr_res 1156.69 1169.84 1363.82 1337.42 1496.60 1709.8    10
数据/代码

# https://stackoverflow.com/questions/56185072/fastest-way-to-compute-this-triple-summation-in-r
library(dplyr)
library(tidyr)
library(data.table)

options(digits = 5)
set.seed(123)

nclasses = 3
N_obs = 300

c0 <- rnorm(N_obs)
c1 <- rnorm(N_obs)
c2 <- rnorm(N_obs)

# Base R Data Generation --------------------------------------------------

mat <- matrix(unlist(t(matrix(expand.grid(c0,c1,c2)))), ncol= nclasses)
df <- expand.grid(c0,c1,c2)

identical(mat, unname(as.matrix(df))) #TRUE - names are different with as.matrix

# tidyr and data.table Data Generation ------------------------------------

tib <- crossing(c0, c1, c2) #faster than complete

tib2 <- crossing(c0, c1)%>% #faster but similar in concept to non-equi
  filter(c0 <= c1)%>%
  crossing(c2)%>%
  filter(c1 <= c2)

dt <-   data.table(c0
                   )[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
                     ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
                       ][c0 <= c1 & c1 <= c2, ]

# Base R summation --------------------------------------------------------

sum(ifelse(df$Var1 < df$Var2 & df$Var2 < df$Var3, 1,
                      ifelse(df$Var1 == df$Var2 & df$Var2 < df$Var3, 1/2,
                             ifelse(df$Var1 == df$Var2 & df$Var2 == df$Var3, 1/6, 0)
                      ))
    ) / (length(c0)*length(c1)*length(c2))


# dplyr summation ---------------------------------------------------------

tib %>%
  mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                         c0 == c1 & c1 < c2  ~ 1/2,
                         c0 == c1 & c1 == c2 ~ 1/6,
                         TRUE               ~ 0)) %>%
  summarize(mean_res = mean(res))

# data.table summation ----------------------------------------------------

#why base doesn't have case_when, who knows
dt[, sum(ifelse(c0 < c1 & c1 < c2, 1,
                ifelse(c0 == c1 & c1 < c2, 1/2,
                       ifelse(c0 == c1 & c1 == c2, 1/6)
                )))
   ] / (length(c0) * length(c1) * length(c2))


CJ(c0,c1,c2)[c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                             ifelse(c0 == c1 & c1 < c2, 1/2, 1/6
                                             )))
             ] / (length(c0) * length(c1) * length(c2))

# Benchmarking ------------------------------------------------------------

library(microbenchmark)

# Data generation
microbenchmark('original' = {
  matrix(unlist(t(matrix(expand.grid(c0,c1,c2)))), ncol= nclasses)
}
, 'as.matrix' = {
  as.matrix(expand.grid(c0,c1,c2)) 
}
, 'expand.grid' = {
  expand.grid(c0,c1,c2) #keep it simpler
}
, 'tidyr_complete' = {
  tibble(c0, c1, c2) %>% complete(c0, c1, c2)
}
, 'tidyr_crossing' = {
  crossing(c0, c1, c2)
}
, 'data.table_CJ' = {
  CJ(c0,c1,c2)
}
, times = 10)

microbenchmark('data.table_non_equi' = {
  data.table(c0
             )[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
               ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
                 ][c0 <= c1 & c1 <= c2, ]
}
, 'data.table_CJ_filter' = {
  CJ(c0,c1,c2)[c0 <= c1 & c1 <= c2, ]
}
, 'tidyr_cross_filter' = {
  crossing(c0,c1)%>%filter(c0 <= c1)%>% crossing(c2)%>% filter(c1 <= c2)
}
, times = 10
)

# Summation Calculation
microbenchmark('base' = {
  sum(ifelse(df$Var1 < df$Var2 & df$Var2 < df$Var3, 1,
             ifelse(df$Var1 == df$Var2 & df$Var2 < df$Var3, 1/2,
                    ifelse(df$Var1 == df$Var2 & df$Var2 == df$Var3, 1/6, 0)
             ))
  ) / (length(c0)*length(c1)*length(c2))
}
, 'dplyr' = {
  tib %>%
    mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                           c0 == c1 & c1 < c2  ~ 1/2,
                           c0 == c1 & c1 == c2 ~ 1/6,
                           TRUE               ~ 0)) %>%
    summarize(mean_res = mean(res))
}
, 'data.table' = {
  dt[, sum(ifelse(c0 < c1 & c1 < c2, 1,
                  ifelse(c0 == c1 & c1 < c2, 1/2, 1/6)
                  ))
     ] / (length(c0) * length(c1) * length(c2))
}
, 'dplyr_pre_filter' = {
  tib2 %>%
    mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                           c0 == c1 & c1 < c2  ~ 1/2,
                           TRUE ~ 1/6)) %>%
    summarize(mean_res = sum(res)) / (length(c0) * length(c1) * length(c2))
}
, times = 10)

# Start to Finish

microbenchmark('dt_res' = {
  data.table(c0
)[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
  ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
    ][c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                      ifelse(c0 == c1 & c1 < c2, 1/2, 1/6)
    ))
    ] / (length(c0) * length(c1) * length(c2))
}
, 'dt_CJ_res' = {
  CJ(c0, c1, c2)[c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                                 ifelse(c0 == c1 & c1 < c2, 1/2, 1/6)
  ))
  ] / (length(c0) * length(c1) * length(c2))
}
, 'dplyr_res' = {
  crossing(c0, c1)%>% #faster but similar in concept to non-equi
    filter(c0 <= c1)%>%
    crossing(c2)%>%
    filter(c1 <= c2)%>%
    mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                           c0 == c1 & c1 < c2  ~ 1/2,
                           TRUE ~ 1/6)) %>%
    summarize(mean_res = sum(res)) / (length(c0) * length(c1) * length(c2))
}
, times = 10
)

#https://stackoverflow.com/questions/56185072/fastest-way-to-compute-this-triple-summation-in-r
图书馆(dplyr)
图书馆(tidyr)
库(数据表)
选项(数字=5)
种子集(123)
nclasses=3
N_obs=300

c0tl;dr-data.table使用非等联接可以在
tidyr
完成数据生成的相同时间内解决此问题。尽管如此,
tidyr
/
dplyr
解决方案看起来更好

data.table(c0
)[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
  ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
    ][c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                      ifelse(c0 == c1 & c1 < c2, 1/2, 1/6
                                      )))
      ] / (length(c0) * length(c1) * length(c2))
另一种方法是使用非等联接或预过滤数据。我们知道,如果
c0>c1
c1>c2
,求和结果将为0。通过这种方式,我们可以过滤掉我们知道不需要存储到内存中的组合,从而更快地创建组合

虽然这两种方法都比
data.table::CJ()
慢,但它们为三重求和提供了更好的条件

# 'data.table_CJ_filter' = CJ(c0,c1,c2)[c0 <= c1 & c1 <= c2, ]
#'tidyr_cross_filter' =  crossing(c0, c1) %>% filter(c0 <= c1) %>% crossing(c2) %>% filter(c1 <= c2)

#Creating dataset with future calcs in mind
Unit: milliseconds
                 expr    min     lq   mean median      uq     max neval
  data.table_non_equi 358.41 360.35 373.95 374.57  383.62  400.42    10
 data.table_CJ_filter 515.50 517.99 605.06 527.63  661.54  856.43    10
   tidyr_cross_filter 776.91 783.35 980.19 928.25 1178.47 1287.91    10
把它放在一起 开始时提供的最终解决方案不到一秒钟。小于2秒的
dplyr
版本。如果。。。else
语句

Unit: milliseconds
      expr     min      lq    mean  median      uq    max neval
    dt_res  589.83  608.26  736.34  642.46  760.18 1091.1    10
 dt_CJ_res  750.07  764.78  905.12  893.73 1040.21 1140.5    10
 dplyr_res 1156.69 1169.84 1363.82 1337.42 1496.60 1709.8    10
数据/代码

# https://stackoverflow.com/questions/56185072/fastest-way-to-compute-this-triple-summation-in-r
library(dplyr)
library(tidyr)
library(data.table)

options(digits = 5)
set.seed(123)

nclasses = 3
N_obs = 300

c0 <- rnorm(N_obs)
c1 <- rnorm(N_obs)
c2 <- rnorm(N_obs)

# Base R Data Generation --------------------------------------------------

mat <- matrix(unlist(t(matrix(expand.grid(c0,c1,c2)))), ncol= nclasses)
df <- expand.grid(c0,c1,c2)

identical(mat, unname(as.matrix(df))) #TRUE - names are different with as.matrix

# tidyr and data.table Data Generation ------------------------------------

tib <- crossing(c0, c1, c2) #faster than complete

tib2 <- crossing(c0, c1)%>% #faster but similar in concept to non-equi
  filter(c0 <= c1)%>%
  crossing(c2)%>%
  filter(c1 <= c2)

dt <-   data.table(c0
                   )[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
                     ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
                       ][c0 <= c1 & c1 <= c2, ]

# Base R summation --------------------------------------------------------

sum(ifelse(df$Var1 < df$Var2 & df$Var2 < df$Var3, 1,
                      ifelse(df$Var1 == df$Var2 & df$Var2 < df$Var3, 1/2,
                             ifelse(df$Var1 == df$Var2 & df$Var2 == df$Var3, 1/6, 0)
                      ))
    ) / (length(c0)*length(c1)*length(c2))


# dplyr summation ---------------------------------------------------------

tib %>%
  mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                         c0 == c1 & c1 < c2  ~ 1/2,
                         c0 == c1 & c1 == c2 ~ 1/6,
                         TRUE               ~ 0)) %>%
  summarize(mean_res = mean(res))

# data.table summation ----------------------------------------------------

#why base doesn't have case_when, who knows
dt[, sum(ifelse(c0 < c1 & c1 < c2, 1,
                ifelse(c0 == c1 & c1 < c2, 1/2,
                       ifelse(c0 == c1 & c1 == c2, 1/6)
                )))
   ] / (length(c0) * length(c1) * length(c2))


CJ(c0,c1,c2)[c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                             ifelse(c0 == c1 & c1 < c2, 1/2, 1/6
                                             )))
             ] / (length(c0) * length(c1) * length(c2))

# Benchmarking ------------------------------------------------------------

library(microbenchmark)

# Data generation
microbenchmark('original' = {
  matrix(unlist(t(matrix(expand.grid(c0,c1,c2)))), ncol= nclasses)
}
, 'as.matrix' = {
  as.matrix(expand.grid(c0,c1,c2)) 
}
, 'expand.grid' = {
  expand.grid(c0,c1,c2) #keep it simpler
}
, 'tidyr_complete' = {
  tibble(c0, c1, c2) %>% complete(c0, c1, c2)
}
, 'tidyr_crossing' = {
  crossing(c0, c1, c2)
}
, 'data.table_CJ' = {
  CJ(c0,c1,c2)
}
, times = 10)

microbenchmark('data.table_non_equi' = {
  data.table(c0
             )[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
               ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
                 ][c0 <= c1 & c1 <= c2, ]
}
, 'data.table_CJ_filter' = {
  CJ(c0,c1,c2)[c0 <= c1 & c1 <= c2, ]
}
, 'tidyr_cross_filter' = {
  crossing(c0,c1)%>%filter(c0 <= c1)%>% crossing(c2)%>% filter(c1 <= c2)
}
, times = 10
)

# Summation Calculation
microbenchmark('base' = {
  sum(ifelse(df$Var1 < df$Var2 & df$Var2 < df$Var3, 1,
             ifelse(df$Var1 == df$Var2 & df$Var2 < df$Var3, 1/2,
                    ifelse(df$Var1 == df$Var2 & df$Var2 == df$Var3, 1/6, 0)
             ))
  ) / (length(c0)*length(c1)*length(c2))
}
, 'dplyr' = {
  tib %>%
    mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                           c0 == c1 & c1 < c2  ~ 1/2,
                           c0 == c1 & c1 == c2 ~ 1/6,
                           TRUE               ~ 0)) %>%
    summarize(mean_res = mean(res))
}
, 'data.table' = {
  dt[, sum(ifelse(c0 < c1 & c1 < c2, 1,
                  ifelse(c0 == c1 & c1 < c2, 1/2, 1/6)
                  ))
     ] / (length(c0) * length(c1) * length(c2))
}
, 'dplyr_pre_filter' = {
  tib2 %>%
    mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                           c0 == c1 & c1 < c2  ~ 1/2,
                           TRUE ~ 1/6)) %>%
    summarize(mean_res = sum(res)) / (length(c0) * length(c1) * length(c2))
}
, times = 10)

# Start to Finish

microbenchmark('dt_res' = {
  data.table(c0
)[data.table(c1), on = .(c0 <= c1), .(c0 = x.c0, c1 = i.c1), allow.cartesian = T
  ][data.table(c2), on = .(c1 <= c2), .(c0 = x.c0, c1 = x.c1, c2 = i.c2), allow.cartesian = T
    ][c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                      ifelse(c0 == c1 & c1 < c2, 1/2, 1/6)
    ))
    ] / (length(c0) * length(c1) * length(c2))
}
, 'dt_CJ_res' = {
  CJ(c0, c1, c2)[c0 <= c1 & c1 <= c2, sum(ifelse(c0 < c1 & c1 < c2, 1,
                                                 ifelse(c0 == c1 & c1 < c2, 1/2, 1/6)
  ))
  ] / (length(c0) * length(c1) * length(c2))
}
, 'dplyr_res' = {
  crossing(c0, c1)%>% #faster but similar in concept to non-equi
    filter(c0 <= c1)%>%
    crossing(c2)%>%
    filter(c1 <= c2)%>%
    mutate(res = case_when(c0  < c1 & c1 < c2  ~ 1,
                           c0 == c1 & c1 < c2  ~ 1/2,
                           TRUE ~ 1/6)) %>%
    summarize(mean_res = sum(res)) / (length(c0) * length(c1) * length(c2))
}
, times = 10
)

#https://stackoverflow.com/questions/56185072/fastest-way-to-compute-this-triple-summation-in-r
图书馆(dplyr)
图书馆(tidyr)
库(数据表)
选项(数字=5)
种子集(123)
nclasses=3
N_obs=300

非常感谢你的回答。它是完美的,正如你提到的,矢量化在这里起到了作用。再次感谢你。这要快得多。请注意,
complete
并不等同于
expand.grid
。对于
rnorm
,差异并不重要,因为匹配的可能性非常小。但是对于整数,会有一个不同的答案。非常感谢你的回答。它是完美的,正如你提到的,矢量化在这里起到了作用。再次感谢你。这要快得多。请注意,
complete
并不等同于
expand.grid
。对于
rnorm
,差异并不重要,因为匹配的可能性非常小。但对于整数,会有不同的答案。