Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/78.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/ios/110.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
优化R中的for循环_R_Performance_For Loop - Fatal编程技术网

优化R中的for循环

优化R中的for循环,r,performance,for-loop,R,Performance,For Loop,虚拟数据集:(与我的数据集的区别在于,在我的情况下 item_code 是字符串) 在循环遍历 500 行的 in_cluster 数据帧时,使用 bind_rows 似乎将运行时间缩短了 36 秒 在这种情况下可以使用 lapply 吗?我尝试了下面的代码,但出现错误: filter_impl(.data, dots) 中出错:$ 运算符对原子向量无效 好的,你的代码中有很多不好的做法: 您正在按行操作 每行创建 2(!)个新数据帧(非常昂贵) 您正在循环中不断增长对象(training_df) 请包含一个可重现的示例,这将使其他人更容易帮助您。@Jaap当然在

虚拟数据集:(与我的数据集的区别在于,在我的情况下 item_code 是字符串)

在循环遍历 500 行的 in_cluster 数据帧时,使用 bind_rows 似乎将运行时间缩短了 36 秒

在这种情况下可以使用 lapply 吗?我尝试了下面的代码,但出现错误:

filter_impl(.data, dots) 中出错:$ 运算符对原子向量无效(原始报错信息:Error in filter_impl(.data, dots) : $ operator is invalid for atomic vectors)


好的,你的代码中有很多不好的做法:

  • 您正在按行操作
  • 每行创建2(!)个新数据帧(非常昂贵)

  • 您正在循环中不断增长对象(training_df)
    评论:请包含一个可重现的示例,这将使其他人更容易帮助您。@Jaap当然在循环中。谢谢你。
    
    # Dummy inputs standing in for the real data (in the real data `item_code`
    # is a string rather than an integer).

    # 500 item codes, each given a random `cluster` value between 1 and 5.
    in_cluster <- data.frame(item_code = seq_len(500))
    in_cluster$cluster <-
      sample(5, size = nrow(in_cluster), replace = TRUE)

    # 100,000 random sales records spread over the 500 item codes.
    # (Built in one step: creating an empty frame first and overwriting it
    # straight away was a dead store.)
    real_sales <- data.frame(
      item_code = sample(500, size = 100000, replace = TRUE),
      sales = sample(500, size = 100000, replace = TRUE)
    )

    # 52 random values; indexed by `week` further down.
    mean_trajectory <-
      data.frame(sales = sample(500, size = 52, replace = TRUE))

    # Training table: three mean-trajectory columns followed by three sales
    # columns. It starts empty and receives one seed row.
    training_df <- data.frame(
      LTF_t_minus_1 = numeric(0),
      LTF_t = numeric(0),
      LTF_t_plus_1 = numeric(0),
      RS_t_minus_1 = numeric(0),
      RS_t = numeric(0),
      STF_t_plus_1 = numeric(0)
    )
    training_df[nrow(training_df) + 1, ] <-
      c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0

    week <- 2
    
    system.time({
      # Collect one single-row data frame per qualifying item and bind them all
      # in one call after the loop; binding inside the loop re-copies the whole
      # of `training_df` on every iteration.
      # `item`, `sale_row` and `new_df` stay top-level variables on purpose:
      # code after the loop may still read the values of the last iteration.
      new_rows <- vector("list", nrow(in_cluster))
      for (r in seq_len(nrow(in_cluster))) {
        # drop = FALSE keeps a data frame even if `in_cluster` has one column.
        item <- in_cluster[r, , drop = FALSE]
        sale_row <-
          dplyr::filter(real_sales, item_code == item$item_code)
        # Rows week - 1, week and week + 1 of `sale_row` are read below; the
        # "> 2" guard covers that when `week` is 2.
        if (nrow(sale_row) > 2) {
          new_df <- data.frame(
            LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
            LTF_t = mean_trajectory$sales[[week]],
            LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
            RS_t_minus_1 = sale_row$sales[[week - 1]],
            RS_t = sale_row$sales[[week]],
            STF_t_plus_1 = sale_row$sales[[week + 1]]
          )
          new_rows[[r]] <- new_df
        }
      }
      # Slots of skipped items are still NULL; drop them before binding.
      new_rows <- Filter(Negate(is.null), new_rows)
      training_df <-
        dplyr::bind_rows(c(list(training_df), new_rows))
    })
    
    # Append one row to `training_df`: the three `mean_trajectory` values at
    # week - 1, week and week + 1, followed by the three `sale_row` sales
    # values at the same positions.
    training_df[nrow(training_df) + 1, ] <- c(
      mean_trajectory[["sales"]][[week - 1]],
      mean_trajectory[["sales"]][[week]],
      mean_trajectory[["sales"]][[week + 1]],
      sale_row[["sales"]][[week - 1]],
      sale_row[["sales"]][[week]],
      sale_row[["sales"]][[week + 1]]
    )
    
    #' Build one training row for a single item.
    #'
    #' @param item Either a bare item code, or a list / one-row data frame with
    #'   an `item_code` element. Accepting a bare code avoids the
    #'   "$ operator is invalid for atomic vectors" error when the function is
    #'   mapped over a vector of codes.
    #' @param sales Data frame with columns `item_code` and `sales`.
    #' @param mean_trajectory Data frame with a `sales` column, indexed by week.
    #' @param week Position to centre on; positions `week - 1`, `week` and
    #'   `week + 1` are read.
    #' @return A one-row data frame with columns `LTF_t_minus_1`, `LTF_t`,
    #'   `LTF_t_plus_1`, `RS_t_minus_1`, `RS_t` and `STF_t_plus_1`, or `NULL`
    #'   when the item has fewer than `week + 1` sales rows.
    myfun <- function(item, sales, mean_trajectory, week) {
      code <- if (is.list(item)) item[["item_code"]] else item
      # Base subsetting avoids relying on dplyr being attached (a bare
      # `filter` would otherwise resolve to stats::filter); which() drops NA
      # comparisons.
      item_sales <- sales$sales[which(sales$item_code == code)]
      if (length(item_sales) < week + 1) {
        return(NULL)
      }
      data.frame(
        LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
        LTF_t = mean_trajectory$sales[[week]],
        LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
        RS_t_minus_1 = item_sales[[week - 1]],
        RS_t = item_sales[[week]],
        STF_t_plus_1 = item_sales[[week + 1]]
      )
    }
    
    system.time({
      # lapply() over a data frame visits its columns, which hands myfun() an
      # atomic vector; iterate over row indices and pass one-row data frames
      # instead. The sales table is `real_sales` (no object named `sales` is
      # defined in the visible code), and myfun() also requires `week`.
      dplyr::bind_rows(
        lapply(seq_len(nrow(in_cluster)), function(i) {
          myfun(
            in_cluster[i, , drop = FALSE],
            sales = real_sales,
            mean_trajectory = mean_trajectory,
            week = week
          )
        })
      )
    })
    
    library(data.table) # v1.10.4

    ## Step 1: join the sales onto the items and, once per item (by = .EACHI),
    ## pick the three sales values around `week`. The `if` keeps only items
    ## with more than two matching sales rows.
    res <- setDT(real_sales)[
      setDT(in_cluster),
      if (.N > 2) list(
        RS_t_minus_1 = sales[week - 1],
        RS_t = sales[week],
        STF_t_plus_1 = sales[week + 1]
      ),
      on = "item_code",
      by = .EACHI
    ]

    ## Step 2: the `mean_trajectory` values do not depend on the item, so they
    ## are looked up once and added as three columns by reference.
    res[, c("LTF_t_minus_1", "LTF_t", "LTF_t_plus_1") := list(
      mean_trajectory$sales[week - 1],
      mean_trajectory$sales[week],
      mean_trajectory$sales[week + 1]
    )]
    
    ### Benchmark inputs: same layout as the dummy data, much larger sizes
    set.seed(123)
    N <- 1e5 # number of item codes
    N2 <- 5e7 # number of sales rows

    in_cluster <- data.frame(item_code = seq_len(N))

    real_sales <- data.frame(
      item_code = sample(N, size = N2, replace = TRUE),
      sales = sample(N, size = N2, replace = TRUE)
    )

    mean_trajectory <-
      data.frame(sales = sample(N, size = 25, replace = TRUE))

    # Empty training table with the six numeric output columns.
    training_df <- data.frame(
      LTF_t_minus_1 = numeric(0),
      LTF_t = numeric(0),
      LTF_t_plus_1 = numeric(0),
      RS_t_minus_1 = numeric(0),
      RS_t = numeric(0),
      STF_t_plus_1 = numeric(0)
    )

    week <- 2
    
    ###############################
    ################# Your solution
    system.time({
      # Baseline being benchmarked: one filter() and one bind_rows() per item.
      # The per-iteration bind is kept on purpose so the timing still measures
      # the row-by-row approach.
      # seq_len() makes the loop a no-op on an empty `in_cluster`, where
      # 1:nrow() would iterate over c(1, 0).
      for (r in seq_len(nrow(in_cluster))) {
        item <- in_cluster[r, , drop = FALSE]
        sale_row <-
          dplyr::filter(real_sales, item_code == item$item_code)
        # Rows week - 1, week and week + 1 of `sale_row` are read below; the
        # "> 2" guard covers that when `week` is 2.
        if (nrow(sale_row) > 2) {
          new_df <- data.frame(
            LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
            LTF_t = mean_trajectory$sales[[week]],
            LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
            RS_t_minus_1 = sale_row$sales[[week - 1]],
            RS_t = sale_row$sales[[week]],
            STF_t_plus_1 = sale_row$sales[[week + 1]]
          )
          # Qualified like dplyr::filter above, so dplyr need not be attached.
          training_df <-
            dplyr::bind_rows(training_df, new_df)
        }
      }
    })
    ### Ran forever- I've killed it after half an hour
    
    
    ######################
    ########## My solution
    library(data.table)
    system.time({
      # Join the sales onto the items and, once per item (by = .EACHI), pick
      # the three sales values around `week`; only items with more than two
      # matching sales rows produce output.
      res <- setDT(real_sales)[
        setDT(in_cluster),
        if (.N > 2) list(
          RS_t_minus_1 = sales[week - 1],
          RS_t = sales[week],
          STF_t_plus_1 = sales[week + 1]
        ),
        on = "item_code",
        by = .EACHI
      ]
      # The `mean_trajectory` values are the same for every item: look them up
      # once and add them as three columns by reference.
      res[, c("LTF_t_minus_1", "LTF_t", "LTF_t_plus_1") := list(
        mean_trajectory$sales[week - 1],
        mean_trajectory$sales[week],
        mean_trajectory$sales[week + 1]
      )]
    })
    
    # user  system elapsed 
    # 2.42    0.05    2.47