Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/76.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 更快地聚合时间序列数据_R_Data.table - Fatal编程技术网

R 更快地聚合时间序列数据

R 更快地聚合时间序列数据,r,data.table,R,Data.table,我需要计算一些特性,将它们随时间分布,然后按如下所示进行聚合。这段代码产生了正确的结果,但我的实际数据集中大约有100万行数据,使用类似于下面的代码运行机器需要几天的时间。我正在寻找一种更有效的代码。我不确定使用xts或tidyverse软件包是否有助于加速。我使用的是data.table,我认为这有助于提高速度——也许这是一个错误的选择。有什么想法吗 library(data.table) library(lubridate) #toy example rows=1000 set.seed(

我需要计算一些特征,将它们按时间分摊,然后按如下所示进行聚合。这段代码能产生正确的结果,但我的实际数据集中大约有100万行数据,使用类似下面的代码在我的机器上运行需要几天的时间。我正在寻找一种更高效的写法。我不确定使用 `xts` 或 `tidyverse` 软件包是否有助于加速。我使用的是 `data.table`,我原以为这有助于提高速度——也许这是一个错误的选择。有什么想法吗?

library(data.table)
library(lubridate)

# Toy example: `rows` simulated customer visits. Arrivals are drawn at
# one-minute resolution from the week starting 2020-01-01 00:00.
rows <- 1000
set.seed(1)
data <- data.table(
  customer.arv = as.POSIXct("2020-01-01 00:00") +
    dminutes(sample(1:(60 * 24 * 7), rows, replace = TRUE)),
  location = sample(1:4, rows, replace = TRUE),
  customer.type = sample(LETTERS[1:5], rows, replace = TRUE),
  charge = sample(seq(50, 200, 10), rows, replace = TRUE)
)

# Derived columns, added by reference:
#   customer.dep   - departure, 1 to 500 minutes after arrival
#   arv.time.floor - arrival truncated to the start of its hour
#   arv.hour       - hour-of-day component of the arrival
data[, `:=`(
  customer.dep = customer.arv + dminutes(sample(1:500, rows, replace = TRUE)),
  arv.time.floor = floor_date(customer.arv, "hours"),
  arv.hour = hour(customer.arv)
)]

# Charge attributed to the hour starting at `pass.arv.time.floor` for one
# location / customer-type pair, read from the global `data` table.
#   - Customers who arrived at or before the start of the hour and depart at or
#     after its end contribute their whole charge.
#   - Customers who arrived at or before the start of the hour and depart
#     strictly inside it contribute charge * minute(departure) / 60.
# Returns the sum of both contributions.
tot.hourly.charge <- function(pass.location, pass.arv.time.floor, pass.customer.type) {
  whole.hour.total <- data[
    location == pass.location &
      customer.type == pass.customer.type &
      customer.arv <= pass.arv.time.floor &
      customer.dep >= pass.arv.time.floor + dhours(1),
    sum(charge)
  ]

  part.hour.total <- data[
    location == pass.location &
      customer.type == pass.customer.type &
      customer.arv <= pass.arv.time.floor &
      customer.dep > pass.arv.time.floor &
      customer.dep < pass.arv.time.floor + dhours(1),
    sum(charge * minute(customer.dep) / 60)
  ]

  whole.hour.total + part.hour.total
}

# Aggregate: evaluate tot.hourly.charge once for every
# (location, arrival hour, customer type) group present in `data`.
res <- data[,
  .(hourly.charge = tot.hourly.charge(location, arv.time.floor, customer.type)),
  by = .(location, arv.time.floor, customer.type)
]

# Sample output: the first ten groups, ordered by location, type and hour.
res[order(location, customer.type, arv.time.floor)][1:10, ]
    location      arv.time.floor customer.type hourly.charge
 1:        1 2020-01-01 00:00:00             A       0.00000
 2:        1 2020-01-01 03:00:00             A     190.00000
 3:        1 2020-01-01 06:00:00             A     216.66667
 4:        1 2020-01-01 09:00:00             A     100.00000
 5:        1 2020-01-01 12:00:00             A     100.00000
 6:        1 2020-01-01 14:00:00             A      16.66667
 7:        1 2020-01-01 15:00:00             A      50.00000
 8:        1 2020-01-01 18:00:00             A      62.50000
 9:        1 2020-01-01 20:00:00             A       0.00000
10:        1 2020-01-01 22:00:00             A     190.00000
库(data.table)
图书馆(lubridate)
#玩具示例
行数=1000
种子(1)
data=data.table(
customer.arv=as.POSIXct(“2020-01-01 00:00”)+dminutes(样本(1:(60*24*7),行,替换=T)),
位置=样品(1:4,行,替换=T),
customer.type=样本(字母[1:5],行,替换=T),
费用=样品(序号(50200,10),行,替换=T)
)
数据[,':='(customer.dep=customer.arv+dminutes(示例(1:500,行,替换=T)),
arv.time.floor=楼层日期(customer.arv,“小时”),
arv.hour=小时(customer.arv))]
#按停留时间(出发-到达)分配费用,并计算每小时费用
tot.hourly.charge=功能(pass.location,pass.arv.time.floor,pass.customer.type){
full.hr.cust=data[customer.arv=pass.arv.time.floor+dhours(1)和location==pass.location和customer.type==pass.customer.type,sum(费用)]

partial.hr.cust=data[customer.arv

以下是您可以首先尝试的内容:

# End of each arrival hour: arv.time.floor plus 3600 seconds (added by reference).
data[, arv.time.floor.1h := arv.time.floor + 60*60]

# One row per distinct (location, customer.type, hour) group. The joins below
# look up matching customers for each row of this table; using `data` itself
# here would repeat the lookup for every arrival in a group and add each
# matching charge once per arrival.
hour.groups <- unique(data[, .(location, customer.type, arv.time.floor, arv.time.floor.1h)])

# Full-hour customers: same location and type, arrived at or before the start
# of the hour and departed at or after its end. Charges are summed per group;
# groups with no matching customer come back as NA and are set to 0.
full <- data[hour.groups, on=.(location=location, customer.type=customer.type,
    customer.arv<=arv.time.floor, customer.dep>=arv.time.floor.1h),
    .(charge=x.charge, location, arv.time.floor=i.arv.time.floor, customer.type=i.customer.type)][,
        .(full.hr.cust=sum(charge)), keyby=.(location, customer.type, arv.time.floor)][
            is.na(full.hr.cust), full.hr.cust := 0]

# Partial-hour customers: arrived at or before the start of the hour and
# departed strictly inside it. Each contributes charge * minute(departure) / 60;
# NA sums (no match) are again set to 0.
partial <- data[hour.groups, on=.(location=location, customer.type=customer.type,
    customer.arv<=arv.time.floor, customer.dep>arv.time.floor, customer.dep<arv.time.floor.1h),
    .(charge=x.charge, m=minute(x.customer.dep), location, arv.time.floor=i.arv.time.floor, customer.type=i.customer.type)][,
        .(partial.hr.cust=sum(charge * m / 60)), keyby=.(location, customer.type, arv.time.floor)][
            is.na(partial.hr.cust), partial.hr.cust := 0]

# Both tables are keyed by (location, customer.type, arv.time.floor), so the
# keyed join lines the two sums up per group; `charge` is the hourly total.
ans <- full[partial][, charge := full.hr.cust + partial.hr.cust]

您能否用一个简单的例子更详细地说明聚合应该做什么?

例如,对于 arv.time.floor 为 "2020-01-01 03:00"、location == 1、type == A 的分组:将所有在 2020-01-01 03:00 之前到达、并在 2020-01-01 04:00 之后离开的客户的费用相加。对于在 2020-01-01 03:00 之前到达、并在 03:00 和 04:00 之间离开的客户,将其费用乘以他们在 03:00 和 04:00 之间停留的时间(以小时计)。因此,如果某个客户在 03:20 离开(20 分钟),花费了 300 美元,那么他们该小时的费用是 $300*20/60=$100。

你有没有想过 `data[, .(hourly.charge …`?我试过 `replicate(n=100, data[, .(hourly.charge, …)`,它应该能指出问题所在:`[` 和 `tot.hourly.charge`,但是你更了解你的问题。你能为你的实际数据集分享 `data[, .N, .(location, arv.time.floor, customer.type)][, unique(N)]` 以及 `data[, range(arv.time.floor)]` 的结果吗?

第一个:`[1] 1 5 4 2 3 8 7 10 9 12 13 11 15 14 16 18 17`。日期范围:`[1] "2016-01-22 22:00:00 CST" "2020-06-30 23:00 CDT"`
library(data.table)

# Toy example using base R date arithmetic: `rows` simulated customer visits,
# with arrivals at one-minute resolution in the week starting 2020-01-01 00:00
# (POSIXct arithmetic is in seconds, hence the factor 60).
rows <- 1000
set.seed(1)
data <- data.table(
    customer.arv = as.POSIXct("2020-01-01 00:00") + 60 * (sample(1:(60*24*7), rows, replace = TRUE)),
    location = sample(1:4, rows, replace = TRUE),
    customer.type = sample(LETTERS[1:5], rows, replace = TRUE),
    charge = sample(seq(50, 200, 10), rows, replace = TRUE)
)
# customer.dep is 1 to 500 minutes after arrival. arv.time.floor must be the
# start of the arrival hour, so the arrival is truncated; rounding would push
# arrivals after the half hour into the following hour.
data[, `:=`(customer.dep = customer.arv + 60 * sample(1:500, rows, replace = TRUE),
    arv.time.floor = as.POSIXct(trunc(customer.arv, units = "hours")))]
setorder(data, location, customer.type, arv.time.floor)