Performance 数据帧子集性能_Performance_R_Dataframe_Subset

Performance 数据帧子集性能

performance r dataframe

Performance 数据帧子集性能,performance,r,dataframe,subset,Performance,R,Dataframe,Subset,我有几个大数据帧（100多万行x 6-10列），我需要重复子集。子集部分是我代码中最慢的部分，我想知道是否有办法更快地完成这项工作 load("https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda") start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M") end_in<- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")

我有几个大数据帧（100多万行x 6-10列），我需要重复子集。子集部分是我代码中最慢的部分，我想知道是否有办法更快地完成这项工作

# Load the sample data set. load() accepts only a file path or a connection,
# so a remote .rda file has to be wrapped in url().
load(url("https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda"))
# Inclusive bounds of the time window of interest; strptime() returns POSIXlt.
start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in <- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
# Time the row subset that keeps the observations whose date_stamp lies
# between start_in and end_in (both ends included).
system.time(
  DF_IOSTAT_INT <- DF_IOSTAT_ALL[
    DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,
  ]
)

> system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])
   user  system elapsed 
  16.59    0.00   16.60 

dput(head(DF_IOSTAT_ALL))
structure(list(date_stamp = structure(list(sec = c(14, 24, 34, 
44, 54, 4), min = c(0L, 0L, 0L, 0L, 0L, 1L), hour = c(0L, 0L, 
0L, 0L, 0L, 0L), mday = c(20L, 20L, 20L, 20L, 20L, 20L), mon = c(7L, 
7L, 7L, 7L, 7L, 7L), year = c(112L, 112L, 112L, 112L, 112L, 112L
), wday = c(1L, 1L, 1L, 1L, 1L, 1L), yday = c(232L, 232L, 232L, 
232L, 232L, 232L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec", 
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"
), class = c("POSIXlt", "POSIXt")), cpu = c(0.9, 0.2, 0.2, 0.1, 
0.2, 0.1), rsec_s = c(0, 0, 0, 0, 0, 0), wsec_s = c(0, 3.8, 0, 
0.4, 0.2, 0.2), util_pct = c(0, 0.1, 0, 0, 0, 0), node = c("bda101", 
"bda101", "bda101", "bda101", "bda101", "bda101")), .Names = c("date_stamp", 
"cpu", "rsec_s", "wsec_s", "util_pct", "node"), row.names = c(NA, 
6L), class = "data.frame")

加载（“https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda")
我会使用xts。唯一可能的问题是，xts是一个带有有序索引属性的矩阵，因此不能像在data.frame中那样混合不同类型的列。
如果node列是不变的，则可以将其从xts对象中排除：
library(xts)
# Build an xts series from columns 2 to 5 of the data frame, indexed by the
# timestamp column converted to POSIXct.
x <- xts(
  DF_IOSTAT_ALL[, 2:5],
  order.by = as.POSIXct(DF_IOSTAT_ALL[["date_stamp"]])
)
# A "from/to" string subsets the series to that time range.
x["2012-08-20 00:00:24/2012-08-20 00:00:54"]

库（xts）
以下是我使用data.table所做的实验。有趣的是，仅仅转换为data.table就可以加快查找速度，可能是因为它能更高效地根据逻辑向量进行查找。我比较了四种情况：原始数据帧查找；从POSIXlt转换为POSIXct后的查找（感谢Matthew Dowle）；data.table查找；以及包含复制和转换等设置步骤在内的data.table查找。即使算上额外的设置步骤，data.table查找仍然胜出。如果需要多次查找，节省的时间会更多。
library(data.table)
library(rbenchmark)
# Read the saved data frame (relative path) and keep it under a second name.
load("DF_IOSTAT_ALL.rda")
DF_IOSTAT_ALL.original <- DF_IOSTAT_ALL

# Inclusive bounds of the time slice to extract; strptime() returns POSIXlt.
start_in <- strptime("2012-08-20 13:00", format = "%Y-%m-%d %H:%M")
end_in <- strptime("2012-08-20 17:00", format = "%Y-%m-%d %H:%M")
# Candidate 1: subset the original data frame (date_stamp as loaded).
# Stores the rows with start_in <= date_stamp <= end_in in the enclosing
# scope as DF_IOSTAT_INT.
fun <- function() {
  stamps <- DF_IOSTAT_ALL.original$date_stamp
  in_window <- stamps >= start_in & stamps <= end_in
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.original[in_window, ]
}
# Candidate 2: same data frame, with date_stamp converted to POSIXct once,
# outside the timed function.
DF_IOSTAT_ALL.ct <- within(
  DF_IOSTAT_ALL.original,
  date_stamp <- as.POSIXct(date_stamp)
)
# Stores the rows with start_in <= date_stamp <= end_in in the enclosing
# scope as DF_IOSTAT_INT.
fun.ct <- function() {
  stamps <- DF_IOSTAT_ALL.ct$date_stamp
  in_window <- stamps >= start_in & stamps <= end_in
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.ct[in_window, ]
}
# Candidate 3: a data.table built once from the POSIXct data frame, outside
# the timed function.
DF_IOSTAT_ALL.dt <- as.data.table(DF_IOSTAT_ALL.ct)
# Stores the rows with start_in <= date_stamp <= end_in in the enclosing
# scope as DF_IOSTAT_INT.
fun.dt <- function() {
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.dt[
    date_stamp >= start_in & date_stamp <= end_in,
  ]
}
# Candidate 4: data.table lookup with the setup steps (copy, POSIXct
# conversion, data.table construction) included inside the timed function.
# Stores the matching rows in the enclosing scope as DF_IOSTAT_INT.
newfun <- function() {
  work <- DF_IOSTAT_ALL.original
  # data.table doesn't play well with POSIXlt, so convert to POSIXct first
  work$date_stamp <- as.POSIXct(work$date_stamp)
  work <- data.table(work)
  DF_IOSTAT_INT <<- work[date_stamp >= start_in & date_stamp <= end_in, ]
}
# Run each candidate three times; rows are ordered by the "relative" column.
benchmark(
  fun(),
  fun.ct(),
  fun.dt(),
  newfun(),
  replications = 3,
  order = "relative"
)

#      test replications elapsed   relative user.self sys.self user.child sys.child
#3 fun.dt()            3    0.18   1.000000      0.11     0.08         NA        NA
#2 fun.ct()            3    0.52   2.888889      0.44     0.08         NA        NA
#4 newfun()            3   35.49 197.166667     34.88     0.58         NA        NA
#1    fun()            3   66.68 370.444444     66.42     0.15         NA        NA

我相信你可以做得更快，但最好的方法取决于DF_IOSTAT_ALL的结构。你能提供该对象的一个小样本吗？例如dput(head(DF_IOSTAT_ALL))的输出。
@JoshuaUlrich 已添加所请求的输出。抱歉第一次没有包含它。
你在做什么样的子集？出于兴趣，这有多慢？
@BlueMagister 我正在将其按时间片取子集。它是来自iostat的机器集群上的性能数据。我有一些性能测试的开始和结束时间。所以，我想将其子集到测试的时间范围内，然后绘图。希望这就是你要问的……
有趣。不确定，但fun在DF_IOSTAT_ALL.original上慢，难道不是因为date_stamp是POSIXlt类型吗？也就是说，data.frame在POSIXct上是否也会更快？POSIXlt对性能来说真的很糟糕（如果我没记错的话，每个日期大约占40字节！）。另外，newfun中使用as.data.table会比data.table快得多，之后再执行DF_IOSTAT_ALL[, date_stamp := as.POSIXct(date_stamp)]。
哦，这个我之前不知道。我会看看把原始数据帧改为POSIXct之后查找速度如何变化。我尝试了DF_IOSTAT_ALL[, date_stamp := as.POSIXct(date_stamp)]，但遇到了一个错误（可能是因为POSIXlt类作为data.table的列而导致的错误）？
关于那个错误，哦，是的，我现在明白了。在这种情况下，只需在数据帧中一开始就使用POSIXct，并尽早忘掉POSIXlt。
library(data.table)
library(rbenchmark)
# Read the saved data frame (relative path) and keep it under a second name.
load("DF_IOSTAT_ALL.rda")
DF_IOSTAT_ALL.original <- DF_IOSTAT_ALL

# Inclusive bounds of the time slice to extract; strptime() returns POSIXlt.
start_in <- strptime("2012-08-20 13:00", format = "%Y-%m-%d %H:%M")
end_in <- strptime("2012-08-20 17:00", format = "%Y-%m-%d %H:%M")
# Candidate 1: subset the original data frame (date_stamp as loaded).
# Stores the rows with start_in <= date_stamp <= end_in in the enclosing
# scope as DF_IOSTAT_INT.
fun <- function() {
  stamps <- DF_IOSTAT_ALL.original$date_stamp
  in_window <- stamps >= start_in & stamps <= end_in
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.original[in_window, ]
}
# Candidate 2: same data frame, with date_stamp converted to POSIXct once,
# outside the timed function.
DF_IOSTAT_ALL.ct <- within(
  DF_IOSTAT_ALL.original,
  date_stamp <- as.POSIXct(date_stamp)
)
# Stores the rows with start_in <= date_stamp <= end_in in the enclosing
# scope as DF_IOSTAT_INT.
fun.ct <- function() {
  stamps <- DF_IOSTAT_ALL.ct$date_stamp
  in_window <- stamps >= start_in & stamps <= end_in
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.ct[in_window, ]
}
# Candidate 3: a data.table built once from the POSIXct data frame, outside
# the timed function.
DF_IOSTAT_ALL.dt <- as.data.table(DF_IOSTAT_ALL.ct)
# Stores the rows with start_in <= date_stamp <= end_in in the enclosing
# scope as DF_IOSTAT_INT.
fun.dt <- function() {
  DF_IOSTAT_INT <<- DF_IOSTAT_ALL.dt[
    date_stamp >= start_in & date_stamp <= end_in,
  ]
}
# Candidate 4: data.table lookup with the setup steps (copy, POSIXct
# conversion, data.table construction) included inside the timed function.
# Stores the matching rows in the enclosing scope as DF_IOSTAT_INT.
newfun <- function() {
  work <- DF_IOSTAT_ALL.original
  # data.table doesn't play well with POSIXlt, so convert to POSIXct first
  work$date_stamp <- as.POSIXct(work$date_stamp)
  work <- data.table(work)
  DF_IOSTAT_INT <<- work[date_stamp >= start_in & date_stamp <= end_in, ]
}
# Run each candidate three times; rows are ordered by the "relative" column.
benchmark(
  fun(),
  fun.ct(),
  fun.dt(),
  newfun(),
  replications = 3,
  order = "relative"
)

#      test replications elapsed   relative user.self sys.self user.child sys.child
#3 fun.dt()            3    0.18   1.000000      0.11     0.08         NA        NA
#2 fun.ct()            3    0.52   2.888889      0.44     0.08         NA        NA
#4 newfun()            3   35.49 197.166667     34.88     0.58         NA        NA
#1    fun()            3   66.68 370.444444     66.42     0.15         NA        NA

# Start from a copy of DF_IOSTAT_ALL.new; the copy gets a new column and a key
# below.
DF_IOSTAT_ALL <- copy(DF_IOSTAT_ALL.new)
# 179 break points spaced one hour (3600 seconds) apart.
# NOTE(review): strptime.d is not defined in the visible code; presumably a
# date-parsing helper defined elsewhere. Confirm before running.
time.breaks <- strptime.d("2012-08-19 19:00:00") + 0:178 * 3600
# Tag every row with the index of the hourly interval containing its
# timestamp, then key the table on that index.
DF_IOSTAT_ALL[, interval := findInterval(date_stamp, time.breaks)]
setkey(DF_IOSTAT_ALL, interval)

start_in <- time.breaks[60]
end_in <- time.breaks[61]
# Compare a keyed lookup of interval 60 against fun2().
# NOTE(review): fun2 is not defined in the visible code; verify its definition.
benchmark(
  a <- DF_IOSTAT_ALL[J(60)],
  b <- fun2(DF_IOSTAT_ALL)
)
#                  test replications elapsed relative user.self sys.self user.child sys.child
#1 DF_IOSTAT_ALL[J(60)]          100    0.78 1.000000      0.64     0.14         NA        NA
#2  fun2(DF_IOSTAT_ALL)          100    6.69 8.576923      5.76     0.91         NA        NA
# Test for equality, rearranging the columns of b to match a.
all.equal(a, b[, .SD, .SDcols = c(12, 1:11, 13)])
#TRUE