R 选择由其他data.table中的筛选器给定的data.table中的行
我想选择data.table(R 选择由其他data.table中的筛选器给定的data.table中的行,r,data.table,R,Data.table,我想选择data.table(DT1)的特定行值(此处为TARGET),其中筛选条件位于其他data.table(DT2)中。 它不是一个精确的过滤器,因为如果我在DT2中有值3,我在DT1中有该值的最小值和最大值变量。我还有一个字符串,它将包含一个特定的模式。 例如:DT2中的A=3和DT1中的相应行包含minA=3,maxA=6和C=“Mon”中的C=“Mon,Tue” 我正在寻找该值所在范围内的线,以及最大目标值 我有以下简化的例子: # version 1.9.6 library(dat
DT1
)的特定行值(此处为TARGET
),其中筛选条件位于其他data.table(DT2
)中。
它不是一个精确的过滤器,因为如果我在DT2
中有值3,我在DT1
中有该值的最小值和最大值变量。我还有一个字符串,它将包含一个特定的模式。
例如:DT2
中的A=3
和DT1
中的相应行包含minA=3
,maxA=6
和C=“Mon”
中的C=“Mon,Tue”
我要找的是该值落在[minA, maxA]范围内的那些行,并从中取TARGET最大的一行
我有以下简化的例子:
# version 1.9.6
library(data.table)

# DT1: 12 rule rows. minA/maxA bound a range for A, C lists comma-separated
# day codes and TARGET is the value to be looked up.
# The short vectors are repeated explicitly instead of relying on implicit
# recycling inside data.table().
DT1 <- data.table(
  INDEX1 = 1:12,
  minA = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  maxA = rep(c(4, 5, 6), times = 4),
  C = rep(c("Mon,Tue", "Mon,Wed", "Tue,Thu", "Wed,Thu"), times = 3),
  TARGET = 101:112
)

# size scales DT2: the two query rows (A = 3 / "Mon" and A = 4 / "Thu") are
# repeated size times.
size <- 2
DT2 <- data.table(
  A = rep(c(3, 4), size),
  C = rep(c("Mon", "Thu"), size),
  INDEX2 = seq_len(2 * size) # stays empty for size = 0, unlike 1:(2 * size)
)
我加入了变量size
只是为了调整数据规模以便测试
到目前为止,我的解决方案如下:
# Example data. Run this in a fresh R session instead of wiping the workspace
# with rm(list = ls()), which would also destroy unrelated objects.
library(data.table)

# DT1: 12 rule rows. minA/maxA bound a range for A, C lists comma-separated
# day codes and TARGET is the value to be looked up.
# The short vectors are repeated explicitly instead of relying on implicit
# recycling inside data.table().
DT1 <- data.table(
  INDEX1 = 1:12,
  minA = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  maxA = rep(c(4, 5, 6), times = 4),
  C = rep(c("Mon,Tue", "Mon,Wed", "Tue,Thu", "Wed,Thu"), times = 3),
  TARGET = 101:112
)

# size scales DT2: the two query rows (A = 3 / "Mon" and A = 4 / "Thu") are
# repeated size times.
size <- 20000
DT2 <- data.table(
  A = rep(c(3, 4), size),
  C = rep(c("Mon", "Thu"), size),
  INDEX2 = seq_len(2 * size) # stays empty for size = 0, unlike 1:(2 * size)
)
#' Find the DT1 row(s) with the highest TARGET that match one query.
#'
#' Reads the global table `DT1`. A row matches when its `C` string contains
#' the pattern `i.C` and `i.A` satisfies `minA <= i.A` and `maxA >= i.A`.
#'
#' @param i.A Numeric value compared against the `minA`/`maxA` columns.
#' @param i.C Pattern (regular expression) searched for in `DT1$C`.
#' @return The matching rows of `DT1` whose `TARGET` is maximal; a zero-row
#'   table when nothing matches.
foo <- function(i.A, i.C) {
  # grepl() yields a per-row logical mask. The former
  # `INDEX1 %in% grep(i.C, C)` compared row positions with INDEX1 values,
  # which is only correct while INDEX1 happens to be 1..nrow(DT1).
  hits <- DT1[grepl(i.C, C) & minA <= i.A & maxA >= i.A]
  if (nrow(hits) == 0L) {
    # Avoid max() on an empty vector (it warns and returns -Inf).
    return(hits)
  }
  hits[TARGET == max(TARGET)]
}
#' Return the INDEX2 values of the rows in `data` that satisfy one rule.
#'
#' @param data A data.table with columns `A`, `C` and `INDEX2`.
#' @param i.A Length-two vector holding the lower and upper bound for `A`.
#' @param i.C Character vector of accepted values for `C`.
#' @return The `INDEX2` entries of all rows whose `C` is in `i.C` and whose
#'   `A` lies between the two bounds.
foo_new <- function(data, i.A, i.C) {
  is.match <- data[["C"]] %in% i.C & data[["A"]] %between% i.A
  # which() drops NA entries, mirroring how data.table treats NA in `i`.
  data[["INDEX2"]][which(is.match)]
}
# with foo: look up the best DT1 row for every DT2 row (one call per INDEX2).
# The original line ended in a stray ")" and did not parse.
DT2[, foo(i.A = A, i.C = C), by = INDEX2]

# with foo_new: iterate over the rows of DT1 instead of the rows of DT2.
# DT1 is visited from highest to lowest TARGET, so the first rule that matches
# a DT2 row is also the one with the largest TARGET; rows that already have a
# TARGET are excluded from later passes.
DT1.ordered <- DT1[order(TARGET, decreasing = TRUE)]
DT2[, TARGET := NA_real_]
for (i in seq_len(nrow(DT1.ordered))) {
  # Only rows still lacking a TARGET are candidates. Row subsetting already
  # returns a new table, so no extra copy() is needed.
  restdata <- DT2[is.na(TARGET)]
  if (nrow(restdata) == 0L) {
    break # every DT2 row has been assigned
  }
  tmp.index <- foo_new(
    data = restdata,
    i.A = c(DT1.ordered$minA[i], DT1.ordered$maxA[i]),
    i.C = strsplit(DT1.ordered$C[i], ",", fixed = TRUE)[[1L]]
  )
  target.i <- as.numeric(DT1.ordered$TARGET[i])
  DT2[INDEX2 %in% tmp.index, TARGET := target.i]
}
我编写了一个函数foo()
其输出(output)如下:
INDEX2 INDEX1 minA maxA C TARGET
1: 1 9 3 6 Mon,Tue 109
2: 2 12 4 6 Wed,Thu 112
这是我的问题:
# Example data. Run this in a fresh R session instead of wiping the workspace
# with rm(list = ls()), which would also destroy unrelated objects.
library(data.table)

# DT1: 12 rule rows. minA/maxA bound a range for A, C lists comma-separated
# day codes and TARGET is the value to be looked up.
# The short vectors are repeated explicitly instead of relying on implicit
# recycling inside data.table().
DT1 <- data.table(
  INDEX1 = 1:12,
  minA = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  maxA = rep(c(4, 5, 6), times = 4),
  C = rep(c("Mon,Tue", "Mon,Wed", "Tue,Thu", "Wed,Thu"), times = 3),
  TARGET = 101:112
)

# size scales DT2: the two query rows (A = 3 / "Mon" and A = 4 / "Thu") are
# repeated size times.
size <- 20000
DT2 <- data.table(
  A = rep(c(3, 4), size),
  C = rep(c("Mon", "Thu"), size),
  INDEX2 = seq_len(2 * size) # stays empty for size = 0, unlike 1:(2 * size)
)
#' Find the DT1 row(s) with the highest TARGET that match one query.
#'
#' Reads the global table `DT1`. A row matches when its `C` string contains
#' the pattern `i.C` and `i.A` satisfies `minA <= i.A` and `maxA >= i.A`.
#'
#' @param i.A Numeric value compared against the `minA`/`maxA` columns.
#' @param i.C Pattern (regular expression) searched for in `DT1$C`.
#' @return The matching rows of `DT1` whose `TARGET` is maximal; a zero-row
#'   table when nothing matches.
foo <- function(i.A, i.C) {
  # grepl() yields a per-row logical mask. The former
  # `INDEX1 %in% grep(i.C, C)` compared row positions with INDEX1 values,
  # which is only correct while INDEX1 happens to be 1..nrow(DT1).
  hits <- DT1[grepl(i.C, C) & minA <= i.A & maxA >= i.A]
  if (nrow(hits) == 0L) {
    # Avoid max() on an empty vector (it warns and returns -Inf).
    return(hits)
  }
  hits[TARGET == max(TARGET)]
}
#' Return the INDEX2 values of the rows in `data` that satisfy one rule.
#'
#' @param data A data.table with columns `A`, `C` and `INDEX2`.
#' @param i.A Length-two vector holding the lower and upper bound for `A`.
#' @param i.C Character vector of accepted values for `C`.
#' @return The `INDEX2` entries of all rows whose `C` is in `i.C` and whose
#'   `A` lies between the two bounds.
foo_new <- function(data, i.A, i.C) {
  is.match <- data[["C"]] %in% i.C & data[["A"]] %between% i.A
  # which() drops NA entries, mirroring how data.table treats NA in `i`.
  data[["INDEX2"]][which(is.match)]
}
# with foo: look up the best DT1 row for every DT2 row (one call per INDEX2).
# The original line ended in a stray ")" and did not parse.
DT2[, foo(i.A = A, i.C = C), by = INDEX2]

# with foo_new: iterate over the rows of DT1 instead of the rows of DT2.
# DT1 is visited from highest to lowest TARGET, so the first rule that matches
# a DT2 row is also the one with the largest TARGET; rows that already have a
# TARGET are excluded from later passes.
DT1.ordered <- DT1[order(TARGET, decreasing = TRUE)]
DT2[, TARGET := NA_real_]
for (i in seq_len(nrow(DT1.ordered))) {
  # Only rows still lacking a TARGET are candidates. Row subsetting already
  # returns a new table, so no extra copy() is needed.
  restdata <- DT2[is.na(TARGET)]
  if (nrow(restdata) == 0L) {
    break # every DT2 row has been assigned
  }
  tmp.index <- foo_new(
    data = restdata,
    i.A = c(DT1.ordered$minA[i], DT1.ordered$maxA[i]),
    i.C = strsplit(DT1.ordered$C[i], ",", fixed = TRUE)[[1L]]
  )
  target.i <- as.numeric(DT1.ordered$TARGET[i])
  DT2[INDEX2 %in% tmp.index, TARGET := target.i]
}
这对于小型data.tables很好,但我在DT2中有更多的行。这些函数需要更长的时间,我想知道是否有更好/更快的方法用于这种过滤?
也许可以“升级”foo()
,以便它可以处理整列而不是单行
如果可能,我希望避免像下面这样扩展DT1:
我想,我有一个比这些问题更复杂的过滤器:
提前感谢您的帮助。新解决方案
我意识到遍历更大的data.table的每一行都需要花费很多时间,因此我构建了一个新函数foo_new
,它的工作方式正好相反:
#' Return the INDEX2 values of the rows in `data` that satisfy one rule.
#'
#' @param data A data.table with columns `A`, `C` and `INDEX2`.
#' @param i.A Length-two vector holding the lower and upper bound for `A`.
#' @param i.C Character vector of accepted values for `C`.
#' @return The `INDEX2` entries of all rows whose `C` is in `i.C` and whose
#'   `A` lies between the two bounds.
foo_new <- function(data, i.A, i.C) {
  is.match <- data[["C"]] %in% i.C & data[["A"]] %between% i.A
  # which() drops NA entries, mirroring how data.table treats NA in `i`.
  data[["INDEX2"]][which(is.match)]
}
这种做法可能只在DT1比DT2小的时候才有用——而这正是我的情况
这里是我的整个模拟代码:
# Example data. Run this in a fresh R session instead of wiping the workspace
# with rm(list = ls()), which would also destroy unrelated objects.
library(data.table)

# DT1: 12 rule rows. minA/maxA bound a range for A, C lists comma-separated
# day codes and TARGET is the value to be looked up.
# The short vectors are repeated explicitly instead of relying on implicit
# recycling inside data.table().
DT1 <- data.table(
  INDEX1 = 1:12,
  minA = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  maxA = rep(c(4, 5, 6), times = 4),
  C = rep(c("Mon,Tue", "Mon,Wed", "Tue,Thu", "Wed,Thu"), times = 3),
  TARGET = 101:112
)

# size scales DT2: the two query rows (A = 3 / "Mon" and A = 4 / "Thu") are
# repeated size times.
size <- 20000
DT2 <- data.table(
  A = rep(c(3, 4), size),
  C = rep(c("Mon", "Thu"), size),
  INDEX2 = seq_len(2 * size) # stays empty for size = 0, unlike 1:(2 * size)
)
#' Find the DT1 row(s) with the highest TARGET that match one query.
#'
#' Reads the global table `DT1`. A row matches when its `C` string contains
#' the pattern `i.C` and `i.A` satisfies `minA <= i.A` and `maxA >= i.A`.
#'
#' @param i.A Numeric value compared against the `minA`/`maxA` columns.
#' @param i.C Pattern (regular expression) searched for in `DT1$C`.
#' @return The matching rows of `DT1` whose `TARGET` is maximal; a zero-row
#'   table when nothing matches.
foo <- function(i.A, i.C) {
  # grepl() yields a per-row logical mask. The former
  # `INDEX1 %in% grep(i.C, C)` compared row positions with INDEX1 values,
  # which is only correct while INDEX1 happens to be 1..nrow(DT1).
  hits <- DT1[grepl(i.C, C) & minA <= i.A & maxA >= i.A]
  if (nrow(hits) == 0L) {
    # Avoid max() on an empty vector (it warns and returns -Inf).
    return(hits)
  }
  hits[TARGET == max(TARGET)]
}
#' Return the INDEX2 values of the rows in `data` that satisfy one rule.
#'
#' @param data A data.table with columns `A`, `C` and `INDEX2`.
#' @param i.A Length-two vector holding the lower and upper bound for `A`.
#' @param i.C Character vector of accepted values for `C`.
#' @return The `INDEX2` entries of all rows whose `C` is in `i.C` and whose
#'   `A` lies between the two bounds.
foo_new <- function(data, i.A, i.C) {
  is.match <- data[["C"]] %in% i.C & data[["A"]] %between% i.A
  # which() drops NA entries, mirroring how data.table treats NA in `i`.
  data[["INDEX2"]][which(is.match)]
}
# with foo: look up the best DT1 row for every DT2 row (one call per INDEX2).
# The original line ended in a stray ")" and did not parse.
DT2[, foo(i.A = A, i.C = C), by = INDEX2]

# with foo_new: iterate over the rows of DT1 instead of the rows of DT2.
# DT1 is visited from highest to lowest TARGET, so the first rule that matches
# a DT2 row is also the one with the largest TARGET; rows that already have a
# TARGET are excluded from later passes.
DT1.ordered <- DT1[order(TARGET, decreasing = TRUE)]
DT2[, TARGET := NA_real_]
for (i in seq_len(nrow(DT1.ordered))) {
  # Only rows still lacking a TARGET are candidates. Row subsetting already
  # returns a new table, so no extra copy() is needed.
  restdata <- DT2[is.na(TARGET)]
  if (nrow(restdata) == 0L) {
    break # every DT2 row has been assigned
  }
  tmp.index <- foo_new(
    data = restdata,
    i.A = c(DT1.ordered$minA[i], DT1.ordered$maxA[i]),
    i.C = strsplit(DT1.ordered$C[i], ",", fixed = TRUE)[[1L]]
  )
  target.i <- as.numeric(DT1.ordered$TARGET[i])
  DT2[INDEX2 %in% tmp.index, TARGET := target.i]
}
rm(list=ls())
library(data.table)
DT1新解决方案
我意识到遍历更大的data.table的每一行都需要花费很多时间,因此我构建了一个新函数foo_new
,它的工作方式正好相反:
#' Return the INDEX2 values of the rows in `data` that satisfy one rule.
#'
#' @param data A data.table with columns `A`, `C` and `INDEX2`.
#' @param i.A Length-two vector holding the lower and upper bound for `A`.
#' @param i.C Character vector of accepted values for `C`.
#' @return The `INDEX2` entries of all rows whose `C` is in `i.C` and whose
#'   `A` lies between the two bounds.
foo_new <- function(data, i.A, i.C) {
  is.match <- data[["C"]] %in% i.C & data[["A"]] %between% i.A
  # which() drops NA entries, mirroring how data.table treats NA in `i`.
  data[["INDEX2"]][which(is.match)]
}
这种做法可能只在DT1比DT2小的时候才有用——而这正是我的情况
这里是我的整个模拟代码:
# Example data. Run this in a fresh R session instead of wiping the workspace
# with rm(list = ls()), which would also destroy unrelated objects.
library(data.table)

# DT1: 12 rule rows. minA/maxA bound a range for A, C lists comma-separated
# day codes and TARGET is the value to be looked up.
# The short vectors are repeated explicitly instead of relying on implicit
# recycling inside data.table().
DT1 <- data.table(
  INDEX1 = 1:12,
  minA = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  maxA = rep(c(4, 5, 6), times = 4),
  C = rep(c("Mon,Tue", "Mon,Wed", "Tue,Thu", "Wed,Thu"), times = 3),
  TARGET = 101:112
)

# size scales DT2: the two query rows (A = 3 / "Mon" and A = 4 / "Thu") are
# repeated size times.
size <- 20000
DT2 <- data.table(
  A = rep(c(3, 4), size),
  C = rep(c("Mon", "Thu"), size),
  INDEX2 = seq_len(2 * size) # stays empty for size = 0, unlike 1:(2 * size)
)
#' Find the DT1 row(s) with the highest TARGET that match one query.
#'
#' Reads the global table `DT1`. A row matches when its `C` string contains
#' the pattern `i.C` and `i.A` satisfies `minA <= i.A` and `maxA >= i.A`.
#'
#' @param i.A Numeric value compared against the `minA`/`maxA` columns.
#' @param i.C Pattern (regular expression) searched for in `DT1$C`.
#' @return The matching rows of `DT1` whose `TARGET` is maximal; a zero-row
#'   table when nothing matches.
foo <- function(i.A, i.C) {
  # grepl() yields a per-row logical mask. The former
  # `INDEX1 %in% grep(i.C, C)` compared row positions with INDEX1 values,
  # which is only correct while INDEX1 happens to be 1..nrow(DT1).
  hits <- DT1[grepl(i.C, C) & minA <= i.A & maxA >= i.A]
  if (nrow(hits) == 0L) {
    # Avoid max() on an empty vector (it warns and returns -Inf).
    return(hits)
  }
  hits[TARGET == max(TARGET)]
}
#' Return the INDEX2 values of the rows in `data` that satisfy one rule.
#'
#' @param data A data.table with columns `A`, `C` and `INDEX2`.
#' @param i.A Length-two vector holding the lower and upper bound for `A`.
#' @param i.C Character vector of accepted values for `C`.
#' @return The `INDEX2` entries of all rows whose `C` is in `i.C` and whose
#'   `A` lies between the two bounds.
foo_new <- function(data, i.A, i.C) {
  is.match <- data[["C"]] %in% i.C & data[["A"]] %between% i.A
  # which() drops NA entries, mirroring how data.table treats NA in `i`.
  data[["INDEX2"]][which(is.match)]
}
# with foo: look up the best DT1 row for every DT2 row (one call per INDEX2).
# The original line ended in a stray ")" and did not parse.
DT2[, foo(i.A = A, i.C = C), by = INDEX2]

# with foo_new: iterate over the rows of DT1 instead of the rows of DT2.
# DT1 is visited from highest to lowest TARGET, so the first rule that matches
# a DT2 row is also the one with the largest TARGET; rows that already have a
# TARGET are excluded from later passes.
DT1.ordered <- DT1[order(TARGET, decreasing = TRUE)]
DT2[, TARGET := NA_real_]
for (i in seq_len(nrow(DT1.ordered))) {
  # Only rows still lacking a TARGET are candidates. Row subsetting already
  # returns a new table, so no extra copy() is needed.
  restdata <- DT2[is.na(TARGET)]
  if (nrow(restdata) == 0L) {
    break # every DT2 row has been assigned
  }
  tmp.index <- foo_new(
    data = restdata,
    i.A = c(DT1.ordered$minA[i], DT1.ordered$maxA[i]),
    i.C = strsplit(DT1.ordered$C[i], ",", fixed = TRUE)[[1L]]
  )
  target.i <- as.numeric(DT1.ordered$TARGET[i])
  DT2[INDEX2 %in% tmp.index, TARGET := target.i]
}
rm(list=ls())
library(data.table)
DT1
# Example data. Run this in a fresh R session instead of wiping the workspace
# with rm(list = ls()), which would also destroy unrelated objects.
library(data.table)

# DT1: 12 rule rows. minA/maxA bound a range for A, C lists comma-separated
# day codes and TARGET is the value to be looked up.
# The short vectors are repeated explicitly instead of relying on implicit
# recycling inside data.table().
DT1 <- data.table(
  INDEX1 = 1:12,
  minA = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  maxA = rep(c(4, 5, 6), times = 4),
  C = rep(c("Mon,Tue", "Mon,Wed", "Tue,Thu", "Wed,Thu"), times = 3),
  TARGET = 101:112
)

# size scales DT2: the two query rows (A = 3 / "Mon" and A = 4 / "Thu") are
# repeated size times.
size <- 20000
DT2 <- data.table(
  A = rep(c(3, 4), size),
  C = rep(c("Mon", "Thu"), size),
  INDEX2 = seq_len(2 * size) # stays empty for size = 0, unlike 1:(2 * size)
)
#' Find the DT1 row(s) with the highest TARGET that match one query.
#'
#' Reads the global table `DT1`. A row matches when its `C` string contains
#' the pattern `i.C` and `i.A` satisfies `minA <= i.A` and `maxA >= i.A`.
#'
#' @param i.A Numeric value compared against the `minA`/`maxA` columns.
#' @param i.C Pattern (regular expression) searched for in `DT1$C`.
#' @return The matching rows of `DT1` whose `TARGET` is maximal; a zero-row
#'   table when nothing matches.
foo <- function(i.A, i.C) {
  # grepl() yields a per-row logical mask. The former
  # `INDEX1 %in% grep(i.C, C)` compared row positions with INDEX1 values,
  # which is only correct while INDEX1 happens to be 1..nrow(DT1).
  hits <- DT1[grepl(i.C, C) & minA <= i.A & maxA >= i.A]
  if (nrow(hits) == 0L) {
    # Avoid max() on an empty vector (it warns and returns -Inf).
    return(hits)
  }
  hits[TARGET == max(TARGET)]
}
#' Return the INDEX2 values of the rows in `data` that satisfy one rule.
#'
#' @param data A data.table with columns `A`, `C` and `INDEX2`.
#' @param i.A Length-two vector holding the lower and upper bound for `A`.
#' @param i.C Character vector of accepted values for `C`.
#' @return The `INDEX2` entries of all rows whose `C` is in `i.C` and whose
#'   `A` lies between the two bounds.
foo_new <- function(data, i.A, i.C) {
  is.match <- data[["C"]] %in% i.C & data[["A"]] %between% i.A
  # which() drops NA entries, mirroring how data.table treats NA in `i`.
  data[["INDEX2"]][which(is.match)]
}
# with foo: look up the best DT1 row for every DT2 row (one call per INDEX2).
# The original line ended in a stray ")" and did not parse.
DT2[, foo(i.A = A, i.C = C), by = INDEX2]

# with foo_new: iterate over the rows of DT1 instead of the rows of DT2.
# DT1 is visited from highest to lowest TARGET, so the first rule that matches
# a DT2 row is also the one with the largest TARGET; rows that already have a
# TARGET are excluded from later passes.
DT1.ordered <- DT1[order(TARGET, decreasing = TRUE)]
DT2[, TARGET := NA_real_]
for (i in seq_len(nrow(DT1.ordered))) {
  # Only rows still lacking a TARGET are candidates. Row subsetting already
  # returns a new table, so no extra copy() is needed.
  restdata <- DT2[is.na(TARGET)]
  if (nrow(restdata) == 0L) {
    break # every DT2 row has been assigned
  }
  tmp.index <- foo_new(
    data = restdata,
    i.A = c(DT1.ordered$minA[i], DT1.ordered$maxA[i]),
    i.C = strsplit(DT1.ordered$C[i], ",", fixed = TRUE)[[1L]]
  )
  target.i <- as.numeric(DT1.ordered$TARGET[i])
  DT2[INDEX2 %in% tmp.index, TARGET := target.i]
}