R 使用子集（）进行矢量化？_R_Vectorization_Subset

R 使用子集（）进行矢量化？

R 使用子集（）进行矢量化？,r,vectorization,subset,R,Vectorization,Subset,对于一系列整数范围（V1到V2），我有一个分数（V3）的数据框该功能可以工作，但速度非常慢我怎样才能更有效地完成同样的事情 coord$V2=x&x>=scores[1]，3]）的速度大约是前者的两倍首先，重新创建数据： scores <- read.table(text=" V1 V2 V3 1 2037651 2037700 1.474269 2 2037659 2037708 1.021012 3 2037677 2037726 1.180993

对于一系列整数范围（V1 到 V2），我有一个包含分数（V3）的数据框。
该功能可以工作，但速度非常慢。
我怎样才能更有效地完成同样的事情？
coord$V2 <- sapply(coord$V1, function(x) mean(scores[scores[, 2] >= x & x >= scores[, 1], 3]))
的速度大约是前者的两倍
首先，重新创建数据：
# Six scored integer ranges: V1 = range start, V2 = range end, V3 = score.
# The first field of every data line is used as the row name.
scores <- read.table(
    row.names = 1,
    text = "       V1      V2      V3
1 2037651 2037700 1.474269
2 2037659 2037708 1.021012
3 2037677 2037726 1.180993
4 2037685 2037734 1.717131
5 2037703 2037752 2.361985
6 2037715 2037764 1.257013"
)

# The 20 consecutive positions 2037652 .. 2037671, stored as doubles.
coord <- data.frame(V1 = 2037651 + seq_len(20))

coord$V2 <- sapply(coord$V1, function(x) mean(scores[scores[, 2] >= x & x >= scores[, 1], 3]))
的速度大约是前者的两倍
首先，重新创建数据：
# Six scored integer ranges: V1 = range start, V2 = range end, V3 = score.
# The first field of every data line is used as the row name.
scores <- read.table(
    row.names = 1,
    text = "       V1      V2      V3
1 2037651 2037700 1.474269
2 2037659 2037708 1.021012
3 2037677 2037726 1.180993
4 2037685 2037734 1.717131
5 2037703 2037752 2.361985
6 2037715 2037764 1.257013"
)

# The 20 consecutive positions 2037652 .. 2037671, stored as doubles.
coord <- data.frame(V1 = 2037651 + seq_len(20))

以下是我提出的解决方案：
# Example data: each row of `scores` is one integer range with a score
# (column 1 = range start, column 2 = range end, column 3 = score).
scores <- read.table(header = FALSE,
                     text = "2037651 2037700 1.474269
                          2037659 2037708 1.021012
                          2037677 2037726 1.180993
                          2037685 2037734 1.717131
                          2037703 2037752 2.361985
                          2037715 2037764 1.257013")

# Positions for which the mean score of all covering ranges is wanted.
coord <- data.frame(V1 = c(2037652, 2037653, 2037654, 2037655, 2037656, 2037657,
                           2037658, 2037659, 2037660, 2037661, 2037662, 2037663,
                           2037664, 2037665, 2037666, 2037667, 2037668, 2037669,
                           2037670, 2037671))

coord_vec <- coord$V1                  # Store as a vector instead of data.frame
scores_mat <- as.matrix(scores)        # Store as a matrix instead of data.frame
results <- numeric(length(coord_vec))  # Pre-allocate vector to store results.

# seq_along() instead of 1:nrow(coord): with no coordinates, 1:0 would still
# run the body for i = 1 and i = 0 and grow `results` to length 1.
for (i in seq_along(coord_vec)) {
    # TRUE for every range [start, end] that contains the current position.
    select_rows <- ((scores_mat[, 1] <= coord_vec[i]) &
                    (scores_mat[, 2] >= coord_vec[i]))
    scores_subset <- scores_mat[select_rows, 3] # Use logical indexing.
    results[i] <- mean(scores_subset)           # NaN when no range matches.
}
results
#  [1] 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.247641
#  [9] 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641
# [17] 1.247641 1.247641 1.247641 1.247641

# Benchmark results using @GSee's code. Needs library(rbenchmark).
#        test replications elapsed relative user.self sys.self
# 4 bdemarest          100   0.046 1.000000     0.046    0.001
# 2      gsee          100   0.170 3.695652     0.170    0.001
# 1      orig          100   0.358 7.782609     0.360    0.001
# 3    sepehr          100   0.163 3.543478     0.164    0.000

scores = read.table(header=FALSE,
                    text="2037651 2037700 1.474269
                          2037659 2037708 1.021012
                          2037677 2037726 1.180993
                          2037685 2037734 1.717131
                          2037703 2037752 2.361985
                          2037715 2037764 1.257013")
coord = data.frame(V1=c(2037652, 2037653, 2037654, 2037655, 2037656, 2037657,
                     2037658, 2037659, 2037660, 2037661, 2037662, 2037663,
                     2037664, 2037665, 2037666, 2037667, 2037668, 2037669,
                     2037670, 2037671))
coord_vec = coord$V1                  # 存储为向量而不是 data.frame
scores_mat = as.matrix(scores)        # 存储为矩阵而不是 data.frame
results = numeric(length=nrow(coord)) # 预分配向量以存储结果。
for (i in 1:nrow(coord)) {
    select_rows = ((scores_mat[, 1] <= coord_vec[i]) &
                   (scores_mat[, 2] >= coord_vec[i]))
    scores_subset = scores_mat[select_rows, 3] # 使用逻辑索引。
    results[i] = mean(scores_subset)
}
results
#  [1] 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.247641
#  [9] 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641
# [17] 1.247641 1.247641 1.247641 1.247641
# 使用 @GSee 的代码进行基准测试的结果。需要 library(rbenchmark)。
#        test replications elapsed relative user.self sys.self
# 4 bdemarest          100   0.046 1.000000     0.046    0.001
# 2      gsee          100   0.170 3.695652     0.170    0.001
# 1      orig          100   0.358 7.782609     0.360    0.001
# 3    sepehr          100   0.163 3.543478     0.164    0.000

它似乎比其他提案快了一点。我非常肯定，这种优势是通过避免读取或写入data.frame（一种高开销函数）而获得的。此外，我使用逻辑索引代替subset（）
，以进一步减少开销。可能通过使用*ply策略可以加快速度？
以下是我建议的解决方案：
# Example data: each row of `scores` is one integer range with a score
# (column 1 = range start, column 2 = range end, column 3 = score).
scores <- read.table(header = FALSE,
                     text = "2037651 2037700 1.474269
                          2037659 2037708 1.021012
                          2037677 2037726 1.180993
                          2037685 2037734 1.717131
                          2037703 2037752 2.361985
                          2037715 2037764 1.257013")

# Positions for which the mean score of all covering ranges is wanted.
coord <- data.frame(V1 = c(2037652, 2037653, 2037654, 2037655, 2037656, 2037657,
                           2037658, 2037659, 2037660, 2037661, 2037662, 2037663,
                           2037664, 2037665, 2037666, 2037667, 2037668, 2037669,
                           2037670, 2037671))

coord_vec <- coord$V1                  # Store as a vector instead of data.frame
scores_mat <- as.matrix(scores)        # Store as a matrix instead of data.frame
results <- numeric(length(coord_vec))  # Pre-allocate vector to store results.

# seq_along() instead of 1:nrow(coord): with no coordinates, 1:0 would still
# run the body for i = 1 and i = 0 and grow `results` to length 1.
for (i in seq_along(coord_vec)) {
    # TRUE for every range [start, end] that contains the current position.
    select_rows <- ((scores_mat[, 1] <= coord_vec[i]) &
                    (scores_mat[, 2] >= coord_vec[i]))
    scores_subset <- scores_mat[select_rows, 3] # Use logical indexing.
    results[i] <- mean(scores_subset)           # NaN when no range matches.
}
results
#  [1] 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.247641
#  [9] 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641
# [17] 1.247641 1.247641 1.247641 1.247641

# Benchmark results using @GSee's code. Needs library(rbenchmark).
#        test replications elapsed relative user.self sys.self
# 4 bdemarest          100   0.046 1.000000     0.046    0.001
# 2      gsee          100   0.170 3.695652     0.170    0.001
# 1      orig          100   0.358 7.782609     0.360    0.001
# 3    sepehr          100   0.163 3.543478     0.164    0.000

scores = read.table(header=FALSE,
                    text="2037651 2037700 1.474269
                          2037659 2037708 1.021012
                          2037677 2037726 1.180993
                          2037685 2037734 1.717131
                          2037703 2037752 2.361985
                          2037715 2037764 1.257013")
coord = data.frame(V1=c(2037652, 2037653, 2037654, 2037655, 2037656, 2037657,
                     2037658, 2037659, 2037660, 2037661, 2037662, 2037663,
                     2037664, 2037665, 2037666, 2037667, 2037668, 2037669,
                     2037670, 2037671))
coord_vec = coord$V1                  # 存储为向量而不是 data.frame
scores_mat = as.matrix(scores)        # 存储为矩阵而不是 data.frame
results = numeric(length=nrow(coord)) # 预分配向量以存储结果。
for (i in 1:nrow(coord)) {
    select_rows = ((scores_mat[, 1] <= coord_vec[i]) &
                   (scores_mat[, 2] >= coord_vec[i]))
    scores_subset = scores_mat[select_rows, 3] # 使用逻辑索引。
    results[i] = mean(scores_subset)
}
results
#  [1] 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.247641
#  [9] 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641
# [17] 1.247641 1.247641 1.247641 1.247641
# 使用 @GSee 的代码进行基准测试的结果。需要 library(rbenchmark)。
#        test replications elapsed relative user.self sys.self
# 4 bdemarest          100   0.046 1.000000     0.046    0.001
# 2      gsee          100   0.170 3.695652     0.170    0.001
# 1      orig          100   0.358 7.782609     0.360    0.001
# 3    sepehr          100   0.163 3.543478     0.164    0.000

它似乎比其他提案快了一点。我非常肯定，这种优势是通过避免读取或写入data.frame（一种高开销函数）而获得的。此外，我使用逻辑索引代替subset（）
，以进一步减少开销。可能通过使用*ply策略可以加快速度？
你是指coords$V
还是coords$V1
？我想你可能想使用cut来创建一个新列，然后使用split-lapply组合，但很难准确猜测你想要的是什么。我没有得到与我使用你的代码时相同的输出。我的解决方案是：coord$v2x&x>分数[，1]，3]）
。然后我得到您显示的输出，但它比您的慢4倍：-（当我使用for循环时，所有V2都是1.474269）我刚刚发布了一个类似的代码，然后我看到了你的评论，我删除了我的代码。虽然sapply和lapply仍然是loop，但我认为它们处理数据的效率更高，应该更快！我想知道你为什么说它慢了4倍？对不起。我最初制作了coord
向量，所以我不得不将nrow
改为le当我制作了coord
adata.frame
时，我忘了把它改回来。现在，看起来使用sapply
要快一点，你是说coords$V
还是coords$V1
？我想你可能想使用cut来创建一个新的列，然后使用split-lapply组合，但很难猜测精确我没有得到与使用代码时相同的输出。我的解决方案是：coord$V2 x&x>scores[，1]，3]）。然后我得到了您显示的输出，但是它比您的慢4倍：-（当我使用for循环时，所有V2都是1.474269）我刚刚发布了一个类似的代码，然后
# Add a column V2 to `coord` holding, for every position in coord$V1, the
# mean of column 3 of `scores` over all rows whose range
# [scores[, 1], scores[, 2]] contains that position.
#
# `scores` is not an argument; it is looked up in the enclosing environment.
#
# coord:   data frame with a numeric column V1.
# returns: `coord` with V2 added (NaN where no range contains the position).
gsee <- function(coord) {
    # vapply() rather than sapply(): the result is always a numeric vector,
    # even for zero rows, where sapply() would return list().
    coord$V2 <- vapply(
        coord$V1,
        function(x) mean(scores[scores[, 2] >= x & x >= scores[, 1], 3]),
        numeric(1)
    )
    coord
}

# Baseline implementation: for every row of `coord`, subset() the rows of
# `scores` whose range [V1, V2] contains coord$V1[i] and store the mean of
# their V3 in coord$V2[i].
#
# `scores` is not an argument; it is looked up in the enclosing environment.
# The row-by-row subset() and data-frame assignment are kept on purpose,
# since this is the version the other implementations are timed against.
#
# coord:   data frame with a numeric column V1.
# returns: `coord` with V2 added (NaN where no range contains the position).
orig <- function(coord) {
    # Create the column up front so a zero-row input still gets a numeric V2.
    coord$V2 <- rep(NA_real_, NROW(coord))
    # seq_len() rather than 1:NROW(coord): 1:0 would iterate over c(1, 0)
    # and the assignment below would fail on a zero-row data frame.
    for (i in seq_len(NROW(coord))) {
        range_scores <- subset(scores,
                               scores$V1 <= coord$V1[i] &
                                   scores$V2 >= coord$V1[i])
        coord$V2[i] <- mean(range_scores$V3)
    }
    coord
}
# Check that both implementations return exactly the same data frame.
identical(gsee(coord), orig(coord))  # TRUE
# Time the two implementations against each other.
# NOTE(review): benchmark() is not defined in this file; presumably it is
# rbenchmark::benchmark, so library(rbenchmark) must be loaded first.
benchmark(orig=orig(coord), gsee=gsee(coord))

test replications elapsed relative user.self sys.self user.child sys.child
2 gsee          100   0.175 1.000000     0.175    0.000          0         0
1 orig          100   0.379 2.165714     0.377    0.002          0         0 

# lapply()-based variant: add a column V2 to `coord` holding, for every
# position in coord$V1, the mean of column 3 of `scores` over all rows whose
# range [scores[, 1], scores[, 2]] contains that position.
#
# `scores` is not an argument; it is looked up in the enclosing environment.
#
# coord:   data frame with a numeric column V1.
# returns: `coord` with V2 added (NaN where no range contains the position).
sepehr <- function(coord) {
    means <- lapply(
        coord$V1,
        function(x) mean(scores[scores[, 2] >= x & x >= scores[, 1], 3])
    )
    # unlist() of an empty list is NULL, and assigning NULL would silently
    # add no column; as.numeric() turns that case into numeric(0).
    coord$V2 <- as.numeric(unlist(means, use.names = FALSE))
    coord
}
# Time all three implementations against each other.
# NOTE(review): benchmark() is not defined in this file; presumably it is
# rbenchmark::benchmark, so library(rbenchmark) must be loaded first.
benchmark(orig=orig(coord), gsee=gsee(coord), sepehr=sepehr(coord))
test replications elapsed relative user.self sys.self user.child sys.child
2   gsee          100   0.171 1.023952     0.171    0.000          0         0
1   orig          100   0.369 2.209581     0.369    0.001          0         0
3 sepehr          100   0.167 1.000000     0.167    0.000          0         0

# Example data: each row of `scores` is one integer range with a score
# (column 1 = range start, column 2 = range end, column 3 = score).
scores <- read.table(header = FALSE,
                     text = "2037651 2037700 1.474269
                          2037659 2037708 1.021012
                          2037677 2037726 1.180993
                          2037685 2037734 1.717131
                          2037703 2037752 2.361985
                          2037715 2037764 1.257013")

# Positions for which the mean score of all covering ranges is wanted.
coord <- data.frame(V1 = c(2037652, 2037653, 2037654, 2037655, 2037656, 2037657,
                           2037658, 2037659, 2037660, 2037661, 2037662, 2037663,
                           2037664, 2037665, 2037666, 2037667, 2037668, 2037669,
                           2037670, 2037671))

coord_vec <- coord$V1                  # Store as a vector instead of data.frame
scores_mat <- as.matrix(scores)        # Store as a matrix instead of data.frame
results <- numeric(length(coord_vec))  # Pre-allocate vector to store results.

# seq_along() instead of 1:nrow(coord): with no coordinates, 1:0 would still
# run the body for i = 1 and i = 0 and grow `results` to length 1.
for (i in seq_along(coord_vec)) {
    # TRUE for every range [start, end] that contains the current position.
    select_rows <- ((scores_mat[, 1] <= coord_vec[i]) &
                    (scores_mat[, 2] >= coord_vec[i]))
    scores_subset <- scores_mat[select_rows, 3] # Use logical indexing.
    results[i] <- mean(scores_subset)           # NaN when no range matches.
}
results
#  [1] 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.474269 1.247641
#  [9] 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641 1.247641
# [17] 1.247641 1.247641 1.247641 1.247641

# Benchmark results using @GSee's code. Needs library(rbenchmark).
#        test replications elapsed relative user.self sys.self
# 4 bdemarest          100   0.046 1.000000     0.046    0.001
# 2      gsee          100   0.170 3.695652     0.170    0.001
# 1      orig          100   0.358 7.782609     0.360    0.001
# 3    sepehr          100   0.163 3.543478     0.164    0.000