R 对数据帧中的成对元素进行操作_R_For Loop_Apply_Reshape_Microbenchmark

R 对数据帧中的成对元素进行操作

r for-loop

R 对数据帧中的成对元素进行操作,r,for-loop,apply,reshape,microbenchmark,R,For Loop,Apply,Reshape,Microbenchmark,我有两个数据帧，x和weights，其中列是成对的。以下是示例数据帧： x = read.table(text = " yr1 yr2 yr3 yr4 10 15 6 8 10 20 30 NA NA 5 2 3 100 100 NA NA", sep = "", header = TRUE) weights = read.table(text = " yr1 yr2 yr3 yr4 2

我有两个数据帧 `x` 和 `weights`，其中的列是成对的。以下是示例数据帧：

# Example values; columns come in pairs (yr1/yr2 and yr3/yr4) and NA marks
# a missing observation.
x <- read.table(
  header = TRUE,
  sep = "",
  text = "
  yr1  yr2  yr3  yr4
   10   15    6    8
   10   20   30   NA
   NA    5    2    3
  100  100   NA   NA"
)

# Weights laid out cell-for-cell like `x`.
weights <- read.table(
  header = TRUE,
  sep = "",
  text = "
  yr1  yr2  yr3  yr4
    2    4    1    3
    2    2    4    2
    3    2    2    3
    4    2    2    4"
)

列 `yr1` 和 `yr2` 是一对，列 `yr3` 和 `yr4` 是另一对。在我的实际数据中，列一直排到 `yr100`，共有 50 对列。

如果 `x` 中 `yr1` 或 `yr2` 有缺失，我想按如下方式填充缺失的观测值，例如：

(5 / 2) * 3

同样，对于

yr3

或

yr4

：

(30 / 4) * 2

其中 5（或 30）是数据帧 `x` 中给定一对元素里未缺失的那个元素。第一个示例中的值 2 和 3（以及第二个示例中的值 4 和 2）是 `weights` 数据帧中与 `x` 数据帧中该对元素相对应的元素。如果一对中的两个元素在 `x` 中都缺失，我希望将它们保留为缺失。

下面是 R 代码，它使用嵌套的 for 循环执行上述操作。然而，我的实际数据集有 2000 或 3000 行，嵌套的 for 循环到现在已经运行了 10 个小时以上：
# Walk every column pair and every row; when exactly one element of a pair is
# missing, derive it from its partner rescaled by the matching weights.
for (pair in 1:(ncol(x) / 2)) {
  first <- 2 * pair - 1   # odd-numbered column of this pair
  second <- 2 * pair      # even-numbered column of this pair

  for (r in 1:nrow(x)) {

    # Only the first element is missing: rebuild it from the second.
    if (is.na(x[r, first]) && !is.na(x[r, second])) {
      x[r, first] <- (x[r, second] / weights[r, second]) * weights[r, first]
    }

    # Only the second element is missing: rebuild it from the first.
    if (!is.na(x[r, first]) && is.na(x[r, second])) {
      x[r, second] <- (x[r, first] / weights[r, first]) * weights[r, second]
    }

    # Both elements missing: they stay NA (these two checks rewrite NA with NA).
    if (is.na(x[r, first]) && is.na(x[r, second])) {
      x[r, first] <- NA
    }
    if (is.na(x[r, first]) && is.na(x[r, second])) {
      x[r, second] <- NA
    }

  }
}

我已经意识到第三个和第四个 if 语句可能是不必要的。如果我删除这两条 if 语句，运行这段代码的时间可能会大大减少。
但是，我还想出了以下替代方案，它使用 `reshape` 而不是嵌套的 for 循环：
n.years <- 4

# Odd-numbered columns hold the first element of each pair, even-numbered
# columns hold the second.
pair.columns <- list(seq(1, n.years - 1, by = 2), seq(2, n.years, by = 2))

# Stack the pairs so each long-format row carries one (yr1, yr2) pair.
x2 <- reshape(
  x,
  varying = pair.columns,
  v.names = c("yr1", "yr2"),
  times = c("t1", "t2"),
  direction = "long"
)
wt2 <- reshape(
  weights,
  varying = pair.columns,
  v.names = c("yr1", "yr2"),
  times = c("t1", "t2"),
  direction = "long"
)

# Fill a missing element from its partner, rescaled by the paired weights.
x2$yr1 <- ifelse(is.na(x2$yr1), (x2$yr2 / wt2$yr2) * wt2$yr1, x2$yr1)
x2$yr2 <- ifelse(is.na(x2$yr2), (x2$yr1 / wt2$yr1) * wt2$yr2, x2$yr2)

# Back to the wide layout, then print the result.
x3 <- reshape(
  x2,
  varying = list(seq(1, 3, 2), seq(2, 4, 2)),
  v.names = c("yr1", "yr2"),
  times = c("t1", "t2"),
  direction = "wide"
)
x3

这对你有用吗？
请注意，我没有使用你的替换公式，因为我觉得它有点令人困惑，所以你需要自行修正如何用你的公式替换 yr1 和 yr2 变量。此外，如果需要把结果附加回原始数据帧，可能还需要对结果再做一次 reshape。
# Column positions of the 50 pairs: 1, 3, ..., 99 and 2, 4, ..., 100.
odd.cols <- seq(1, 99, by = 2)
even.cols <- seq(2, 100, by = 2)

# Long format: one row per (row, pair) with the pair's values in v1 / v2 ...
newx <- reshape(
  x,
  direction = "long",
  varying = list(odd.cols, even.cols),
  v.names = c("v1", "v2")
)

# ... and the matching weights in w1 / w2.
newwt <- reshape(
  weights,
  direction = "long",
  varying = list(odd.cols, even.cols),
  v.names = c("w1", "w2")
)

# Weighted mean of one (x, y) pair with weights (wtx, wty).
#   - both values missing -> NA
#   - one value missing   -> the missing one is replaced by its partner
#                            (placeholder for the real replacement formula)
#   - otherwise           -> plain weighted mean
condwtmean <- function(x, y, wtx, wty) {
  x_missing <- is.na(x)
  y_missing <- is.na(y)

  if (x_missing && y_missing) {
    return(NA)
  }

  if (x_missing) {
    x <- y # replacement function
  } else if (y_missing) {
    y <- x # replacement function
  }

  weighted.mean(c(x, y), c(wtx, wty))
}
# Apply condwtmean element-wise across the stacked values and weights.
newx[["wtmean"]] <- mapply(
  FUN = condwtmean,
  newx$v1, newx$v2, newwt$w1, newwt$w2
)

托马斯的答案比我尝试的三种方法中的任何一种都要好。在这里，我用 microbenchmark 对这四种方法进行比较。我还没有用实际数据尝试托马斯的答案。我最初的嵌套 for 循环方法在 22 小时后仍在运行。
Unit: milliseconds
             expr       min        lq   median       uq      max neval
 fn.1(x, weights)  98.69133  99.47574 100.5313 101.7315 108.8757    20
 fn.2(x, weights) 755.51583 758.12175 762.3775 776.0558 801.9615    20
 fn.3(x, weights) 564.21423 567.98822 568.5322 571.0975 575.1809    20
 fn.4(x, weights) 367.05862 370.52657 371.7439 373.7367 395.0423    20

#########################################################################################

# create data

set.seed(1234)

n.rows <- 40
n.cols <- 40
n.sample <- n.rows * n.cols

# Draw values from 1..20, blank out 10 * n.cols randomly chosen cells, then
# fold the vector into an n.rows-row data frame.
x <- sample(20, size = n.sample, replace = TRUE)
x.NA <- sample(n.rows * n.cols, size = 10 * (n.sample / n.rows), replace = FALSE)
is.na(x) <- x.NA
x <- as.data.frame(matrix(x, nrow = n.rows))

# Weights drawn from 1..4, same layout as `x`; printed for inspection.
weights <- as.data.frame(
  matrix(sample(4, size = n.sample, replace = TRUE), nrow = n.rows)
)
weights

#########################################################################################

# Thomas's function

# Weighted mean of every column pair, computed on long-format data.
#
# x, weights: data frames of identical layout whose columns come in pairs
#             (1-2, 3-4, ...); the number of columns must be even.
# Returns a data frame with one column per pair (X1, X2, ...) holding the
# weighted mean of the pair; a pair with both values missing yields NA.
#
# The pair count is taken from ncol(x) rather than from a global variable,
# so the function works on any even-width input.
fn.1 <- function(x, weights) {
  stopifnot(ncol(x) %% 2 == 0)

  n_pairs <- ncol(x) / 2
  odd_cols <- 2 * seq_len(n_pairs) - 1
  even_cols <- odd_cols + 1

  # One long-format row per (row, pair): values in v1/v2, weights in w1/w2.
  newx <- reshape(x, direction = "long",
                  varying = list(odd_cols, even_cols),
                  v.names = c("v1", "v2"))
  newwt <- reshape(weights, direction = "long",
                   varying = list(odd_cols, even_cols),
                   v.names = c("w1", "w2"))

  # Scalar helper: fill a single missing element from its partner (rescaled
  # by the weights), then take the weighted mean of the pair.
  condwtmean <- function(x, y, wtx, wty) {
    if (is.na(x) && is.na(y)) {
      return(NA)
    }
    if (is.na(x)) {
      x <- (y / wty) * wtx # replacement function
    } else if (is.na(y)) {
      y <- (x / wtx) * wty # replacement function
    }
    weighted.mean(c(x, y), c(wtx, wty))
  }

  newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)

  # Spread the per-pair means back out: one column per pair. Columns are
  # picked by name so the step does not depend on reshape()'s column order.
  newx2 <- reshape(newx[, c("time", "id", "wtmean")],
                   v.names = "wtmean", timevar = "time", idvar = "id",
                   direction = "wide")

  # Drop the leading id column; drop = FALSE keeps a data frame even when
  # there is only one pair.
  newx2 <- newx2[, seq_len(n_pairs) + 1L, drop = FALSE]
  names(newx2) <- paste0("X", seq_len(n_pairs))

  newx2
}

fn.1.output <- fn.1(x, weights)

#########################################################################################

# nested for-loops with 4 if statements

# Weighted mean of every column pair using nested for-loops; this variant
# keeps the two redundant "both missing" checks in the fill pass, because it
# exists to be timed against the leaner variants.
#
# x, weights: data frames of identical layout whose columns come in pairs
#             (1-2, 3-4, ...); the number of columns must be even.
# Returns a numeric matrix with one row per row of x and one column per pair;
# a pair with both values missing yields NA.
fn.2 <- function(x, weights) {
  stopifnot(ncol(x) %% 2 == 0)

  n_pairs <- ncol(x) / 2
  n_rows <- nrow(x)

  # Pass 1: fill a single missing element from its partner, rescaled by the
  # matching weights.
  for (i in seq_len(n_pairs)) {
    lo <- 2 * i - 1 # first column of pair i
    hi <- 2 * i     # second column of pair i
    for (j in seq_len(n_rows)) {
      if (is.na(x[j, lo]) && !is.na(x[j, hi])) {
        x[j, lo] <- (x[j, hi] / weights[j, hi]) * weights[j, lo]
      }
      if (!is.na(x[j, lo]) && is.na(x[j, hi])) {
        x[j, hi] <- (x[j, lo] / weights[j, lo]) * weights[j, hi]
      }
      # Redundant by design: both checks overwrite NA with NA.
      if (is.na(x[j, lo]) && is.na(x[j, hi])) {
        x[j, lo] <- NA
      }
      if (is.na(x[j, lo]) && is.na(x[j, hi])) {
        x[j, hi] <- NA
      }
    }
  }

  # Per-pair row sums of a data frame, returned as an n_rows x n_pairs matrix.
  pair_row_sums <- function(df) {
    vapply(
      seq_len(n_pairs),
      function(i) rowSums(df[, c(2 * i - 1, 2 * i)], na.rm = TRUE),
      numeric(n_rows)
    )
  }

  # Pass 2: weighted mean = sum(value * weight) / sum(weight) within a pair.
  x.weights <- x * weights
  weighted.x <- pair_row_sums(x.weights) / pair_row_sums(weights)

  # Pass 3: patch up pairs that still contain NA after the fill pass.
  for (i in seq_len(n_pairs)) {
    lo <- 2 * i - 1
    hi <- 2 * i
    for (j in seq_len(n_rows)) {
      lo_missing <- is.na(x[j, lo])
      hi_missing <- is.na(x[j, hi])
      if (lo_missing && hi_missing) {
        # Nothing observed for this pair.
        weighted.x[j, i] <- NA
      } else if (lo_missing || hi_missing) {
        # Only reachable when the fill itself came out NA; fall back to the
        # one available value.
        weighted.x[j, i] <- sum(x[j, lo], x[j, hi], na.rm = TRUE)
      }
    }
  }

  weighted.x
}

fn.2.output <- fn.2(x, weights)

# Convert the result matrix to a data frame and name its columns X1, X2, ...
# from its own width rather than from a global column count.
fn.2.output <- as.data.frame(fn.2.output)
names(fn.2.output) <- paste0("X", seq_len(ncol(fn.2.output)))

#########################################################################################

# nested for-loops with 2 if statements

# Weighted mean of every column pair using nested for-loops; this variant
# has only the two fill checks in the first pass (no redundant "both
# missing" checks).
#
# x, weights: data frames of identical layout whose columns come in pairs
#             (1-2, 3-4, ...); the number of columns must be even.
# Returns a numeric matrix with one row per row of x and one column per pair;
# a pair with both values missing yields NA.
fn.3 <- function(x, weights) {
  stopifnot(ncol(x) %% 2 == 0)

  n_pairs <- ncol(x) / 2
  n_rows <- nrow(x)

  # Pass 1: fill a single missing element from its partner, rescaled by the
  # matching weights.
  for (i in seq_len(n_pairs)) {
    lo <- 2 * i - 1 # first column of pair i
    hi <- 2 * i     # second column of pair i
    for (j in seq_len(n_rows)) {
      if (is.na(x[j, lo]) && !is.na(x[j, hi])) {
        x[j, lo] <- (x[j, hi] / weights[j, hi]) * weights[j, lo]
      }
      if (!is.na(x[j, lo]) && is.na(x[j, hi])) {
        x[j, hi] <- (x[j, lo] / weights[j, lo]) * weights[j, hi]
      }
    }
  }

  # Per-pair row sums of a data frame, returned as an n_rows x n_pairs matrix.
  pair_row_sums <- function(df) {
    vapply(
      seq_len(n_pairs),
      function(i) rowSums(df[, c(2 * i - 1, 2 * i)], na.rm = TRUE),
      numeric(n_rows)
    )
  }

  # Pass 2: weighted mean = sum(value * weight) / sum(weight) within a pair.
  x.weights <- x * weights
  weighted.x <- pair_row_sums(x.weights) / pair_row_sums(weights)

  # Pass 3: patch up pairs that still contain NA after the fill pass.
  for (i in seq_len(n_pairs)) {
    lo <- 2 * i - 1
    hi <- 2 * i
    for (j in seq_len(n_rows)) {
      lo_missing <- is.na(x[j, lo])
      hi_missing <- is.na(x[j, hi])
      if (lo_missing && hi_missing) {
        # Nothing observed for this pair.
        weighted.x[j, i] <- NA
      } else if (lo_missing || hi_missing) {
        # Only reachable when the fill itself came out NA; fall back to the
        # one available value.
        weighted.x[j, i] <- sum(x[j, lo], x[j, hi], na.rm = TRUE)
      }
    }
  }

  weighted.x
}

fn.3.output <- fn.3(x, weights)

# Convert the result matrix to a data frame and name its columns X1, X2, ...
# from its own width rather than from a global column count.
fn.3.output <- as.data.frame(fn.3.output)
names(fn.3.output) <- paste0("X", seq_len(ncol(fn.3.output)))

#########################################################################################

# my reshape solution

# Weighted mean of every column pair: missing values are filled on
# long-format data with vectorised ifelse(), then the pairs are averaged.
#
# x, weights: data frames of identical layout whose columns come in pairs
#             (1-2, 3-4, ...); the number of columns must be even.
# Returns a numeric matrix with one row per row of x and one column per pair;
# a pair with both values missing yields NA.
#
# Column counts come from ncol(x) rather than from a global variable, so the
# function works on any even-width input.
fn.4 <- function(x, weights) {
  stopifnot(ncol(x) %% 2 == 0)

  n_cols <- ncol(x)
  n_pairs <- n_cols / 2
  n_rows <- nrow(x)
  odd_cols <- 2 * seq_len(n_pairs) - 1
  even_cols <- odd_cols + 1

  # One long-format row per (row, pair): values in v1/v2, weights in w1/w2.
  new.x <- reshape(x, direction = "long",
                   varying = list(odd_cols, even_cols),
                   v.names = c("v1", "v2"))
  wt <- reshape(weights, direction = "long",
                varying = list(odd_cols, even_cols),
                v.names = c("w1", "w2"))

  # Fill a missing element from its partner, rescaled by the paired weights.
  new.x$v1 <- ifelse(is.na(new.x$v1), (new.x$v2 / wt$w2) * wt$w1, new.x$v1)
  new.x$v2 <- ifelse(is.na(new.x$v2), (new.x$v1 / wt$w1) * wt$w2, new.x$v2)

  # Undo the long reshape; reshape() reads the layout it stored on new.x in
  # the "reshapeLong" attribute, so no column positions are hard-coded here.
  x2 <- reshape(new.x, direction = "wide")

  # Drop the leading id column to recover the original column layout.
  x <- x2[, seq_len(n_cols) + 1L]

  # Per-pair row sums of a data frame, returned as an n_rows x n_pairs matrix.
  pair_row_sums <- function(df) {
    vapply(
      seq_len(n_pairs),
      function(i) rowSums(df[, c(2 * i - 1, 2 * i)], na.rm = TRUE),
      numeric(n_rows)
    )
  }

  # Weighted mean = sum(value * weight) / sum(weight) within a pair.
  x.weights <- x * weights
  weighted.x <- pair_row_sums(x.weights) / pair_row_sums(weights)

  # Patch up pairs that still contain NA after the fill step.
  for (i in seq_len(n_pairs)) {
    lo <- 2 * i - 1 # first column of pair i
    hi <- 2 * i     # second column of pair i
    for (j in seq_len(n_rows)) {
      lo_missing <- is.na(x[j, lo])
      hi_missing <- is.na(x[j, hi])
      if (lo_missing && hi_missing) {
        # Nothing observed for this pair.
        weighted.x[j, i] <- NA
      } else if (lo_missing || hi_missing) {
        # Only reachable when the fill itself came out NA; fall back to the
        # one available value.
        weighted.x[j, i] <- sum(x[j, lo], x[j, hi], na.rm = TRUE)
      }
    }
  }

  weighted.x
}

fn.4.output <- fn.4(x, weights)

# Convert the result matrix to a data frame and name its columns X1, X2, ...
# from its own width rather than from a global column count.
fn.4.output <- as.data.frame(fn.4.output)
names(fn.4.output) <- paste0("X", seq_len(ncol(fn.4.output)))

#########################################################################################

# Reset the row names of every result to the default sequence before comparing.
rownames(fn.1.output) <- NULL
rownames(fn.2.output) <- NULL
rownames(fn.3.output) <- NULL
rownames(fn.4.output) <- NULL

# Pairwise comparison of the four results; each call prints TRUE or a
# description of the differences.
all.equal(fn.1.output, fn.2.output)
all.equal(fn.1.output, fn.3.output)
all.equal(fn.1.output, fn.4.output)
all.equal(fn.2.output, fn.3.output)
all.equal(fn.2.output, fn.4.output)
all.equal(fn.3.output, fn.4.output)

library(microbenchmark)

# Time each implementation on the same inputs, 20 evaluations per expression.
microbenchmark(fn.1(x, weights), fn.2(x, weights), fn.3(x, weights), fn.4(x, weights), times=20)

#########################################################################################

Unit: milliseconds
             expr       min        lq   median       uq      max neval
 fn.1(x, weights)  98.69133  99.47574 100.5313 101.7315 108.8757    20
 fn.2(x, weights) 755.51583 758.12175 762.3775 776.0558 801.9615    20
 fn.3(x, weights) 564.21423 567.98822 568.5322 571.0975 575.1809    20
 fn.4(x, weights) 367.05862 370.52657 371.7439 373.7367 395.0423    20
#########################################################################################
# create data
set.seed(1234)
n.rows …

您最终想对这些数据做什么？在这种情况下，我猜直接得到所需的输出可能比修改原始数据框更容易。
一旦填补了缺失的观测值，我希望使用 weights 数据框中的数据对每一对元素计算加权平均。