Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/64.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
将这个丑陋的for循环转换为更R友好的东西_R_Loops_Aggregate_Vectorization - Fatal编程技术网

将这个丑陋的for循环转换为更R友好的东西

将这个丑陋的for循环转换为更R友好的东西,r,loops,aggregate,vectorization,R,Loops,Aggregate,Vectorization,一直在使用SO作为我工作的资源。谢谢你召集这么一个伟大的社区 我正在尝试做一些比较复杂的事情,现在我能想到的唯一方法就是使用一对嵌套的for循环(我知道这在R中是不受欢迎的)。。。我有三百多万的课程注册记录:学生用户ID与课程ID配对。在每一行中,都有一组数据,包括开始/结束日期和分数等。我需要做的是,对于每次注册,计算该用户在注册课程之前所修课程的平均分数 我用于for循环的代码如下所示: data$Mean.Prior.Score <- 0 for (i in as.numeric(r

一直在使用SO作为我工作的资源。谢谢你召集这么一个伟大的社区

我正在尝试做一些比较复杂的事情,现在我能想到的唯一方法就是使用一对嵌套的for循环(我知道这在R中是不受欢迎的)。。。我有三百多万的课程注册记录:学生用户ID与课程ID配对。在每一行中,都有一组数据,包括开始/结束日期和分数等。我需要做的是,对于每次注册,计算该用户在注册课程之前所修课程的平均分数

我用于for循环的代码如下所示:

# For every enrollment (row i), average the scores of the same user's courses
# whose end date is strictly before this enrollment's start date.
# Rows with no such earlier course keep the initial value 0.
data$Mean.Prior.Score <- 0
# Iterate by position rather than by row name: row names are labels and only
# coincide with positions while they are the default 1..n.
for (i in seq_len(nrow(data))) {
    score.total <- 0
    n.prior <- 0
    # Positions of every row that belongs to the same user as row i.
    for (j in which(data$UserID == data$UserID[i])) {
        if (data$Course.End.Date[j] < data$Course.Start.Date[i]) {
            score.total <- score.total + data$Score[j]
            n.prior <- n.prior + 1
        }
    }
    if (n.prior != 0) {
        data$Mean.Prior.Score[i] <- score.total / n.prior
    }
}

这似乎是你想要的

library(data.table)
# Copy `data` into a data.table and key (sort) it on the user id column.
# NOTE(review): the question's loop refers to this column as `UserID`;
# confirm the real column name before running.
DT <- data.table(data)
setkey(DT, userID)

# Goal: per userID, the mean score over rows where
# Course.End.Date < Course.Start.Date.

# The attempt below is kept disabled because it is too simplistic.
# DT[Course.End.Date < Course.Start.Date,
#   list(Mean.Prior.Score = mean(Score)) , 
#   by = list(userID)]
库(data.table)
#创建data.table对象

这只是我认为解决方案可能包含的内容的概要。为了简单起见,我将使用
plyr
来说明所需的步骤

让我们仅限于一名学生的情况。如果我们可以为一个学生计算,那么用某种拆分应用扩展它将是微不足道的

假设我们有一个特定学生的分数,按课程结束日期排序:

# Simulated data for one student: 100 dates drawn with replacement from
# January 2011, sorted ascending, each paired with a standard-normal value.
d <- sample(
  seq(from = as.Date("2011-01-01"), to = as.Date("2011-01-31"), by = "day"),
  size = 100,
  replace = TRUE
)
dat <- data.frame(date = sort(d), val = rnorm(100))

我认为类似的方法应该可以奏效,尽管每个用户有多个课程的测试数据会有所帮助。也可能需要调整 findInterval 中的开始日期(+1),以使条件为 End.Date < Start.Date。

替代 for 循环的东西:

# data.frame for testing
# 100 user ids and 100 course ids, each drawn without replacement from 1..10000.
user <- sample.int(10000, 100)
course <- sample.int(10000, 100)
# Every course starts on one of 12 dates spaced three months apart, beginning
# 2004-01-01, and ends 11 weeks after it starts.
c_start <- sample(
  seq(as.Date("2004-01-01"), by = "3 months", length.out = 12),
  100, replace = TRUE
)
c_end <- c_start + as.difftime(11, units = "weeks")
# 1000 enrollments: a random user paired with a random course.
c_idx <- sample.int(100, 1000, replace = TRUE)
enroll <- data.frame(
  user = sample(user, 1000, replace = TRUE),
  course = course[c_idx],
  # Dates are stored as "YYYY-MM-DD" strings, so the `<` used below compares
  # them as text.
  c_start = as.character(c_start[c_idx]),
  c_end = as.character(c_end[c_idx]),
  score = runif(1000),
  stringsAsFactors = FALSE
)

#variant 1: for-loop
system.time({
# NA_real_ keeps the column numeric even if no enrollment has a prior course.
enroll$avg.p.score <- NA_real_
for (i in seq_len(nrow(enroll))) {
  score_total <- 0
  n_prior <- 0
  # All enrollments of the same user as row i.
  for (j in which(enroll$user == enroll$user[[i]])) {
    # Count only courses that ended strictly before this one started.
    if (enroll$c_end[[j]] < enroll$c_start[[i]]) {
      score_total <- score_total + enroll$score[[j]]
      n_prior <- n_prior + 1
    }
  }
  if (n_prior != 0) {
    enroll$avg.p.score[[i]] <- score_total / n_prior
  }
}
})

#variant 2: sqlite
system.time({
library(RSQLite)
# Open an in-memory database through a driver object; dbConnect() takes a
# driver object, not a driver name given as a string.
con <- dbConnect(SQLite(), ":memory:")
dbWriteTable(con, "enroll", enroll, overwrite = TRUE)

# Self-join of enroll: for each (user, course) in `e`, average the scores of
# that user's rows in `p` whose end date is before e's start date.
sql <- paste("Select e.user, e.course, Avg(p.score)",
             "from enroll as e",
             "cross join enroll as p", 
             "where e.user=p.user and p.c_end < e.c_start",
             "group by e.user, e.course;")
# dbGetQuery() sends the statement, fetches all rows and clears the result
# set, so no open result is left behind.
dat <- dbGetQuery(con, sql)
# Release the in-memory database once the result has been retrieved.
dbDisconnect(con)
})
#用于测试的data.frame

我无法真正测试这一点,因为您的数据在任何组合中似乎都不满足不等式,但我会尝试以下方法:

library(plyr)
# Per user: pair every row with every other row of that user (cross join),
# keep the pairs where course j ended before course i started, and average
# the scores of the j side.
# NOTE(review): the function returns a single value per User.ID group, so the
# result has one row per user rather than one per enrollment - confirm that
# this is what is wanted.
res <- ddply(data, .(User.ID), function(d) {
   pairs <- merge(d, d, by = NULL, suffixes = c(".i", ".j"))
   is.prior <- pairs$Course.End.Date.j < pairs$Course.Start.Date.i
   # which() drops NA comparisons, as subset() does.
   c(Mean.Prior.Score = mean(pairs$Score.j[which(is.prior)]))
})
# Users with no qualifying pair get mean(numeric(0)) = NaN; report 0 instead.
res$Mean.Prior.Score[is.nan(res$Mean.Prior.Score)] <- 0
库(plyr)

我发现
data.table
运行良好

# Create some data.
library(data.table)
set.seed(1)
n <- 3e6
numCourses <- 5 # Average courses per student
# One row per enrollment: random user id (as text), a running course number,
# a uniform score and a start date 1..365 days after 2000-01-01.
data <- data.table(
    UserID = as.character(round(runif(n, 1, round(n / numCourses)))),
    course = 1:n,
    Score = runif(n),
    CourseStartDate = as.Date('2000-01-01') + round(runif(n, 1, 365))
)
# Each course ends 1..100 days after it starts.
data$CourseEndDate <- data$CourseStartDate + round(runif(n, 1, 100))
# Key on UserID, which also sorts the rows by user.
setkey(data, UserID)
# test=function(CourseEndDate,Score,CourseStartDate) sapply(CourseStartDate, function(y) mean(Score[y>CourseEndDate]))
# I vastly reduced the number of comparisons with a better "test" function.
# Mean score of the courses that ended on or before each start date.
#
# CourseEndDate   end dates of one user's courses (any order)
# Score           scores, parallel to CourseEndDate
# CourseStartDate start dates to evaluate
#
# Returns one value per element of CourseStartDate; NA where no course had
# ended by that start date.
test2 <- function(CourseEndDate, Score, CourseStartDate) {
    o.end <- order(CourseEndDate)
    # run.avg[k] is the mean score of the k earliest-ending courses.
    run.avg <- cumsum(Score[o.end]) / seq_along(o.end)
    # Number of courses whose end date is <= each start date (0 = none).
    n.prior <- findInterval(CourseStartDate, CourseEndDate[o.end])
    # Shift by one so that a count of 0 selects the leading NA. This keeps the
    # index an integer; ifelse(idx == 0, NA, idx) turns into a logical NA
    # vector when every count is 0, and a logical index gets recycled.
    c(NA_real_, run.avg)[n.prior + 1L]
}
# Time the per-user computation and store it as a new column. The grouped
# result is assigned back by position, so it relies on `data` having been
# sorted by UserID (see the setkey() call above).
system.time(
    data$MeanPriorScore <- data[, test2(CourseEndDate, Score, CourseStartDate),
                                by = UserID]$V1
)
#  For three million courses, at an average of 5 courses per student:
#    user  system elapsed 
#    122.06    0.22  122.45 
#创建一些数据。
库(数据表)
种子(1)
n=3e6
numCourses=5#每个学生的平均课程数
data=data.table(UserID=as.character(round(runif(n,1,round(n/numCourses))),course=1:n,Score=runif(n),CourseStartDate=as.Date('2000-01-01')+round(runif(n,1365)))
数据$CourseEndDate=数据$CourseStartDate+round(runif(n,1100))
setkey(数据、用户ID)
#测试=函数(CourseEndDate、分数、CourseStartDate)平均值(CourseStartDate、函数(y)平均值(分数[y>CourseEndDate]))
#我使用更好的“测试”功能大大减少了比较的次数。
test2=函数(CourseEndDate、Score、CourseStartDate){
o、 结束=订单(CourseEndDate)
run.avg=cumsum(分数[o.end])/seq_(课程结束)
idx=findInterval(CourseStartDate,CourseEndDate[o.end])
idx=ifelse(idx==0,NA,idx)
run.avg[idx]
}

系统时间(数据$MEANPRIORSCOREPUT(头(数据,n=50))这样我们就可以理解您的数据了?data.table解决方案招手了。完成了…应该从一开始就这样做了;对不起。与其只显示它们,还不如在它们上使用dput,这样我们就可以复制itOr just
data+1,以便做出响应、礼貌并提供一个可复制的示例。FWIW,循环不受欢迎,如果您不重新分配您的对象。我强烈建议阅读Patrick Burns的R inferno(也经常访问此论坛)。我无法让dput处理此数据表,因为所有列都是因素,它希望编写所有级别,而不管我使用的子集是什么…@AndrewSannier use
subset()
的参数
drop=TRUE
@shujaa也不起作用。我想你的意思是类似于
dput(子集(数据[1:50,],drop=T))
数据\u 50您也可以使用
作为.character
而不是
液滴
。我对测试这一点很感兴趣,但似乎数据在
数据[,…,by=X]中被重新排序
,因此V1的赋值与原始未排序的数据不一致。我尝试使用
MeanPriorScore:=…
,但显然没有实现。有什么想法可以让它工作吗?抱歉,
test
函数中有一个小的输入错误。我添加了一个较小的测试集,并输出结果。数据会尽快重新排序n正如你所说的
setkey
。但是
MeanPriorScore
的顺序应该与新的顺序正确对齐,如上图所示。最后,我通过在
test
函数中使用
findInterval
大大提高了速度。我通过平均每个学生有5门课程来进行比较。非常感谢,@nograps。。。这是一个很好的解决方案。关于rownames的内容完全被破解了,其他的错误都是我在打字时犯的愚蠢错误,甚至在我运行的脚本中都没有。
# in the test data, one is POSIXct and the other a factor
data$Course.Start.Date <- as.Date(data$Course.Start.Date)
data$Course.End.Date <- as.Date(data$Course.End.Date)
# Sort by end date so that, within each user, the cumulative sum below runs
# over courses in the order they finished.
data <- data[order(data$Course.End.Date), ]
# ave() hands FUN the row positions of one user at a time.
data$Mean.Prior.Score <- ave(
    seq_along(data$User.ID),
    data$User.ID,
    FUN = function(i) {
        # Running mean of the user's scores, with a leading NA for "no
        # course finished yet".
        running <- c(NA, cumsum(data$Score[i]) / seq_along(i))
        # Count of the user's courses that ended on or before each start.
        n.prior <- findInterval(data$Course.Start.Date[i],
                                data$Course.End.Date[i])
        running[1L + n.prior]
    }
)
# data.frame for testing
# 100 user ids and 100 course ids, each drawn without replacement from 1..10000.
user <- sample.int(10000, 100)
course <- sample.int(10000, 100)
# Every course starts on one of 12 dates spaced three months apart, beginning
# 2004-01-01, and ends 11 weeks after it starts.
c_start <- sample(
  seq(as.Date("2004-01-01"), by = "3 months", length.out = 12),
  100, replace = TRUE
)
c_end <- c_start + as.difftime(11, units = "weeks")
# 1000 enrollments: a random user paired with a random course.
c_idx <- sample.int(100, 1000, replace = TRUE)
enroll <- data.frame(
  user = sample(user, 1000, replace = TRUE),
  course = course[c_idx],
  # Dates are stored as "YYYY-MM-DD" strings, so the `<` used below compares
  # them as text.
  c_start = as.character(c_start[c_idx]),
  c_end = as.character(c_end[c_idx]),
  score = runif(1000),
  stringsAsFactors = FALSE
)

#variant 1: for-loop
system.time({
# NA_real_ keeps the column numeric even if no enrollment has a prior course.
enroll$avg.p.score <- NA_real_
for (i in seq_len(nrow(enroll))) {
  score_total <- 0
  n_prior <- 0
  # All enrollments of the same user as row i.
  for (j in which(enroll$user == enroll$user[[i]])) {
    # Count only courses that ended strictly before this one started.
    if (enroll$c_end[[j]] < enroll$c_start[[i]]) {
      score_total <- score_total + enroll$score[[j]]
      n_prior <- n_prior + 1
    }
  }
  if (n_prior != 0) {
    enroll$avg.p.score[[i]] <- score_total / n_prior
  }
}
})

#variant 2: sqlite
system.time({
library(RSQLite)
# Open an in-memory database through a driver object; dbConnect() takes a
# driver object, not a driver name given as a string.
con <- dbConnect(SQLite(), ":memory:")
dbWriteTable(con, "enroll", enroll, overwrite = TRUE)

# Self-join of enroll: for each (user, course) in `e`, average the scores of
# that user's rows in `p` whose end date is before e's start date.
sql <- paste("Select e.user, e.course, Avg(p.score)",
             "from enroll as e",
             "cross join enroll as p", 
             "where e.user=p.user and p.c_end < e.c_start",
             "group by e.user, e.course;")
# dbGetQuery() sends the statement, fetches all rows and clears the result
# set, so no open result is left behind.
dat <- dbGetQuery(con, sql)
# Release the in-memory database once the result has been retrieved.
dbDisconnect(con)
})
library(plyr)
# Per user: pair every row with every other row of that user (cross join),
# keep the pairs where course j ended before course i started, and average
# the scores of the j side.
# NOTE(review): the function returns a single value per User.ID group, so the
# result has one row per user rather than one per enrollment - confirm that
# this is what is wanted.
res <- ddply(data, .(User.ID), function(d) {
   pairs <- merge(d, d, by = NULL, suffixes = c(".i", ".j"))
   is.prior <- pairs$Course.End.Date.j < pairs$Course.Start.Date.i
   # which() drops NA comparisons, as subset() does.
   c(Mean.Prior.Score = mean(pairs$Score.j[which(is.prior)]))
})
# Users with no qualifying pair get mean(numeric(0)) = NaN; report 0 instead.
res$Mean.Prior.Score[is.nan(res$Mean.Prior.Score)] <- 0
# Create some data.
library(data.table)
set.seed(1)
n <- 3e6
numCourses <- 5 # Average courses per student
# One row per enrollment: random user id (as text), a running course number,
# a uniform score and a start date 1..365 days after 2000-01-01.
data <- data.table(
    UserID = as.character(round(runif(n, 1, round(n / numCourses)))),
    course = 1:n,
    Score = runif(n),
    CourseStartDate = as.Date('2000-01-01') + round(runif(n, 1, 365))
)
# Each course ends 1..100 days after it starts.
data$CourseEndDate <- data$CourseStartDate + round(runif(n, 1, 100))
# Key on UserID, which also sorts the rows by user.
setkey(data, UserID)
# test=function(CourseEndDate,Score,CourseStartDate) sapply(CourseStartDate, function(y) mean(Score[y>CourseEndDate]))
# I vastly reduced the number of comparisons with a better "test" function.
# Mean score of the courses that ended on or before each start date.
#
# CourseEndDate   end dates of one user's courses (any order)
# Score           scores, parallel to CourseEndDate
# CourseStartDate start dates to evaluate
#
# Returns one value per element of CourseStartDate; NA where no course had
# ended by that start date.
test2 <- function(CourseEndDate, Score, CourseStartDate) {
    o.end <- order(CourseEndDate)
    # run.avg[k] is the mean score of the k earliest-ending courses.
    run.avg <- cumsum(Score[o.end]) / seq_along(o.end)
    # Number of courses whose end date is <= each start date (0 = none).
    n.prior <- findInterval(CourseStartDate, CourseEndDate[o.end])
    # Shift by one so that a count of 0 selects the leading NA. This keeps the
    # index an integer; ifelse(idx == 0, NA, idx) turns into a logical NA
    # vector when every count is 0, and a logical index gets recycled.
    c(NA_real_, run.avg)[n.prior + 1L]
}
# Time the per-user computation and store it as a new column. The grouped
# result is assigned back by position, so it relies on `data` having been
# sorted by UserID (see the setkey() call above).
system.time(
    data$MeanPriorScore <- data[, test2(CourseEndDate, Score, CourseStartDate),
                                by = UserID]$V1
)
#  For three million courses, at an average of 5 courses per student:
#    user  system elapsed 
#    122.06    0.22  122.45 
# Small reproducible example: 100 enrollments with user ids in 1..1000.
set.seed(1)
n <- 1e2
data <- data.table(
    UserID = as.character(round(runif(n, 1, 1000))),
    course = 1:n,
    Score = runif(n),
    CourseStartDate = as.Date('2000-01-01') + round(runif(n, 1, 365))
)
data$CourseEndDate <- data$CourseStartDate + round(runif(n, 1, 100))
setkey(data, UserID)
# Grouped result assigned by position; `data` is sorted by UserID at this point.
data$MeanPriorScore <- data[, test2(CourseEndDate, Score, CourseStartDate),
                            by = UserID]$V1
# Show the rows for user "246" (keyed lookup).
data["246"]
#   UserID course     Score CourseStartDate CourseEndDate MeanPriorScore
#1:    246     54 0.4531314      2000-08-09    2000-09-20      0.9437248
#2:    246     89 0.9437248      2000-02-19    2000-03-02             NA

# A comparison with your for loop (slightly modified)
# Results are collected in a plain numeric vector and attached to `data` once
# at the end. NA (rather than zero) marks "no prior course", which makes the
# comparison with MeanPriorScore straightforward.
prior.mean.old <- rep(NA_real_, nrow(data))
# I think you forgot a bracket here. Also, There is no need to work with the rownames.
# seq_len() yields an empty sequence for a zero-row table.
for (i in seq_len(nrow(data))) {
    score.total <- 0
    n.prior <- 0
    # I reduced the complexity of figuring out the vector to loop through.
    # It will result in the exact same thing if there are no rownames.
    for (j in which(data$UserID == data$UserID[i])) {
        # `<=` matches what findInterval() counts in test2.
        if (data$CourseEndDate[j] <= data$CourseStartDate[i]) {
            score.total <- score.total + data$Score[j]
            n.prior <- n.prior + 1
        }
    }
    # I had to add "[i]" here. I think that is what you meant.
    if (n.prior != 0) {
        prior.mean.old[i] <- score.total / n.prior
    }
}
data$MeanPriorScore.old <- prior.mean.old
# Compare with a numeric tolerance: the two methods add the scores in a
# different order, so the floating-point results need not match bit for bit.
isTRUE(all.equal(data$MeanPriorScore, data$MeanPriorScore.old))
# [1] TRUE