Warning: file_get_contents(/data/phpspider/zhask/data//catemap/0/performance/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
提高R中坏的/可能不必要的应用程序的性能_R_Performance_Apply - Fatal编程技术网

提高 R 中写得不好/可能不必要的 apply 调用的性能

提高R中坏的/可能不必要的应用程序的性能,r,performance,apply,R,Performance,Apply,提前感谢您在这方面的帮助。我不确定我是否使用了错误的apply,或者仅仅是违反了使我的代码变慢的其他规则。感谢您的帮助 概述:我有篮球数据,其中每一排都是篮球比赛中的一个时刻,包括球场上的10名球员、他们的球队、比赛以及该排在比赛中的时间(1-40分钟)。利用这些数据,我计算了每名球员在1-40分钟内他们在球场上的比赛百分比 例如,如果乔的球队打了20场比赛,如果在其中13场比赛中,乔在比赛的第5分钟出现在数据中,那么我们可以说乔在他的球队65%的比赛中出现在第5分钟。我在计算每个球员,每个赛季

提前感谢您在这方面的帮助。我不确定我是否使用了错误的
apply
,或者仅仅是违反了使我的代码变慢的其他规则。感谢您的帮助

概述:我有一份篮球数据,其中每一行代表篮球比赛中的一个时刻,包含场上的 10 名球员、他们所属的球队、所在的比赛,以及该行对应的比赛分钟(第 1–40 分钟)。利用这些数据,我要计算每名球员在第 1–40 分钟的每一分钟里,其在场的比赛占球队比赛总数的百分比。

例如,如果乔的球队打了 20 场比赛,而其中 13 场比赛的第 5 分钟乔都在场上,那么我们可以说乔在球队 65% 的比赛中于第 5 分钟在场。我需要对每名球员、每个赛季、第 1–40 分钟中的每一分钟都做这样的计算,而我的数据量不算小,因此遇到了性能问题。以下是我目前用来完成这一计算的函数:

library(dplyr)

# Raw data is play-by-play data: each row contains stats for a "pl"
# (a combination of 5 basketball players).
# NOTE(review): "pl" is presumably an abbreviation for the on-court lineup -
# confirm with the data owner.
# The sheet is requested through its CSV export URL; httr::content() then
# parses the response body into the object used by the rest of the script.
sheets_url <- 'https://docs.google.com/spreadsheets/d/1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o/export?format=csv&id=1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o&gid=630752085'
on.ct.data <- httr::content(httr::GET(url = sheets_url))

# Compute, for every (team, player, season), the share of the team's games in
# which the player was on court during each minute of the game.
#
# Arguments:
#   on.ct.data          Data frame with one row per on-court moment. Columns
#                       read: gameId, season, minNumIntoGame, homeTeamId,
#                       awayTeamId, onCtHomeId1..5, onCtAwayId1..5.
#   regulation.minutes  Number of regulation minutes that must always be
#                       present in the output for every player-season
#                       (missing minutes are filled with a count of 0).
#                       Defaults to 40.
#
# Returns a data frame with columns playerId, teamId, season (character),
# minNumIntoGame (integer), numGamesAtMinNum, teamGamesPlayed and
# pctMinNumPlayed (numGamesAtMinNum / teamGamesPlayed, rounded to 3 digits).
# Rows are ordered by first appearance of the (team, player, season)
# combination and then by minute.
#
# Implementation notes:
#   * The previous version looped over player-seasons with apply(), which
#     coerces each data-frame row to a character vector and rescans the whole
#     table once per player. Everything is now done in one vectorised pass.
#   * The previous version de-duplicated on paste0(gameId, minNumIntoGame).
#     Without a separator, game 1 / minute 12 and game 11 / minute 2 produce
#     the same key, so a real appearance was dropped. De-duplication now uses
#     the two columns themselves.
computeOnCourtByMinutePcts <- function(on.ct.data, regulation.minutes = 40L) {

  n.rows <- nrow(on.ct.data)
  # season is returned as character, as in the previous implementation.
  season.chr <- as.character(on.ct.data[["season"]])
  game.id <- on.ct.data[["gameId"]]
  minute.int <- as.integer(on.ct.data[["minNumIntoGame"]])

  # ---- Games played by each team in each season ----
  # Distinct games are counted separately for the home and the away side and
  # then added up, which is what the previous full_join + rowSums did.
  team.game.sides <- dplyr::bind_rows(
    data.frame(
      teamId = on.ct.data[["homeTeamId"]],
      season = season.chr,
      gameId = game.id,
      side = rep("home", n.rows),
      stringsAsFactors = FALSE
    ),
    data.frame(
      teamId = on.ct.data[["awayTeamId"]],
      season = season.chr,
      gameId = game.id,
      side = rep("away", n.rows),
      stringsAsFactors = FALSE
    )
  )
  team.game.sides <- dplyr::distinct(team.game.sides, teamId, season, side, gameId)
  num.team.games <- dplyr::count(
    team.game.sides, teamId, season,
    name = "teamGamesPlayed"
  )

  # ---- Long format: one row per (moment, on-court slot) ----
  slot.cols <- c(paste0("onCtHomeId", 1:5), paste0("onCtAwayId", 1:5))
  team.cols <- rep(c("homeTeamId", "awayTeamId"), each = 5L)

  on.court.long <- dplyr::bind_rows(lapply(seq_along(slot.cols), function(i) {
    data.frame(
      teamId = on.ct.data[[team.cols[i]]],
      playerId = on.ct.data[[slot.cols[i]]],
      season = season.chr,
      gameId = game.id,
      minNumIntoGame = minute.int,
      stringsAsFactors = FALSE
    )
  }))
  on.court.long <- on.court.long[!is.na(on.court.long$playerId), , drop = FALSE]

  # Every (team, player, season) combination, in order of first appearance.
  player.seasons <- dplyr::distinct(on.court.long, teamId, playerId, season)
  player.seasons$appearanceOrder <- seq_len(nrow(player.seasons))

  # ---- Games on court per player, season and minute ----
  # As before, a player's appearances are counted across the whole season,
  # regardless of which side/team the row lists the player under.
  timed <- on.court.long[!is.na(on.court.long$minNumIntoGame), , drop = FALSE]
  player.game.minutes <- dplyr::distinct(
    timed, playerId, season, gameId, minNumIntoGame
  )
  games.at.minute <- dplyr::count(
    player.game.minutes, playerId, season, minNumIntoGame,
    name = "numGamesAtMinNum"
  )
  observed <- merge(player.seasons, games.at.minute, by = c("playerId", "season"))

  # ---- Zero-fill regulation minutes the player never appeared in ----
  minutes <- seq_len(regulation.minutes)
  grid.idx <- rep(seq_len(nrow(player.seasons)), each = length(minutes))
  regulation.grid <- player.seasons[grid.idx, , drop = FALSE]
  regulation.grid$minNumIntoGame <- rep(minutes, times = nrow(player.seasons))

  zero.rows <- dplyr::anti_join(
    regulation.grid, observed,
    by = c("teamId", "playerId", "season", "minNumIntoGame")
  )
  zero.rows$numGamesAtMinNum <- rep(0L, nrow(zero.rows))

  playing.time.breakdowns <- dplyr::bind_rows(observed, zero.rows)
  playing.time.breakdowns <- dplyr::arrange(
    playing.time.breakdowns, appearanceOrder, minNumIntoGame
  )

  # ---- Join team games played and compute the percentage ----
  playing.time.breakdowns <- dplyr::left_join(
    playing.time.breakdowns, num.team.games,
    by = c("teamId", "season")
  )
  playing.time.breakdowns$pctMinNumPlayed <- round(
    playing.time.breakdowns$numGamesAtMinNum /
      playing.time.breakdowns$teamGamesPlayed,
    3
  )

  # NOTE: overtime minutes (minNumIntoGame > regulation.minutes) are kept when
  # observed, but they still use the full teamGamesPlayed denominator; a lower
  # denominator (games that actually reached that minute) would be needed.

  playing.time.breakdowns[, c(
    "playerId", "teamId", "season", "minNumIntoGame",
    "numGamesAtMinNum", "teamGamesPlayed", "pctMinNumPlayed"
  ), drop = FALSE]
}
# Run the computation on the downloaded play-by-play data.
on.ct.by.min <- computeOnCourtByMinutePcts(on.ct.data)
库(dplyr)
#原始数据是逐场数据-每行包含一个pl(5名篮球运动员的组合)的统计数据
工作表\u url%
dplyr::mutate(gamesPlayed=rowSums(cbind(count.x,count.y),na.rm=TRUE))%>%
dplyr::重命名(teamId=homeTeamId)%>%
dplyr::mutate(seasure=as.character(seasure))%>%
dplyr::选择(团队ID、季节、游戏显示)
#按赛季为球员创建数据框——看起来也有点笨重
all.player.seasure.appearances%dplyr::select(homeTeamId,onCtHomeId1,seasure)%%>%dplyr::rename(playerId=onCtHomeId1,teamId=homeTeamId),
on.ct.data%>%dplyr::select(homeTeamId,onCtHomeId2,seasure)%>%dplyr::rename(playerId=onCtHomeId2,teamId=homeTeamId),
on.ct.data%>%dplyr::select(homeTeamId,onCtHomeId3,seasure)%>%dplyr::rename(playerId=onCtHomeId3,teamId=homeTeamId),
on.ct.data%>%dplyr::select(homeTeamId,onCtHomeId4,seasure)%>%dplyr::rename(playerId=onCtHomeId4,teamId=homeTeamId),
on.ct.data%>%dplyr::select(homeTeamId,onCtHomeId5,seasure)%>%dplyr::rename(playerId=onCtHomeId5,teamId=homeTeamId),
on.ct.data%>%dplyr::select(awayTeamId,onctayedid1,seash)%>%dplyr::rename(playerId=onctayedid1,teamId=awayTeamId),
on.ct.data%>%dplyr::select(awayTeamId,onCtAwayId2,seash)%>%dplyr::rename(playerId=onCtAwayId2,teamId=awayTeamId),
on.ct.data%>%dplyr::select(awayTeamId,onCtAwayId3,seash)%>%dplyr::rename(playerId=onCtAwayId3,teamId=awayTeamId),
on.ct.data%>%dplyr::select(awayTeamId,onctayedid4,seasure)%>%dplyr::rename(playerId=onctayedid4,teamId=awayTeamId),
on.ct.data%%>%dplyr::select(awayTeamId,onctayedid5,seash)%%>%dplyr::rename(playerId=onctayedid5,teamId=awayTeamId))%%>%
dplyr::独特的(球队ID、球员ID、赛季)%>%
dplyr::过滤器(!is.na(playerId))
#对于每个球员赛季,计算比赛中每分钟在场上的比赛次数——这是最糟糕的
播放.time.breakdowns%
dplyr::filter(onCtHomeId1==thisPlayerId | onCtHomeId2==thisPlayerId | onCtHomeId3==thisPlayerId | onCtHomeId4==thisPlayerId | onCtHomeId5==thisPlayerId|
onCtAwayId1==thisPlayerId | onCtAwayId2==thisPlayerId | onCtAwayId3==thisPlayerId | onCtAwayId4==thisPlayerId | onCtAwayId5==thisPlayerId)%>%
dplyr::筛选器(季节==此季节)%>%
dplyr::筛选器(!已复制(paste0(gameId,MinnuminToName)))
#把这变成场上比赛的分钟表
(表0){
0.mins.played.df%
dplyr::重命名(teamGamesPlayed=gamesPlayed)
#计算所玩游戏的pct
播放.time.breakdowns%
dplyr::mutate(pctMinNumPlayed=round(numGamesAtMinNum/teamGamesPlayed,3))
#句柄OT(Minnumitogame>40)需要较低的游戏显示分母。。。
#返回
返回(播放、时间、故障);
}

on.ct.by.min <- computeOnCourtByMinutePcts(on.ct.data)

我认为可以用更简单的方式实现:先把数据转换为长格式,再对"球员-分钟-球队-赛季"的组合进行计数。(在我这台 2008 年的旧电脑上运行大约需要 5 秒,这几乎就是全部的计算量。)

库(tidyverse)
on.ct.data%>%
聚集(点、名称、onCtHomeId1:onCtAwayId5)%>%
变异(团队=如果其他(点%>%str)检测(“离开”),
awayTeamId,homeTeamId))%>%
选择(-spot)%>%#对于这部分,我只关心人和比赛时间。
distinct()%>%#删除在一分钟内重新定位的重复和实例。
下拉菜单()%>%
选择(-c(游戏ID:awayTeamId))%>%
计数(明尼苏达州名称、姓名、球队、赛季)
#A tibble:140581 x 5
明尼苏达州名称球队第n季
1 AahmaneSantos387c JAC 1819 1
2 1阿米尔西姆塞夫9克莱姆1819 13
3 1 AarenEdmead9cd6 NCAT 1718 1
4 AarenEdmead9cd6 NCAT 1819 1
5 1 AaronBrennanbee2 IUPU 1718 1
6 1 Aaroncalixte11d俄克拉荷马州1819 11
7 1 AaronCarver9cfa ODU 1819 2
8 1 AaronClarke3d67 SHU 1819 1
9 1 AaronFalzon213b西北1718 1
10 1加州大学洛杉矶分校奥隆假日酒店1718 11
有了这些数据,我们就可以看看每支球队的比赛全貌:每个赛季里,每支球队在每一分钟各打了多少场比赛。

# Number of games per (season, team, minute): the inner count() collapses the
# data to one row per (season, team, minute, game); the outer count() then
# counts those rows per (season, team, minute).
on.ct.data.team.minutes <- count(
  count(on.ct.data.minute.counts, season, team, minNumIntoGame, gameId),
  season, team, minNumIntoGame
)

# Tile plot of an excerpt (the first 1000 rows), one panel per season.
ggplot(
  data = slice(on.ct.data.team.minutes, 1:1000),
  mapping = aes(x = minNumIntoGame, y = team, fill = n)
) +
  geom_tile() +
  facet_wrap(~season) +
  labs(title = "# times each team played each minute (excerpt)")
on.ct.data.team.minutes%
计数(季节、团队、明尼苏达州比赛、玩家ID)%>%
计数(赛季、团队、明尼苏达州比赛)
ggplot(在.ct.data.team.minutes%>%切片上(1:1000),
aes(Minnumitogame,team,fill=n))+
geom_瓷砖()+镶嵌面_包裹(~季节)+
实验室(title=“#每队每分钟比赛次数(节选)”)

……我们可以对每名球员做同样的统计,并与其所在球队对比,看看他们在每一分钟里为球队出场的比赛占比。

# How many games each season did each player play a given minute for each team?
# player_n is the player's row count per (season, team, name, minute); team_n
# is the team's count for the same (season, team, minute); player_time is
# their ratio.
on.ct.data.player.minutes <- on.ct.data.minute.counts %>%
  count(season, team, name, minNumIntoGame) %>%
  rename(player_n = n) %>%
  # Join keys are spelled out so the join cannot silently change if another
  # shared column is added to either table (and no "Joining with" message).
  left_join(
    on.ct.data.team.minutes,
    by = c("season", "team", "minNumIntoGame")
  ) %>%
  rename(team_n = n) %>%
  mutate(player_time = player_n / team_n)

# One line per player whose name contains "Can", one panel per season,
# y axis formatted as whole percentages.
ggplot(on.ct.data.player.minutes %>% filter(name %>% str_detect("Can")),
       aes(minNumIntoGame, player_time, color = name)) +
  geom_line() + facet_wrap(~season) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1))
#每个赛季有多少场比赛
# How many games each season did each player play a given minute for each team?
# player_n is the player's row count per (season, team, name, minute); team_n
# is the team's count for the same (season, team, minute); player_time is
# their ratio.
on.ct.data.player.minutes <- on.ct.data.minute.counts %>%
  count(season, team, name, minNumIntoGame) %>%
  rename(player_n = n) %>%
  # Join keys are spelled out so the join cannot silently change if another
  # shared column is added to either table (and no "Joining with" message).
  left_join(
    on.ct.data.team.minutes,
    by = c("season", "team", "minNumIntoGame")
  ) %>%
  rename(team_n = n) %>%
  mutate(player_time = player_n / team_n)

# One line per player whose name contains "Can", one panel per season,
# y axis formatted as whole percentages.
ggplot(on.ct.data.player.minutes %>% filter(name %>% str_detect("Can")),
       aes(minNumIntoGame, player_time, color = name)) +
  geom_line() + facet_wrap(~season) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1))