在R中,提高解析长XML文档的效率/速度

在R中,提高解析长XML文档的效率/速度,r,xml,R,Xml,我手头有一些XML格式的sportVU数据(篮球运动员的球员坐标跟踪),我正在试图找到一种有效的方法将其解析为R数据帧。以下是我正在处理的数据类型,仅供参考: xml_str = '<sequences period="1"> <moment game-clock="720.00" time="1453777809409" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37744,29.

我手头有一些XML格式的sportVU数据(篮球运动员的球员坐标跟踪),我正在试图找到一种有效的方法将其解析为R数据帧。以下是我正在处理的数据类型,仅供参考:

xml_str = '<sequences period="1">
             <moment game-clock="720.00" time="1453777809409" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37744,29.70218,4.18768;23,262882,20.60259,20.94833,0;23,266358,44.9396,19.39261,0;23,509450,29.94114,25.65156,0;23,604898,46.12643,31.1069,0;23,699950,65.75548,29.05962,0;5312,263884,63.23074,19.75536,0;5312,329824,46.12643,31.1069,0;5312,340730,65.75548,29.05962,0;5312,462980,65.2449,25.5923,0;5312,609567,44.9396,19.39261,0"/>
             <moment game-clock="720.00" time="1453777809448" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37601,29.74255,4.25405;23,262882,20.47625,21.10303,0;23,266358,44.94889,19.41382,0;23,509450,29.95246,25.6483,0;23,604898,46.13548,31.11234,0;23,699950,65.54889,29.13574,0;5312,263884,63.19786,19.78119,0;5312,329824,46.13548,31.11234,0;5312,340730,65.54889,29.13574,0;5312,462980,65.22661,25.67978,0;5312,609567,44.94889,19.41382,0"/>
             <moment game-clock="720.00" time="1453777809489" game-event-id="" shot-clock="24.00" locations="-1,-1,49.27471,30.13754,4.01725;23,262882,20.34594,21.24155,0;23,266358,44.94417,19.49824,0;23,509450,29.972,25.6471,0;23,604898,46.14312,31.10966,0;23,699950,65.33004,29.25136,0;5312,263884,63.16254,19.80861,0;5312,329824,46.14312,31.10966,0;5312,340730,65.33004,29.25136,0;5312,462980,65.2156,25.73148,0;5312,609567,44.94417,19.49824,0"/>
             <moment game-clock="720.00" time="1453777809529" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37694,29.72015,4.22522;23,262882,20.21595,21.39361,0;23,266358,44.94711,19.5666,0;23,509450,29.99479,25.64543,0;23,604898,46.15027,31.12124,0;23,699950,65.11179,29.36794,0;5312,263884,63.12304,19.83758,0;5312,329824,46.15027,31.12124,0;5312,340730,65.11179,29.36794,0;5312,462980,65.20283,25.7811,0;5312,609567,44.94711,19.5666,0"/>
             <moment game-clock="720.00" time="1453777809569" game-event-id="" shot-clock="24.00" locations="-1,-1,49.24878,29.90948,4.104;23,262882,20.08169,21.52092,0;23,266358,44.95597,19.55453,0;23,509450,30.01362,25.65796,0;23,604898,46.15381,31.12995,0;23,699950,64.87379,29.54509,0;5312,263884,63.08375,19.86668,0;5312,329824,46.15381,31.12995,0;5312,340730,64.87379,29.54509,0;5312,462980,65.19289,25.82793,0;5312,609567,44.95597,19.55453,0"/>
             <moment game-clock="720.00" time="1453777809609" game-event-id="" shot-clock="24.00" locations="-1,-1,49.34944,29.53386,4.37756;23,262882,19.93925,21.57641,0;23,266358,44.95367,19.55548,0;23,509450,30.02373,25.68459,0;23,604898,46.15968,31.14234,0;23,699950,64.63944,29.72012,0;5312,263884,63.03494,19.95414,0;5312,329824,46.15968,31.14234,0;5312,340730,64.63944,29.72012,0;5312,462980,65.18154,25.87779,0;5312,609567,44.95367,19.55548,0"/>
             <moment game-clock="720.00" time="1453777809649" game-event-id="" shot-clock="24.00" locations="-1,-1,49.29842,29.74836,4.29728;23,262882,19.7968,21.62425,0;23,266358,44.97828,19.51615,0;23,509450,30.02255,25.64997,0;23,604898,46.1634,31.14518,0;23,699950,64.44716,29.84182,0;5312,263884,62.98457,20.04141,0;5312,329824,46.1634,31.14518,0;5312,340730,64.44716,29.84182,0;5312,462980,65.15931,25.93168,0;5312,609567,44.97828,19.51615,0"/>
             <moment game-clock="720.00" time="1453777809689" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37601,29.74255,4.25405;23,262882,19.65811,21.66964,0;23,266358,45.05661,19.46085,0;23,509450,30.01974,25.62911,0;23,604898,46.19779,31.17625,0;23,699950,64.29233,29.91812,0;5312,263884,62.93202,20.1291,0;5312,329824,46.19779,31.17625,0;5312,340730,64.29233,29.91812,0;5312,462980,65.14666,25.96346,0;5312,609567,45.05661,19.46085,0"/>
             <moment game-clock="720.00" time="1453777809728" game-event-id="" shot-clock="24.00" locations="-1,-1,49.42696,29.528,4.33445;23,262882,19.52976,21.72166,0;23,266358,45.07626,19.45181,0;23,509450,30.01819,25.62626,0;23,604898,46.20207,31.19139,0;23,699950,64.10344,29.96732,0;5312,263884,62.87931,20.21896,0;5312,329824,46.20207,31.19139,0;5312,340730,64.10344,29.96732,0;5312,462980,65.12518,26.01317,0;5312,609567,45.07626,19.45181,0"/>
             <moment game-clock="720.00" time="1453777809769" game-event-id="" shot-clock="24.00" locations="-1,-1,49.42696,29.528,4.33445;23,262882,19.40259,21.77219,0;23,266358,45.01633,19.41528,0;23,509450,30.01316,25.61478,0;23,604898,46.20748,31.21032,0;23,699950,63.94112,30.00692,0;5312,263884,62.82417,20.31417,0;5312,329824,46.20748,31.21032,0;5312,340730,63.94112,30.00692,0;5312,462980,65.10361,26.05668,0;5312,609567,45.01633,19.41528,0"/>
             <moment game-clock="720.00" time="1453777809809" game-event-id="" shot-clock="24.00" locations="-1,-1,49.40035,29.31999,4.45764;23,262882,19.26225,21.74403,0;23,266358,45.03757,19.35151,0;23,509450,30.00625,25.60483,0;23,604898,46.21089,31.22762,0;23,699950,63.76999,30.04214,0;5312,263884,62.7722,20.407,0;5312,329824,46.21089,31.22762,0;5312,340730,63.76999,30.04214,0;5312,462980,65.0792,26.10155,0;5312,609567,45.03757,19.35151,0"/>
             <moment game-clock="720.00" time="1453777809849" game-event-id="" shot-clock="24.00" locations="-1,-1,49.17246,29.8749,4.08078;23,262882,19.13395,21.78725,0;23,266358,45.04715,19.21508,0;23,509450,29.9845,25.51368,0;23,604898,46.26867,31.28159,0;23,699950,63.61645,30.0221,0;5312,263884,62.71828,20.50007,0;5312,329824,46.26867,31.28159,0;5312,340730,63.61645,30.0221,0;5312,462980,65.04425,26.18745,0;5312,609567,45.04715,19.21508,0"/>
             <moment game-clock="720.00" time="1453777809889" game-event-id="" shot-clock="24.00" locations="-1,-1,49.24667,29.2957,4.46901;23,262882,19.00912,21.82333,0;23,266358,45.02115,19.10097,0;23,509450,29.96265,25.43249,0;23,604898,46.26457,31.32984,0;23,699950,63.46754,29.99107,0;5312,263884,62.66985,20.58952,0;5312,329824,46.26457,31.32984,0;5312,340730,63.46754,29.99107,0;5312,462980,65.01059,26.25959,0;5312,609567,45.02115,19.10097,0"/>
             <moment game-clock="720.00" time="1453777809928" game-event-id="" shot-clock="24.00" locations="-1,-1,49.01807,29.85051,4.09221;23,262882,18.88202,21.79858,0;23,266358,44.95699,19.07127,0;23,509450,29.93991,25.35426,0;23,604898,46.25301,31.35973,0;23,699950,63.29378,30.00075,0;5312,263884,62.61706,20.68884,0;5312,329824,46.25301,31.35973,0;5312,340730,63.29378,30.00075,0;5312,462980,64.98385,26.29224,0;5312,609567,44.95699,19.07127,0"/>
             <moment game-clock="720.00" time="1453777809969" game-event-id="" shot-clock="24.00" locations="-1,-1,48.93946,29.87874,4.1644;23,262882,18.75688,21.75509,0;23,266358,44.95388,19.00199,0;23,509450,29.91814,25.27759,0;23,604898,46.23982,31.38857,0;23,699950,63.12091,30.01038,0;5312,263884,62.57866,20.72805,0;5312,329824,46.23982,31.38857,0;5312,340730,63.12091,30.01038,0;5312,462980,64.95801,26.3292,0;5312,609567,44.95388,19.00199,0"/>
          </sequences>'
xml\u str='1〕
'
这是实际数据的一个子集,其中一场篮球比赛大致相当于(25/秒)*(10名球员)*(48*60秒比赛)=每场比赛720000个瞬间节点。我的目标是将其解析为一个有组织的数据帧-前4列是瞬间节点的前4个属性(游戏时钟、时间、游戏事件id、射门时钟),然后有几列数据来自位置属性,位置属性存储10个不同玩家加上球的坐标数据,全部在一个字符串中(每个位置属性有55个数字,10名球员各5个,球5个)

这是我的代码,它可以正常工作,但为了提高效率,我需要加快一些。我使用XML包和其中的简单函数,但可以切换到xml2或做任何其他需要加速的事情

sportvu = xmlParse(xml_str)

# quick load the data into a dataframe
sportvu_df <- data.frame(
  game_clock = sapply(sportvu["//sequences/moment/@game-clock"], as, "numeric"),
  time = sapply(sportvu["//sequences/moment/@time"], as, "character"),
  game_event_id = sapply(sportvu["//sequences/moment/@game-event-id"], as, "integer"),
  shot_clock = sapply(sportvu["//sequences/moment/@shot-clock"], as, "numeric"), 
  locations = sapply(sportvu["//sequences/moment/@locations"], as, "character")
)

# convert locations into what we want, many more columns 
locations = as.character(sportvu_df$locations)
locations = strsplit(gsub(";", ",", locations), ",")
reps = sapply(locations, length) / 5
locations = as.data.frame(matrix(unlist(locations), ncol = 5, byrow = TRUE), stringsAsFactors = FALSE)
colnames(locations) = c("team_id", "global.player.id", "x_loc", "y_loc", "radius")

locations$global.player.id = as.integer(locations$global.player.id); locations$team_id = as.integer(locations$team_id); locations$x_loc = as.numeric(locations$x_loc); locations$y_loc = as.numeric(locations$y_loc); locations$radius = as.numeric(locations$radius)

# connect locations back with sportvu_df
sportvu_df = sportvu_df[!(names(sportvu_df) %in% "locations")]
sportvu_df = sportvu_df[rep(row.names(sportvu_df), reps), ]
sportvu_df = cbind(sportvu_df, locations)
sportvu_df$order = seq(1,nrow(sportvu_df),by=1)
sportvu=xmlParse(xml_str)
#快速将数据加载到数据帧中

sportvu_df不需要将类型转换包装到
sapply
函数中。它们是矢量化的,因此直接调用就足够了:

sportvu_df <- data.frame(
  game_clock = as.numeric(sportvu["//sequences/moment/@game-clock"]),
  time = as.character(sportvu["//sequences/moment/@time"]),
  game_event_id = as.integer(sportvu["//sequences/moment/@game-event-id"]),
  shot_clock = as.numeric(sportvu["//sequences/moment/@shot-clock"]), 
  locations = as.character(sportvu["//sequences/moment/@locations"])
)

sportvu_df不需要将类型转换包装到
sapply
函数中。它们是矢量化的,因此直接调用就足够了:

sportvu_df <- data.frame(
  game_clock = as.numeric(sportvu["//sequences/moment/@game-clock"]),
  time = as.character(sportvu["//sequences/moment/@time"]),
  game_event_id = as.integer(sportvu["//sequences/moment/@game-event-id"]),
  shot_clock = as.numeric(sportvu["//sequences/moment/@shot-clock"]), 
  locations = as.character(sportvu["//sequences/moment/@locations"])
)

sportvu_df显著加速,未使用microbenchmark测量,但速度似乎快了10-100倍。非常感谢!显著加速,未使用microbenchmark测量,但速度似乎快了10-100倍。非常感谢!