rvest,信息分布在多个视图中
我想抓取页面左侧的排名,它分布在34个视图中,我认为(我是抓取方面的新手)这些视图是由Java生成的。所有视图都有相同的url,因此我无法通过url循环遍历这些视图。据我所知,每个视图似乎都有节点 rvest,信息分布在多个视图中,r,rvest,rselenium,R,Rvest,Rselenium,我想抓取页面左侧的排名,它分布在34个视图中,我认为(我是抓取方面的新手)这些视图是由Java生成的。所有视图都有相同的url,因此我无法通过url循环遍历这些视图。据我所知,每个视图似乎都有节点#elferspielerhistorie_subcont_j td,其中j从0开始。我可以用下面的代码抓取第一个视图中的条目: library(rvest) library(tidyverse) elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bund
#elferspielerhistorie_subcont_j td
,从j=0开始
我可以用下面的代码抓取第一个视图中的条目:
library(rvest)
library(tidyverse)
# Page to scrape.
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"

# first page: download and parse the HTML as served for this URL
elfmeter <- read_html(elfer_url)

# Text of every table cell inside the container with id
# "elferspielerhistorie_subcont_0".
Schuetzen <- html_text(
  html_nodes(elfmeter, "#elferspielerhistorie_subcont_0 td")
)
因为视图是动态生成的,所以每次都必须重新获取页面源代码。下一页按钮的ID也可能会发生变化,所以每次迭代时都要重新查找该按钮。
下面的代码应该可以工作。请注意,我还读入了一些空行,这些空行会在循环结束后被删除:
library(rvest)
library(tidyverse)
library(RSelenium)
# Page whose paginated ranking is scraped.
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"

# Start a Selenium server with a Firefox session on port 4447 and keep a
# handle on the browser client.
rD <- rsDriver(browser = "firefox", port = 4447L)
remDr <- rD[["client"]]

# Open the page in the controlled browser.
remDr$navigate(elfer_url)
# Parse the table of one ranking view from the page currently loaded in the
# browser.
#
# x: index of the view; it is appended to the container id
#    "#elferspielerhistorie_subcont_" to build the CSS selector.
#
# Returns the first <table> found inside that container as a data.frame.
# Stops with an explicit error when the current page source contains no such
# table (e.g. the view was not rendered yet when the source was fetched).
#
# Relies on the global RSelenium client `remDr`.
getTable <- function(x) {
  selector <- paste0("#elferspielerhistorie_subcont_", x, " table")

  # The page source is fetched anew on every call because the views are
  # replaced in the live DOM.
  tables <- remDr$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes(selector) %>%
    html_table(fill = TRUE)

  if (length(tables) == 0L) {
    stop("No table found for selector '", selector, "'", call. = FALSE)
  }

  data.frame(tables[[1]])
}
# Scrape all 34 views: view 0 is already displayed, views 1-33 are reached by
# clicking the pager's "forward" link. The per-view tables are collected in a
# preallocated list and bound once at the end instead of growing `data` with
# rbind() on every iteration.
data <- tryCatch(
  {
    pages <- vector("list", 34L)

    # first page
    pages[[1L]] <- getTable(0)

    for (j in seq_len(33L)) {
      # Look the button up again on every iteration; the element handle from
      # the previous view is not reused.
      next_button <- remDr$findElements(
        using = "css selector",
        value = "a[id=\"ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward\"]"
      )[[1]]
      remDr$executeScript(
        script = "arguments[0].scrollIntoView(true);",
        args = list(next_button)
      )
      next_button$clickElement()
      # sometimes the loop is too fast and it cannot fetch the table. so pause here
      Sys.sleep(1)
      pages[[j + 1L]] <- getTable(j)
    }

    do.call(rbind, pages)
  },
  # Shut the Selenium server down even when scraping fails half-way.
  finally = rD$server$stop()
)

# Drop the rows whose player column is blank. A logical mask is used instead
# of `-which(...)`: when no row matches, `-which(...)` is `-integer(0)` and
# would select zero rows, i.e. throw the whole result away.
data <- data[!(data$Spieler %in% ""), ]
dim(data)
> [1] 935 10
library(rvest)
library(tidyverse)
library(RSelenium)
elfer_url%
.[[1]] %>%
数据帧
}
#首页
数据
library(rvest)
library(tidyverse)
library(RSelenium)
# Page whose paginated ranking is scraped.
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"

# Start a Selenium server with a Firefox session on port 4447 and keep a
# handle on the browser client.
rD <- rsDriver(browser = "firefox", port = 4447L)
remDr <- rD[["client"]]

# Open the page in the controlled browser.
remDr$navigate(elfer_url)
# Parse the table of one ranking view from the page currently loaded in the
# browser.
#
# x: index of the view; it is appended to the container id
#    "#elferspielerhistorie_subcont_" to build the CSS selector.
#
# Returns the first <table> found inside that container as a data.frame.
# Stops with an explicit error when the current page source contains no such
# table (e.g. the view was not rendered yet when the source was fetched).
#
# Relies on the global RSelenium client `remDr`.
getTable <- function(x) {
  selector <- paste0("#elferspielerhistorie_subcont_", x, " table")

  # The page source is fetched anew on every call because the views are
  # replaced in the live DOM.
  tables <- remDr$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes(selector) %>%
    html_table(fill = TRUE)

  if (length(tables) == 0L) {
    stop("No table found for selector '", selector, "'", call. = FALSE)
  }

  data.frame(tables[[1]])
}
# Scrape all 34 views: view 0 is already displayed, views 1-33 are reached by
# clicking the pager's "forward" link. The per-view tables are collected in a
# preallocated list and bound once at the end instead of growing `data` with
# rbind() on every iteration.
data <- tryCatch(
  {
    pages <- vector("list", 34L)

    # first page
    pages[[1L]] <- getTable(0)

    for (j in seq_len(33L)) {
      # Look the button up again on every iteration; the element handle from
      # the previous view is not reused.
      next_button <- remDr$findElements(
        using = "css selector",
        value = "a[id=\"ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward\"]"
      )[[1]]
      remDr$executeScript(
        script = "arguments[0].scrollIntoView(true);",
        args = list(next_button)
      )
      next_button$clickElement()
      # sometimes the loop is too fast and it cannot fetch the table. so pause here
      Sys.sleep(1)
      pages[[j + 1L]] <- getTable(j)
    }

    do.call(rbind, pages)
  },
  # Shut the Selenium server down even when scraping fails half-way.
  finally = rD$server$stop()
)

# Drop the rows whose player column is blank. A logical mask is used instead
# of `-which(...)`: when no row matches, `-which(...)` is `-integer(0)` and
# would select zero rows, i.e. throw the whole result away.
data <- data[!(data$Spieler %in% ""), ]
dim(data)
> [1] 935 10