R:使用rvest抓取表中每个单元格的悬停文本标题
我使用rvest从一些由JavaScript生成的表格(比如这里的表格)中提取数据。如您所见,在该表中,每个单元格都有一个值,并且当您将鼠标悬停在单元格上时,还会显示另一个附加值。我可以用rvest抓取这个表格,就像这样: tips <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd") %>% html_table(header=TRUE)
# Parse the page, then convert the HTML table(s) it contains, taking the
# column names from the first row of each table.
tips <- html_table(
  read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd"),
  header = TRUE
)
tips <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd") %>%
  html_table(header=TRUE)
但我不确定如何获取悬停值。我可以用rvest实现这一点吗?您可以扩展/修改
rvest:::html_table.xml_node
方法,如下所示(改动之处见代码中的R注释):
#' Build a data frame from one attribute of every cell in an HTML table
#'
#' Adapted from rvest's internal `html_table.xml_node()`: instead of the cell
#' text, each cell contributes the value of the attribute named by
#' `attr_name` (for example `"title"` for hover text).
#'
#' @param x Node (or document) that is searched for `tr` rows.
#' @param header `TRUE` to take column names from the first row, `FALSE` to
#'   use `X1`, `X2`, ...; `NA` uses the first row only when all of its cells
#'   are `th` elements.
#' @param trim Passed to `html_text()` when header cell text is used as a
#'   fallback column name.
#' @param fill If `TRUE`, rows with fewer columns are padded with `NA`;
#'   otherwise an inconsistent column count is an error.
#' @param dec Decimal separator handed to `utils::type.convert()`.
#' @param attr_name Name of the attribute to read from each cell.
#' @return A data frame with one row per table row (minus the header row, if
#'   any) and one column per table column; an empty data frame when `x`
#'   contains no `tr` rows.
my_html_table <- function(x, header = NA, trim = TRUE, fill = FALSE, dec = ".", attr_name = "") {
  rows <- html_nodes(x, "tr")
  n <- length(rows)
  # Nothing to parse: without this guard the code below fails on an undefined
  # column count and on cells[[1]].
  if (n == 0L) {
    return(data.frame())
  }
  cells <- lapply(rows, "html_nodes", xpath = ".//td|.//th")
  # Column width of every cell; cells without a colspan count as one column.
  ncols <- lapply(cells, html_attr, "colspan", default = "1")
  ncols <- lapply(ncols, as.integer)
  p <- unique(vapply(ncols, sum, integer(1)))
  if (length(p) > 1) {
    if (!fill) {
      stop("Table has inconsistent number of columns. ",
           "Do you want fill = TRUE?", call. = FALSE)
    }
    p <- max(p)
  }
  #############################
  ## The following line is the only one that differs from rvest's parser:
  ## it reads an attribute instead of the cell text
  #############################
  values <- lapply(cells, html_attr, attr_name)
  # instead of
  # values <- lapply(cells, html_text, trim = trim)
  out <- matrix(NA_character_, nrow = n, ncol = p)
  for (i in seq_len(n)) {
    row <- values[[i]]
    ncol <- ncols[[i]]
    # Each value goes to the column where its cell starts; columns covered by
    # a colspan stay NA.
    col <- 1
    for (j in seq_len(p)) {
      if (j > length(row)) {
        next
      }
      out[i, col] <- row[[j]]
      col <- col + ncol[j]
    }
  }
  if (is.na(header)) {
    header <- all(html_name(cells[[1]]) == "th")
  }
  if (header) {
    col_names <- out[1, ]
    # Header cells often do not carry the requested attribute, which would
    # leave NA column names; fall back to the header cell text in those slots.
    head_text <- html_text(cells[[1]], trim = trim)
    head_span <- ncols[[1]]
    col <- 1
    for (j in seq_len(min(p, length(head_text)))) {
      if (is.na(col_names[col])) {
        col_names[col] <- head_text[[j]]
      }
      col <- col + head_span[j]
    }
    out <- out[-1, , drop = FALSE]
  } else {
    col_names <- paste0("X", seq_len(ncol(out)))
  }
  df <- lapply(seq_len(p), function(i) {
    utils::type.convert(out[, i], as.is = TRUE, dec = dec)
  })
  names(df) <- col_names
  # Assemble the data frame by hand, as the rvest original does.
  class(df) <- "data.frame"
  attr(df, "row.names") <- .set_row_names(length(df[[1]]))
  df
}
my_html_table 您可以扩展/修改 rvest:::html_table.xml_node
方法,如下所示:
(改动之处见代码中的R注释)
#' Build a data frame from one attribute of every cell in an HTML table
#'
#' Adapted from rvest's internal `html_table.xml_node()`: instead of the cell
#' text, each cell contributes the value of the attribute named by
#' `attr_name` (for example `"title"` for hover text).
#'
#' @param x Node (or document) that is searched for `tr` rows.
#' @param header `TRUE` to take column names from the first row, `FALSE` to
#'   use `X1`, `X2`, ...; `NA` uses the first row only when all of its cells
#'   are `th` elements.
#' @param trim Passed to `html_text()` when header cell text is used as a
#'   fallback column name.
#' @param fill If `TRUE`, rows with fewer columns are padded with `NA`;
#'   otherwise an inconsistent column count is an error.
#' @param dec Decimal separator handed to `utils::type.convert()`.
#' @param attr_name Name of the attribute to read from each cell.
#' @return A data frame with one row per table row (minus the header row, if
#'   any) and one column per table column; an empty data frame when `x`
#'   contains no `tr` rows.
my_html_table <- function(x, header = NA, trim = TRUE, fill = FALSE, dec = ".", attr_name = "") {
  rows <- html_nodes(x, "tr")
  n <- length(rows)
  # Nothing to parse: without this guard the code below fails on an undefined
  # column count and on cells[[1]].
  if (n == 0L) {
    return(data.frame())
  }
  cells <- lapply(rows, "html_nodes", xpath = ".//td|.//th")
  # Column width of every cell; cells without a colspan count as one column.
  ncols <- lapply(cells, html_attr, "colspan", default = "1")
  ncols <- lapply(ncols, as.integer)
  p <- unique(vapply(ncols, sum, integer(1)))
  if (length(p) > 1) {
    if (!fill) {
      stop("Table has inconsistent number of columns. ",
           "Do you want fill = TRUE?", call. = FALSE)
    }
    p <- max(p)
  }
  #############################
  ## The following line is the only one that differs from rvest's parser:
  ## it reads an attribute instead of the cell text
  #############################
  values <- lapply(cells, html_attr, attr_name)
  # instead of
  # values <- lapply(cells, html_text, trim = trim)
  out <- matrix(NA_character_, nrow = n, ncol = p)
  for (i in seq_len(n)) {
    row <- values[[i]]
    ncol <- ncols[[i]]
    # Each value goes to the column where its cell starts; columns covered by
    # a colspan stay NA.
    col <- 1
    for (j in seq_len(p)) {
      if (j > length(row)) {
        next
      }
      out[i, col] <- row[[j]]
      col <- col + ncol[j]
    }
  }
  if (is.na(header)) {
    header <- all(html_name(cells[[1]]) == "th")
  }
  if (header) {
    col_names <- out[1, ]
    # Header cells often do not carry the requested attribute, which would
    # leave NA column names; fall back to the header cell text in those slots.
    head_text <- html_text(cells[[1]], trim = trim)
    head_span <- ncols[[1]]
    col <- 1
    for (j in seq_len(min(p, length(head_text)))) {
      if (is.na(col_names[col])) {
        col_names[col] <- head_text[[j]]
      }
      col <- col + head_span[j]
    }
    out <- out[-1, , drop = FALSE]
  } else {
    col_names <- paste0("X", seq_len(ncol(out)))
  }
  df <- lapply(seq_len(p), function(i) {
    utils::type.convert(out[, i], as.is = TRUE, dec = dec)
  })
  names(df) <- col_names
  # Assemble the data frame by hand, as the rvest original does.
  class(df) <- "data.frame"
  attr(df, "row.names") <- .set_row_names(length(df[[1]]))
  df
}
my_html_table或稍微省力一点:
library(rvest)
library(dplyr)
# Download and parse the tipping page once; the visible table and the hover
# titles are both read from this document.
pg <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
# Visible cell values of the first table on the page.
tips <- html_table(pg, header = TRUE)[[1]]
# One output row per table-body row: the text of the second cell followed by
# the `title` attribute of every `td.tooltip` cell in that row.
pg %>%
  html_nodes("tbody > tr") %>%
  lapply(function(tr) {
    label <- html_text(html_nodes(tr, "td"))[2]
    hover <- html_attr(html_nodes(tr, "td.tooltip"), "title")
    cbind.data.frame(t(c(label, hover)))
  }) %>%
  bind_rows()
## 1 2 3 4 5 6 7 8 9 10 11 12
## (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
## 1 Gilly2311 11 53 42 72 51 41 5 45 20 75 39
## 2 Harts 27 50 56 57 53 28 15 32 8 63 51
## 3 mygypsyrose 3 49 64 62 35 61 1 37 20 47 53
## 4 Scraggie_93 19 58 81 32 39 31 12 54 35 44 44
## 5 Deb1967 4 33 54 60 35 72 21 53 20 118 66
## 6 svolaris 6 52 45 76 50 24 19 45 19 64 58
## 7 dazza power 14 56 61 45 62 54 2 64 60 40 36
## 8 Flamingoflames 28 33 35 83 34 76 1 17 9 83 46
## 9 FEARTHEBEARD 27 34 47 59 50 59 6 50 5 54 38
## 10 Jules23 11 35 57 47 42 65 34 38 4 61 37
## .. ... ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: 13 (chr), 14 (chr), 15 (chr), 16 (chr), 17 (chr), 18 (chr), 19
## (chr), 20 (chr), 21 (chr), 22 (chr), 23 (chr), 24 (chr)
library(rvest)
library(dplyr)
pg 或者用更少的努力来做:
library(rvest)
library(dplyr)
# Download and parse the tipping page once; the visible table and the hover
# titles are both read from this document.
pg <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
# Visible cell values of the first table on the page.
tips <- html_table(pg, header = TRUE)[[1]]
# One output row per table-body row: the text of the second cell followed by
# the `title` attribute of every `td.tooltip` cell in that row.
pg %>%
  html_nodes("tbody > tr") %>%
  lapply(function(tr) {
    label <- html_text(html_nodes(tr, "td"))[2]
    hover <- html_attr(html_nodes(tr, "td.tooltip"), "title")
    cbind.data.frame(t(c(label, hover)))
  }) %>%
  bind_rows()
## 1 2 3 4 5 6 7 8 9 10 11 12
## (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
## 1 Gilly2311 11 53 42 72 51 41 5 45 20 75 39
## 2 Harts 27 50 56 57 53 28 15 32 8 63 51
## 3 mygypsyrose 3 49 64 62 35 61 1 37 20 47 53
## 4 Scraggie_93 19 58 81 32 39 31 12 54 35 44 44
## 5 Deb1967 4 33 54 60 35 72 21 53 20 118 66
## 6 svolaris 6 52 45 76 50 24 19 45 19 64 58
## 7 dazza power 14 56 61 45 62 54 2 64 60 40 36
## 8 Flamingoflames 28 33 35 83 34 76 1 17 9 83 46
## 9 FEARTHEBEARD 27 34 47 59 50 59 6 50 5 54 38
## 10 Jules23 11 35 57 47 42 65 34 38 4 61 37
## .. ... ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: 13 (chr), 14 (chr), 15 (chr), 16 (chr), 17 (chr), 18 (chr), 19
## (chr), 20 (chr), 21 (chr), 22 (chr), 23 (chr), 24 (chr)
library(rvest)
library(dplyr)
pg