R：使用rvest抓取表中每个单元格的悬停文本标题_R

R：使用rvest抓取表中每个单元格的悬停文本标题

R：使用rvest抓取表中每个单元格的悬停文本标题,r,R,我使用rvest从一些javascript表（比如这里的表）中提取数据。如您所见，在该表中，每个单元格都有一个值，并且当您将鼠标悬停在上面时，还有另一个附加值。我可以用rvest抓取该表格，就像这样： tips <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd") %>% html_table(header=TRUE) 但我不确定如……

我使用rvest从一些javascript表（比如这里的表）中提取数据

如您所见，在该表中，每个单元格都有一个值，并且当您将鼠标悬停在上面时，还有另一个附加值

我可以用rvest抓取该表格，就像这样：

# Fetch the competition page and parse its HTML tables, treating the first
# row of each table as the header.
tips <- html_table(
  read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd"),
  header = TRUE
)

tips <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd") %>%
   html_table(header=TRUE)

但我不确定如何获取悬停值。我可以用rvest实现这一点吗？

您可以扩展/修改 `rvest:::html_table.xml_node` 方法，如下所示（说明见代码中的R注释）：

#' Build a data frame from an HTML table using a cell attribute as the value
#'
#' Adaptation of rvest's `html_table()` method for a single node: instead of
#' the visible cell text, every cell contributes the value of the attribute
#' named by `attr_name` (for example a hover `title`).
#'
#' @param x A node whose descendants include the table's `tr` rows.
#' @param header Use the first row as column names? `NA` (the default) means
#'   "yes, if every cell of the first row is a `th`".
#' @param trim Strip leading and trailing whitespace from extracted values?
#' @param fill If rows have differing column counts, pad short rows with `NA`
#'   (`TRUE`) instead of stopping with an error (`FALSE`).
#' @param dec Decimal mark forwarded to `utils::type.convert()`.
#' @param attr_name Name of the attribute read from every cell. With the
#'   default `""` the visible cell text is used instead.
#' @return A data frame with one column per table column, each converted with
#'   `utils::type.convert(as.is = TRUE)`. A node without any `tr` yields an
#'   empty data frame.
my_html_table <- function(x, header = NA, trim = TRUE, fill = FALSE,
                          dec = ".", attr_name = "") {
  rows <- html_nodes(x, "tr")
  n <- length(rows)
  if (n == 0L) {
    # Nothing to tabulate; without this guard `matrix()` below fails on a
    # zero-length `ncol`.
    return(data.frame())
  }

  cells <- lapply(rows, html_nodes, xpath = ".//td|.//th")
  ncols <- lapply(cells, html_attr, "colspan", default = "1")
  ncols <- lapply(ncols, as.integer)
  p <- unique(vapply(ncols, sum, integer(1)))
  if (length(p) > 1) {
    if (!fill) {
      stop("Table has inconsistent number of columns. ",
           "Do you want fill = TRUE?", call. = FALSE)
    }
    p <- max(p)
  }

  cell_text <- function(nodes) html_text(nodes, trim = trim)
  cell_value <- function(nodes) {
    if (!nzchar(attr_name)) {
      # No attribute requested: fall back to the visible text rather than
      # returning a table made only of NA.
      return(cell_text(nodes))
    }
    value <- html_attr(nodes, attr_name)
    # `trim` used to be accepted but silently ignored.
    if (trim) trimws(value) else value
  }
  values <- lapply(cells, cell_value)

  if (is.na(header)) {
    header <- all(html_name(cells[[1]]) == "th")
  }
  if (header) {
    # Header cells often lack the requested attribute; use their text there
    # so the resulting column names are not NA.
    head_values <- values[[1]]
    unnamed <- is.na(head_values)
    head_values[unnamed] <- cell_text(cells[[1]])[unnamed]
    values[[1]] <- head_values
  }

  out <- matrix(NA_character_, nrow = n, ncol = p)
  for (i in seq_len(n)) {
    row <- values[[i]]
    span <- ncols[[i]]
    col <- 1
    for (j in seq_len(min(p, length(row)))) {
      out[i, col] <- row[[j]]
      # A cell with colspan k occupies k columns; the skipped ones stay NA.
      col <- col + span[j]
    }
  }

  if (header) {
    col_names <- out[1, ]
    out <- out[-1, , drop = FALSE]
  } else {
    col_names <- paste0("X", seq_len(ncol(out)))
  }

  df <- lapply(seq_len(p), function(i) {
    utils::type.convert(out[, i], as.is = TRUE, dec = dec)
  })
  names(df) <- col_names
  class(df) <- "data.frame"
  # Use nrow(out) rather than length(df[[1]]) so a zero-column table works.
  attr(df, "row.names") <- .set_row_names(nrow(out))
  df
}

您可以扩展/修改 `rvest:::html_table.xml_node` 方法，如下所示：

说明见代码中的R注释。
#' Build a data frame from an HTML table using a cell attribute as the value
#'
#' Adaptation of rvest's `html_table()` method for a single node: instead of
#' the visible cell text, every cell contributes the value of the attribute
#' named by `attr_name` (for example a hover `title`).
#'
#' @param x A node whose descendants include the table's `tr` rows.
#' @param header Use the first row as column names? `NA` (the default) means
#'   "yes, if every cell of the first row is a `th`".
#' @param trim Strip leading and trailing whitespace from extracted values?
#' @param fill If rows have differing column counts, pad short rows with `NA`
#'   (`TRUE`) instead of stopping with an error (`FALSE`).
#' @param dec Decimal mark forwarded to `utils::type.convert()`.
#' @param attr_name Name of the attribute read from every cell. With the
#'   default `""` the visible cell text is used instead.
#' @return A data frame with one column per table column, each converted with
#'   `utils::type.convert(as.is = TRUE)`. A node without any `tr` yields an
#'   empty data frame.
my_html_table <- function(x, header = NA, trim = TRUE, fill = FALSE,
                          dec = ".", attr_name = "") {
  rows <- html_nodes(x, "tr")
  n <- length(rows)
  if (n == 0L) {
    # Nothing to tabulate; without this guard `matrix()` below fails on a
    # zero-length `ncol`.
    return(data.frame())
  }

  cells <- lapply(rows, html_nodes, xpath = ".//td|.//th")
  ncols <- lapply(cells, html_attr, "colspan", default = "1")
  ncols <- lapply(ncols, as.integer)
  p <- unique(vapply(ncols, sum, integer(1)))
  if (length(p) > 1) {
    if (!fill) {
      stop("Table has inconsistent number of columns. ",
           "Do you want fill = TRUE?", call. = FALSE)
    }
    p <- max(p)
  }

  cell_text <- function(nodes) html_text(nodes, trim = trim)
  cell_value <- function(nodes) {
    if (!nzchar(attr_name)) {
      # No attribute requested: fall back to the visible text rather than
      # returning a table made only of NA.
      return(cell_text(nodes))
    }
    value <- html_attr(nodes, attr_name)
    # `trim` used to be accepted but silently ignored.
    if (trim) trimws(value) else value
  }
  values <- lapply(cells, cell_value)

  if (is.na(header)) {
    header <- all(html_name(cells[[1]]) == "th")
  }
  if (header) {
    # Header cells often lack the requested attribute; use their text there
    # so the resulting column names are not NA.
    head_values <- values[[1]]
    unnamed <- is.na(head_values)
    head_values[unnamed] <- cell_text(cells[[1]])[unnamed]
    values[[1]] <- head_values
  }

  out <- matrix(NA_character_, nrow = n, ncol = p)
  for (i in seq_len(n)) {
    row <- values[[i]]
    span <- ncols[[i]]
    col <- 1
    for (j in seq_len(min(p, length(row)))) {
      out[i, col] <- row[[j]]
      # A cell with colspan k occupies k columns; the skipped ones stay NA.
      col <- col + span[j]
    }
  }

  if (header) {
    col_names <- out[1, ]
    out <- out[-1, , drop = FALSE]
  } else {
    col_names <- paste0("X", seq_len(ncol(out)))
  }

  df <- lapply(seq_len(p), function(i) {
    utils::type.convert(out[, i], as.is = TRUE, dec = dec)
  })
  names(df) <- col_names
  class(df) <- "data.frame"
  # Use nrow(out) rather than length(df[[1]]) so a zero-column table works.
  attr(df, "row.names") <- .set_row_names(nrow(out))
  df
}

或者稍微省力一点：
library(rvest)
library(dplyr)

# Parse the page once; keep the first table's visible values in `tips`.
pg <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
tips <- html_table(pg, header = TRUE)[[1]]

# For every body row build a one-row data frame made of the text of the
# row's second <td> followed by the `title` attribute of each td.tooltip
# cell, then stack the rows.
bind_rows(lapply(html_nodes(pg, "tbody > tr"), function(tr) {
  label <- html_text(html_nodes(tr, "td"))[2]
  hover <- html_attr(html_nodes(tr, "td.tooltip"), "title")
  cbind.data.frame(t(c(label, hover)))
}))

##                 1     2     3     4     5     6     7     8     9    10    11    12
##             (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
## 1       Gilly2311    11    53    42    72    51    41     5    45    20    75    39
## 2           Harts    27    50    56    57    53    28    15    32     8    63    51
## 3     mygypsyrose     3    49    64    62    35    61     1    37    20    47    53
## 4     Scraggie_93    19    58    81    32    39    31    12    54    35    44    44
## 5         Deb1967     4    33    54    60    35    72    21    53    20   118    66
## 6        svolaris     6    52    45    76    50    24    19    45    19    64    58
## 7     dazza power    14    56    61    45    62    54     2    64    60    40    36
## 8  Flamingoflames    28    33    35    83    34    76     1    17     9    83    46
## 9    FEARTHEBEARD    27    34    47    59    50    59     6    50     5    54    38
## 10        Jules23    11    35    57    47    42    65    34    38     4    61    37
## ..            ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...
## Variables not shown: 13 (chr), 14 (chr), 15 (chr), 16 (chr), 17 (chr), 18 (chr), 19
##   (chr), 20 (chr), 21 (chr), 22 (chr), 23 (chr), 24 (chr)

library(rvest)
library(dplyr)

或者用更少的努力来做：
library(rvest)
library(dplyr)

# Parse the page once; keep the first table's visible values in `tips`.
pg <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
tips <- html_table(pg, header = TRUE)[[1]]

# For every body row build a one-row data frame made of the text of the
# row's second <td> followed by the `title` attribute of each td.tooltip
# cell, then stack the rows.
bind_rows(lapply(html_nodes(pg, "tbody > tr"), function(tr) {
  label <- html_text(html_nodes(tr, "td"))[2]
  hover <- html_attr(html_nodes(tr, "td.tooltip"), "title")
  cbind.data.frame(t(c(label, hover)))
}))

##                 1     2     3     4     5     6     7     8     9    10    11    12
##             (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
## 1       Gilly2311    11    53    42    72    51    41     5    45    20    75    39
## 2           Harts    27    50    56    57    53    28    15    32     8    63    51
## 3     mygypsyrose     3    49    64    62    35    61     1    37    20    47    53
## 4     Scraggie_93    19    58    81    32    39    31    12    54    35    44    44
## 5         Deb1967     4    33    54    60    35    72    21    53    20   118    66
## 6        svolaris     6    52    45    76    50    24    19    45    19    64    58
## 7     dazza power    14    56    61    45    62    54     2    64    60    40    36
## 8  Flamingoflames    28    33    35    83    34    76     1    17     9    83    46
## 9    FEARTHEBEARD    27    34    47    59    50    59     6    50     5    54    38
## 10        Jules23    11    35    57    47    42    65    34    38     4    61    37
## ..            ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...
## Variables not shown: 13 (chr), 14 (chr), 15 (chr), 16 (chr), 17 (chr), 18 (chr), 19
##   (chr), 20 (chr), 21 (chr), 22 (chr), 23 (chr), 24 (chr)

library(rvest)
library(dplyr)
pg