Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/grails/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R:使用rvest抓取表中每个单元格的悬停文本标题_R - Fatal编程技术网

R:使用rvest抓取表中每个单元格的悬停文本标题

R:使用rvest抓取表中每个单元格的悬停文本标题,r,R,我使用rvest从一些javascript表(比如这里的表)中提取数据 如您所见,在该表中,每个单元格都有一个值,并且当您将鼠标悬停在上面时,还有另一个附加值 我可以用rvest刮桌子,就像这样: tips <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd") %>% html_table(header=TRUE) tips% html_表格(标题=TRUE) 但我不确定如

我使用rvest从一些javascript表(比如这里的表)中提取数据

如您所见,在该表中,每个单元格都有一个值,并且当您将鼠标悬停在上面时,还有另一个附加值

我可以用rvest抓取这张表,就像这样:

# Download the tipping page and parse its tables, using the first row of
# each table as the column names.
tips <- html_table(
  read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd"),
  header = TRUE
)
tips%
html_表格(标题=TRUE)

但我不确定如何获取悬停值。我可以用rvest实现这一点吗?

您可以扩展/修改
rvest:::html_table.xml_node
方法,如下所示:
(参见代码中的注释)

#' Read an HTML table into a data frame, using a cell attribute as the value
#'
#' Variant of rvest's `html_table()` method for a single node. The layout
#' logic (colspan handling, header detection, type conversion) is kept, but
#' each cell contributes the value of the attribute `attr_name` (for example
#' a hover `title`) instead of its visible text.
#'
#' @param x Node searched for `tr` elements (presumably a `<table>` node).
#' @param header `TRUE` to use the first row as column names, `FALSE` to
#'   generate `X1`, `X2`, ...; `NA` (default) treats the first row as a
#'   header when all of its cells are `th`.
#' @param trim Passed to `html_text()` when header cell text is used as a
#'   fallback column name.
#' @param fill If `FALSE`, rows with differing column counts are an error;
#'   if `TRUE`, the table is as wide as its widest row and short rows are
#'   padded with `NA`.
#' @param dec Decimal separator passed to `utils::type.convert()`.
#' @param attr_name Name of the attribute read from every cell. Cells that
#'   lack the attribute yield `NA`. NOTE(review): the default `""` is
#'   presumably never a real attribute, so callers are expected to set this.
#' @return A data frame with one row per (non-header) `tr` and one column
#'   per table column; an empty data frame when `x` contains no `tr`.
my_html_table <- function(x, header = NA, trim = TRUE, fill = FALSE,
                          dec = ".", attr_name = "") {
  rows <- html_nodes(x, "tr")
  n <- length(rows)
  # Without rows there is nothing to lay out; matrix() and cells[[1]] below
  # would otherwise fail.
  if (n == 0L) {
    return(data.frame())
  }

  cells <- lapply(rows, html_nodes, xpath = ".//td|.//th")
  # Width of every cell in table columns (colspan, defaulting to 1).
  ncols <- lapply(cells, function(row_cells) {
    as.integer(html_attr(row_cells, "colspan", default = "1"))
  })
  p <- unique(vapply(ncols, sum, integer(1)))
  if (length(p) > 1) {
    if (!fill) {
      stop("Table has inconsistent number of columns. ",
           "Do you want fill = TRUE?", call. = FALSE)
    }
    p <- max(p)
  }

  # Spread one row's cell values over p columns: each value goes in the
  # first column its cell covers, columns covered by a colspan stay NA.
  place_row <- function(vals, spans) {
    placed <- rep(NA_character_, p)
    col <- 1L
    for (j in seq_len(min(p, length(vals)))) {
      placed[col] <- vals[[j]]
      col <- col + spans[j]
    }
    placed
  }

  # The one deliberate difference from rvest's html_table(): read an
  # attribute instead of html_text(cell, trim = trim).
  values <- lapply(cells, html_attr, attr_name)
  out <- matrix(NA_character_, nrow = n, ncol = p)
  for (i in seq_len(n)) {
    out[i, ] <- place_row(values[[i]], ncols[[i]])
  }

  if (is.na(header)) {
    header <- all(html_name(cells[[1]]) == "th")
  }
  if (header) {
    col_names <- out[1, ]
    # Header cells usually do not carry the requested attribute, which would
    # leave the columns named NA; fall back to the header cell's text.
    header_text <- place_row(html_text(cells[[1]], trim = trim), ncols[[1]])
    unnamed <- is.na(col_names)
    col_names[unnamed] <- header_text[unnamed]
    out <- out[-1, , drop = FALSE]
  } else {
    col_names <- paste0("X", seq_len(ncol(out)))
  }

  df <- lapply(seq_len(p), function(i) {
    utils::type.convert(out[, i], as.is = TRUE, dec = dec)
  })
  names(df) <- col_names
  class(df) <- "data.frame"
  # nrow(out) rather than length(df[[1]]) so a table with zero columns does
  # not fail on df[[1]].
  attr(df, "row.names") <- .set_row_names(nrow(out))
  df
}

您可以扩展/修改
rvest:::html_table.xml_node
方法,如下所示:
(参见代码中的注释)

#' Read an HTML table into a data frame, using a cell attribute as the value
#'
#' Variant of rvest's `html_table()` method for a single node. The layout
#' logic (colspan handling, header detection, type conversion) is kept, but
#' each cell contributes the value of the attribute `attr_name` (for example
#' a hover `title`) instead of its visible text.
#'
#' @param x Node searched for `tr` elements (presumably a `<table>` node).
#' @param header `TRUE` to use the first row as column names, `FALSE` to
#'   generate `X1`, `X2`, ...; `NA` (default) treats the first row as a
#'   header when all of its cells are `th`.
#' @param trim Passed to `html_text()` when header cell text is used as a
#'   fallback column name.
#' @param fill If `FALSE`, rows with differing column counts are an error;
#'   if `TRUE`, the table is as wide as its widest row and short rows are
#'   padded with `NA`.
#' @param dec Decimal separator passed to `utils::type.convert()`.
#' @param attr_name Name of the attribute read from every cell. Cells that
#'   lack the attribute yield `NA`. NOTE(review): the default `""` is
#'   presumably never a real attribute, so callers are expected to set this.
#' @return A data frame with one row per (non-header) `tr` and one column
#'   per table column; an empty data frame when `x` contains no `tr`.
my_html_table <- function(x, header = NA, trim = TRUE, fill = FALSE,
                          dec = ".", attr_name = "") {
  rows <- html_nodes(x, "tr")
  n <- length(rows)
  # Without rows there is nothing to lay out; matrix() and cells[[1]] below
  # would otherwise fail.
  if (n == 0L) {
    return(data.frame())
  }

  cells <- lapply(rows, html_nodes, xpath = ".//td|.//th")
  # Width of every cell in table columns (colspan, defaulting to 1).
  ncols <- lapply(cells, function(row_cells) {
    as.integer(html_attr(row_cells, "colspan", default = "1"))
  })
  p <- unique(vapply(ncols, sum, integer(1)))
  if (length(p) > 1) {
    if (!fill) {
      stop("Table has inconsistent number of columns. ",
           "Do you want fill = TRUE?", call. = FALSE)
    }
    p <- max(p)
  }

  # Spread one row's cell values over p columns: each value goes in the
  # first column its cell covers, columns covered by a colspan stay NA.
  place_row <- function(vals, spans) {
    placed <- rep(NA_character_, p)
    col <- 1L
    for (j in seq_len(min(p, length(vals)))) {
      placed[col] <- vals[[j]]
      col <- col + spans[j]
    }
    placed
  }

  # The one deliberate difference from rvest's html_table(): read an
  # attribute instead of html_text(cell, trim = trim).
  values <- lapply(cells, html_attr, attr_name)
  out <- matrix(NA_character_, nrow = n, ncol = p)
  for (i in seq_len(n)) {
    out[i, ] <- place_row(values[[i]], ncols[[i]])
  }

  if (is.na(header)) {
    header <- all(html_name(cells[[1]]) == "th")
  }
  if (header) {
    col_names <- out[1, ]
    # Header cells usually do not carry the requested attribute, which would
    # leave the columns named NA; fall back to the header cell's text.
    header_text <- place_row(html_text(cells[[1]], trim = trim), ncols[[1]])
    unnamed <- is.na(col_names)
    col_names[unnamed] <- header_text[unnamed]
    out <- out[-1, , drop = FALSE]
  } else {
    col_names <- paste0("X", seq_len(ncol(out)))
  }

  df <- lapply(seq_len(p), function(i) {
    utils::type.convert(out[, i], as.is = TRUE, dec = dec)
  })
  names(df) <- col_names
  class(df) <- "data.frame"
  # nrow(out) rather than length(df[[1]]) so a table with zero columns does
  # not fail on df[[1]].
  attr(df, "row.names") <- .set_row_names(nrow(out))
  df
}

或者用稍微省力一点的做法:

library(rvest)
library(dplyr)

pg <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
tips <- html_table(pg, header = TRUE)[[1]]

# Build a one-row data frame per body row and stack them. Each row holds the
# text of the row's second cell followed by the `title` attribute of every
# `td.tooltip` cell (the hover values).
bind_rows(lapply(html_nodes(pg, "tbody > tr"), function(tr) {
  second_cell_text <- html_text(html_nodes(tr, "td"))[2]
  hover_titles <- html_attr(html_nodes(tr, "td.tooltip"), "title")
  cbind.data.frame(t(c(second_cell_text, hover_titles)))
}))

##                 1     2     3     4     5     6     7     8     9    10    11    12
##             (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
## 1       Gilly2311    11    53    42    72    51    41     5    45    20    75    39
## 2           Harts    27    50    56    57    53    28    15    32     8    63    51
## 3     mygypsyrose     3    49    64    62    35    61     1    37    20    47    53
## 4     Scraggie_93    19    58    81    32    39    31    12    54    35    44    44
## 5         Deb1967     4    33    54    60    35    72    21    53    20   118    66
## 6        svolaris     6    52    45    76    50    24    19    45    19    64    58
## 7     dazza power    14    56    61    45    62    54     2    64    60    40    36
## 8  Flamingoflames    28    33    35    83    34    76     1    17     9    83    46
## 9    FEARTHEBEARD    27    34    47    59    50    59     6    50     5    54    38
## 10        Jules23    11    35    57    47    42    65    34    38     4    61    37
## ..            ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...
## Variables not shown: 13 (chr), 14 (chr), 15 (chr), 16 (chr), 17 (chr), 18 (chr), 19
##   (chr), 20 (chr), 21 (chr), 22 (chr), 23 (chr), 24 (chr)
库(rvest)
图书馆(dplyr)

或者用更省力的方法来做:

library(rvest)
library(dplyr)

pg <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
tips <- html_table(pg, header = TRUE)[[1]]

# Build a one-row data frame per body row and stack them. Each row holds the
# text of the row's second cell followed by the `title` attribute of every
# `td.tooltip` cell (the hover values).
bind_rows(lapply(html_nodes(pg, "tbody > tr"), function(tr) {
  second_cell_text <- html_text(html_nodes(tr, "td"))[2]
  hover_titles <- html_attr(html_nodes(tr, "td.tooltip"), "title")
  cbind.data.frame(t(c(second_cell_text, hover_titles)))
}))

##                 1     2     3     4     5     6     7     8     9    10    11    12
##             (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
## 1       Gilly2311    11    53    42    72    51    41     5    45    20    75    39
## 2           Harts    27    50    56    57    53    28    15    32     8    63    51
## 3     mygypsyrose     3    49    64    62    35    61     1    37    20    47    53
## 4     Scraggie_93    19    58    81    32    39    31    12    54    35    44    44
## 5         Deb1967     4    33    54    60    35    72    21    53    20   118    66
## 6        svolaris     6    52    45    76    50    24    19    45    19    64    58
## 7     dazza power    14    56    61    45    62    54     2    64    60    40    36
## 8  Flamingoflames    28    33    35    83    34    76     1    17     9    83    46
## 9    FEARTHEBEARD    27    34    47    59    50    59     6    50     5    54    38
## 10        Jules23    11    35    57    47    42    65    34    38     4    61    37
## ..            ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...
## Variables not shown: 13 (chr), 14 (chr), 15 (chr), 16 (chr), 17 (chr), 18 (chr), 19
##   (chr), 20 (chr), 21 (chr), 22 (chr), 23 (chr), 24 (chr)
库(rvest)
图书馆(dplyr)
pg