Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/80.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
如何刮<;div数据键“&引用&燃气轮机;在R_R_Rvest - Fatal编程技术网

如何在 R 中抓取 <div data-key="">

如何刮<;div数据键“&引用&燃气轮机;在R,r,rvest,R,Rvest,我想使用R来刮取下面的HTML文本。我知道我可以使用%HTML_节点(css)%%>%HTML_属性%>%ifelse(相同(,字符(0)),NA.)%%>% 未列出() 返回(txt) } URL如果您想获取数据键列表,请查找包含数据键属性的标记并提取它们: library(rvest) library(purrr) last_page <- 5 # change it to 1655 to scrape all pages datakeys <- flatten_chr(m

我想使用R来刮取下面的HTML文本。我知道我可以使用%HTML_节点(css)%%>%HTML_属性%>%ifelse(相同(,字符(0)),NA.)%%>% 未列出() 返回(txt) }
URL如果您想获取数据键列表,请查找包含
数据键属性的
标记并提取它们:

library(rvest)
library(purrr)

# Number of listing pages to scrape; change it to 1655 to scrape all pages.
last_page <- 5

# Collect the `data-key` attribute of every matching <div> on each listing
# page and flatten the per-page results into one character vector.
datakeys <-
  flatten_chr(map(
    # seq_len() gives an empty sequence when last_page is 0, whereas
    # 1:last_page would request pages 1 and 0.
    seq_len(last_page),
    function(i) {
      read_html(paste0("https://www.rijdendetreinen.nl/storingen?page=", i)) %>%
        # Direct <div> children of #w0 that carry a data-key attribute.
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]") %>%
        html_attr("data-key")
    }
  ))
库(rvest)
图书馆(purrr)
最后一页%
html_属性(“数据键”)
}
))
或者您可以直接刮取这些标记的数据:

library(rvest)
library(purrr)

# Number of listing pages to scrape; change it to 1655 to scrape all pages.
last_page <- 5

# Site root used both for the listing pages and to build item links.
base_url <- "https://www.rijdendetreinen.nl/"

# Trimmed text of the first node matching `xpath` inside each element of
# `items`; one value per item.
node_text <- function(items, xpath) {
  items %>%
    html_node(xpath = xpath) %>%
    html_text(trim = TRUE)
}

# Scrape every listing page and row-bind the per-page data frames.
data <-
  map_dfr(
    # seq_len() gives an empty sequence when last_page is 0, whereas
    # 1:last_page would request pages 1 and 0.
    seq_len(last_page),
    function(i) {
      items <- read_html(paste0(base_url, "storingen?page=", i)) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]")
      print(i) # progress indicator: page currently being processed

      href <- items %>%
        html_node(xpath = "./a[1]") %>%
        html_attr("href")
      # Build links only where an href exists. A bare paste0() would turn a
      # missing href into ".../NA" and would return a length-1 string for a
      # page with no items, which makes data.frame() fail on row counts.
      link <- rep(NA_character_, length(href))
      has_href <- !is.na(href)
      if (any(has_href)) {
        link[has_href] <- paste0(base_url, href[has_href])
      }

      data.frame(
        # Nearest <h4> heading that precedes the item among its siblings.
        date = node_text(items, "(./preceding-sibling::h4)[last()]"),
        datakey = items %>% html_attr("data-key"),
        link = link,
        title = node_text(
          items,
          "./descendant::div[@class='disruption-content']/descendant::span[@class='disruption-line']"
        ),
        description = node_text(
          items,
          "./descendant::div[@class='disruption-content']/descendant::em"
        ),
        timestamp = node_text(items, "./descendant::div[@class='timestamp']"),
        # Keep text columns as character on R < 4.0 as well.
        stringsAsFactors = FALSE
      )
    }
  )
库(rvest)
图书馆(purrr)
最后一页%html文本(修剪=T),
数据键=项%>%html\u属性(“数据键”),
链接=粘贴0(“https://www.rijdendetreinen.nl/,items%>%html\u节点(xpath=“./a[1]”%>%html\u属性(“href”),
title=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::span[@class='disruption-line']”)%>%html_文本(trim=T),
description=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::em”)%>%html_文本(trim=T),
timestamp=items%>%html\u节点(xpath=“./substant::div[@class='timestamp']”)%>%html\u文本(trim=T)
)
}
)

如果您想获取 data-key 列表,请找到包含 data-key 属性的 <div> 标记并提取它们:

library(rvest)
library(purrr)

# Number of listing pages to scrape; change it to 1655 to scrape all pages.
last_page <- 5

# For each listing page, read the `data-key` attribute of every matching
# <div>, then flatten the list of per-page vectors into one character vector.
datakeys <-
  flatten_chr(map(
    # seq_len() is safe for last_page == 0; 1:0 would iterate over c(1, 0).
    seq_len(last_page),
    function(i) {
      page_url <- paste0("https://www.rijdendetreinen.nl/storingen?page=", i)
      read_html(page_url) %>%
        # Direct <div> children of #w0 that carry a data-key attribute.
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]") %>%
        html_attr("data-key")
    }
  ))
库(rvest)
图书馆(purrr)
最后一页%
html_属性(“数据键”)
}
))
或者您可以直接刮取这些标记的数据:

library(rvest)
library(purrr)

# Number of listing pages to scrape; change it to 1655 to scrape all pages.
last_page <- 5

# Site root used both for the listing pages and to build item links.
base_url <- "https://www.rijdendetreinen.nl/"

# Trimmed text of the first node matching `xpath` inside each element of
# `items`; one value per item.
node_text <- function(items, xpath) {
  items %>%
    html_node(xpath = xpath) %>%
    html_text(trim = TRUE)
}

# Scrape every listing page and row-bind the per-page data frames.
data <-
  map_dfr(
    # seq_len() is safe for last_page == 0; 1:0 would iterate over c(1, 0).
    seq_len(last_page),
    function(i) {
      items <- read_html(paste0(base_url, "storingen?page=", i)) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]")
      print(i) # progress indicator: page currently being processed

      href <- items %>%
        html_node(xpath = "./a[1]") %>%
        html_attr("href")
      # Build links only where an href exists. A bare paste0() would turn a
      # missing href into ".../NA" and would return a length-1 string for a
      # page with no items, which makes data.frame() fail on row counts.
      link <- rep(NA_character_, length(href))
      has_href <- !is.na(href)
      if (any(has_href)) {
        link[has_href] <- paste0(base_url, href[has_href])
      }

      data.frame(
        # Nearest <h4> heading that precedes the item among its siblings.
        date = node_text(items, "(./preceding-sibling::h4)[last()]"),
        datakey = items %>% html_attr("data-key"),
        link = link,
        title = node_text(
          items,
          "./descendant::div[@class='disruption-content']/descendant::span[@class='disruption-line']"
        ),
        description = node_text(
          items,
          "./descendant::div[@class='disruption-content']/descendant::em"
        ),
        timestamp = node_text(items, "./descendant::div[@class='timestamp']"),
        # Keep text columns as character on R < 4.0 as well.
        stringsAsFactors = FALSE
      )
    }
  )
库(rvest)
图书馆(purrr)
最后一页%html文本(修剪=T),
数据键=项%>%html\u属性(“数据键”),
链接=粘贴0(“https://www.rijdendetreinen.nl/,items%>%html\u节点(xpath=“./a[1]”%>%html\u属性(“href”),
title=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::span[@class='disruption-line']”)%>%html_文本(trim=T),
description=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::em”)%>%html_文本(trim=T),
timestamp=items%>%html\u节点(xpath=“./substant::div[@class='timestamp']”)%>%html\u文本(trim=T)
)
}
)