如何在 R 中抓取 <div data-key="…">
我想使用R来刮取下面的HTML文本。我知道我可以使用%HTML_节点(css)%%>%HTML_属性%>%ifelse(相同(,字符(0)),NA.)%%>% 未列出() 返回(txt) }如何刮<;div数据键“&引用&燃气轮机;在R,r,rvest,R,Rvest,我想使用R来刮取下面的HTML文本。我知道我可以使用%HTML_节点(css)%%>%HTML_属性%>%ifelse(相同(,字符(0)),NA.)%%>% 未列出() 返回(txt) } URL如果您想获取数据键列表,请查找包含数据键属性的标记并提取它们: library(rvest) library(purrr) last_page <- 5 # change it to 1655 to scrape all pages datakeys <- flatten_chr(m
如果您想获取 data-key 列表,请查找包含 data-key 属性的 <div> 标记并提取它们:
library(rvest)
library(purrr)
# Number of listing pages to fetch.
last_page <- 5 # change it to 1655 to scrape all pages

# For each page, read the HTML, select every <div> carrying a `data-key`
# attribute directly under div#w0, and pull out that attribute. The per-page
# character vectors are flattened into one character vector.
datakeys <-
  flatten_chr(map(
    # seq_len() gives an empty sequence when last_page is 0, whereas
    # 1:last_page would iterate over c(1, 0).
    seq_len(last_page),
    function(i) {
      read_html(paste0("https://www.rijdendetreinen.nl/storingen?page=", i)) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]") %>%
        html_attr("data-key")
    }
  ))
库(rvest)
图书馆(purrr)
最后一页%
html_属性(“数据键”)
}
))
或者,您也可以直接抓取这些标记中的数据:
library(rvest)
library(purrr)
# Number of listing pages to fetch.
last_page <- 5 # change it to 1655 to scrape all pages

# Scrape each listing page into a data frame with one row per <div> that has
# a `data-key` attribute under div#w0, then row-bind all pages together.
data <-
  map_dfr(
    # seq_len() gives an empty sequence when last_page is 0, whereas
    # 1:last_page would iterate over c(1, 0).
    seq_len(last_page),
    function(i) {
      items <-
        read_html(paste0("https://www.rijdendetreinen.nl/storingen?page=", i)) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]")
      print(i) # progress indicator: page number just fetched

      # A page without matching nodes must be skipped: paste0() below returns
      # a length-1 string for zero-length input, which would make data.frame()
      # fail on differing row counts. map_dfr() drops NULL results.
      if (length(items) == 0) {
        return(NULL)
      }

      # Trimmed text of the first node matching `xpath` relative to each item
      # (NA where an item has no match).
      text_at <- function(xpath) {
        items %>%
          html_node(xpath = xpath) %>%
          html_text(trim = TRUE)
      }

      data.frame(
        # Nearest <h4> sibling preceding the item.
        date = text_at("(./preceding-sibling::h4)[last()]"),
        datakey = items %>% html_attr("data-key"),
        link = paste0(
          "https://www.rijdendetreinen.nl/",
          items %>% html_node(xpath = "./a[1]") %>% html_attr("href")
        ),
        title = text_at(
          "./descendant::div[@class='disruption-content']/descendant::span[@class='disruption-line']"
        ),
        description = text_at(
          "./descendant::div[@class='disruption-content']/descendant::em"
        ),
        timestamp = text_at("./descendant::div[@class='timestamp']")
      )
    }
  )
库(rvest)
图书馆(purrr)
最后一页%html文本(修剪=T),
数据键=项%>%html\u属性(“数据键”),
链接=粘贴0(“https://www.rijdendetreinen.nl/,items%>%html\u节点(xpath=“./a[1]”%>%html\u属性(“href”),
title=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::span[@class='disruption-line']”)%>%html_文本(trim=T),
description=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::em”)%>%html_文本(trim=T),
timestamp=items%>%html\u节点(xpath=“./substant::div[@class='timestamp']”)%>%html\u文本(trim=T)
)
}
)
如果您想获取 data-key 列表,请查找包含 data-key 属性的 <div> 标记并提取它们:
library(rvest)
library(purrr)
# Number of listing pages to fetch.
last_page <- 5 # change it to 1655 to scrape all pages

# For each page, read the HTML, select every <div> carrying a `data-key`
# attribute directly under div#w0, and pull out that attribute. The per-page
# character vectors are flattened into one character vector.
datakeys <-
  flatten_chr(map(
    # seq_len() gives an empty sequence when last_page is 0, whereas
    # 1:last_page would iterate over c(1, 0).
    seq_len(last_page),
    function(i) {
      read_html(paste0("https://www.rijdendetreinen.nl/storingen?page=", i)) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]") %>%
        html_attr("data-key")
    }
  ))
库(rvest)
图书馆(purrr)
最后一页%
html_属性(“数据键”)
}
))
或者,您也可以直接抓取这些标记中的数据:
library(rvest)
library(purrr)
# Number of listing pages to fetch.
last_page <- 5 # change it to 1655 to scrape all pages

# Scrape each listing page into a data frame with one row per <div> that has
# a `data-key` attribute under div#w0, then row-bind all pages together.
data <-
  map_dfr(
    # seq_len() gives an empty sequence when last_page is 0, whereas
    # 1:last_page would iterate over c(1, 0).
    seq_len(last_page),
    function(i) {
      items <-
        read_html(paste0("https://www.rijdendetreinen.nl/storingen?page=", i)) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]")
      print(i) # progress indicator: page number just fetched

      # A page without matching nodes must be skipped: paste0() below returns
      # a length-1 string for zero-length input, which would make data.frame()
      # fail on differing row counts. map_dfr() drops NULL results.
      if (length(items) == 0) {
        return(NULL)
      }

      # Trimmed text of the first node matching `xpath` relative to each item
      # (NA where an item has no match).
      text_at <- function(xpath) {
        items %>%
          html_node(xpath = xpath) %>%
          html_text(trim = TRUE)
      }

      data.frame(
        # Nearest <h4> sibling preceding the item.
        date = text_at("(./preceding-sibling::h4)[last()]"),
        datakey = items %>% html_attr("data-key"),
        link = paste0(
          "https://www.rijdendetreinen.nl/",
          items %>% html_node(xpath = "./a[1]") %>% html_attr("href")
        ),
        title = text_at(
          "./descendant::div[@class='disruption-content']/descendant::span[@class='disruption-line']"
        ),
        description = text_at(
          "./descendant::div[@class='disruption-content']/descendant::em"
        ),
        timestamp = text_at("./descendant::div[@class='timestamp']")
      )
    }
  )
库(rvest)
图书馆(purrr)
最后一页%html文本(修剪=T),
数据键=项%>%html\u属性(“数据键”),
链接=粘贴0(“https://www.rijdendetreinen.nl/,items%>%html\u节点(xpath=“./a[1]”%>%html\u属性(“href”),
title=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::span[@class='disruption-line']”)%>%html_文本(trim=T),
description=items%>%html_节点(xpath=“./substant::div[@class='disruption-content']/substant::em”)%>%html_文本(trim=T),
timestamp=items%>%html\u节点(xpath=“./substant::div[@class='timestamp']”)%>%html\u文本(trim=T)
)
}
)