rvest获取表中的html超链接
我正在尝试抓取超链接中的地理编码,并希望创建一个同时包含所有表格和地理编码的表。我现在所做的是使用以下代码获取一个表 rvest获取表中的html超链接,r,web-scraping,rvest,R,Web Scraping,Rvest,我正在尝试抓取超链接中的地理编码,并希望创建一个同时包含所有表格和地理编码的表。我现在所做的是使用以下代码获取一个表 library(rvest) url<-"http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html" citidata<- html(url) ta<- citidata %>% html_nodes("table") %>% .[1:29] %>% html_table(
library(rvest)

url <- "http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html"

# Download and parse the page. read_html() is the replacement for html(),
# which rvest deprecated (and recent rvest releases no longer provide).
citidata <- read_html(url)

# Convert the first 29 <table> nodes of the page into a list of data frames.
ta <- citidata %>%
  html_nodes("table") %>%
  .[1:29] %>%
  html_table()

# Stack the individual tables into one data frame.
dat <- do.call(rbind, lapply(ta, data.frame, stringsAsFactors = FALSE))

# Read the page heading and strip the fixed prefix so that only the
# location text remains.
citystate <- citidata %>%
  html_node("h1 span") %>%
  html_text()
citystate <- gsub("Fatal car crashes and road traffic accidents in ",
                  "", citystate)

# Split the location on "," into two columns (X1, X2). trimws() removes the
# whitespace that is otherwise left around the comma (e.g. a leading space
# in the second piece).
loc <- data.frame(matrix(trimws(unlist(strsplit(citystate, ",", fixed = TRUE))),
                         ncol = 2, byrow = TRUE))

# Attach the two pieces to every row of the table data.
dat$City <- loc$X1
dat$State <- loc$X2
然后我想把地理编码添加到数据框中,但不知道该怎么做。
下面是从超链接中抓取地理编码的代码:
# Download and parse the page. read_html() is the replacement for html(),
# which rvest deprecated (and recent rvest releases no longer provide).
pg <- read_html("http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html")

# Take the href attribute of the 31st-60th <a> nodes and remove the
# "javascript:showGoogleSView" prefix, leaving the remainder of each href.
geo <- data.frame(
  gsub("javascript:showGoogleSView", "",
       pg %>% html_nodes("a") %>% html_attr("href") %>% .[31:60])
)
pg%html\u attr(“href”)%%>%[31:60]))
并非所有事件都有对应的经纬度(lat/lon)坐标。以下代码利用事件日期(显然)唯一这一事实,
将坐标与先前构建的主数据框 dat 合并:
library(rvest)
library(stringr)
library(dplyr)

url <- "http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html"

# Get all incident tables -------------------------------------------------
# read_html() is the replacement for html(), which rvest deprecated.
citidata <- read_html(url)

ta <- citidata %>%
  html_nodes("table") %>%
  .[1:29] %>%
  html_table()

# rbind them together -----------------------------------------------------
dat <- do.call(rbind, lapply(ta, data.frame, stringsAsFactors = FALSE))

# Get city/state and add it to the data.frame -----------------------------
citystate <- citidata %>%
  html_node("h1 span") %>%
  html_text()
citystate <- gsub("Fatal car crashes and road traffic accidents in ",
                  "", citystate)

# trimws() removes the whitespace otherwise left around the comma after
# splitting (e.g. a leading space in the second piece).
loc <- data.frame(matrix(trimws(unlist(strsplit(citystate, ",", fixed = TRUE))),
                         ncol = 2, byrow = TRUE))
dat$City <- loc$X1
dat$State <- loc$X2

# Get GPS coords where available ------------------------------------------
# Each street-view link's href is reduced to its first run of digits, "-",
# "," and "." characters, which is then split on "," into two values per
# link and reshaped into one (lat, lon) row per link.
coords <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']") %>%
  html_attr("href") %>%
  str_extract("([[:digit:]-,\\.]+)") %>%
  str_split(",") %>%
  unlist() %>%
  matrix(ncol = 2, byrow = TRUE) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  rename(lat = X1, lon = X2) %>%
  mutate(lat = as.numeric(lat), lon = as.numeric(lon))

# Get GPS coordinates associated incident time for merge ------------------
# Uses the same parsed document as above (`citidata`); the text of the first
# <td> preceding each link's parent node is stored in a `Date` column.
coord_time <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']/../preceding-sibling::td[1]") %>%
  html_text() %>%
  tibble(Date = .)

# Merge the coordinates with the data.frame we built earlier --------------
# Left join keeps every row of `dat`; rows without a matching Date get NA
# for lat/lon. The key is named explicitly instead of being guessed.
left_join(dat, bind_cols(coords, coord_time), by = "Date")
库(rvest)
图书馆(stringr)
图书馆(dplyr)
url%
html_表()
#把它们绑在一起-----------------------------------------------------
dat%
html_text()
#获取城市/州并将其添加到data.frame-------------------------------
城市国家%
取消列表()%>%
矩阵(ncol=2,byrow=2)%>%
数据帧(stringsAsFactors=FALSE)%>%
重命名(纬度=X1,经度=X2)%>%
变异(lat=as.numeric(lat),lon=as.numeric(lon))
#获取与合并事件时间相关的GPS坐标------------------
协调时间%
html_节点(xpath=“//a[@class='showStreetViewLink']/../previous sibling::td[1]”%>%
html_text()%>%
数据帧(日期=)
#将坐标与前面构建的data.frame合并--------------
左连接(数据,绑定(坐标,坐标时间))
一个问题(最初)是:dat 有 98 行,而 geo 只有 30 行。
是的,并不是所有的数据都带有地理编码。
是的,有些坐标不可用,我想我可以在合并之前先把它们分开。但另一个问题是:如果坐标不是按顺序连续出现的(中间有缺失),我该如何将时间与坐标匹配?
library(rvest)
library(stringr)
library(dplyr)

url <- "http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html"

# Get all incident tables -------------------------------------------------
# read_html() is the replacement for html(), which rvest deprecated.
citidata <- read_html(url)

ta <- citidata %>%
  html_nodes("table") %>%
  .[1:29] %>%
  html_table()

# rbind them together -----------------------------------------------------
dat <- do.call(rbind, lapply(ta, data.frame, stringsAsFactors = FALSE))

# Get city/state and add it to the data.frame -----------------------------
citystate <- citidata %>%
  html_node("h1 span") %>%
  html_text()
citystate <- gsub("Fatal car crashes and road traffic accidents in ",
                  "", citystate)

# trimws() removes the whitespace otherwise left around the comma after
# splitting (e.g. a leading space in the second piece).
loc <- data.frame(matrix(trimws(unlist(strsplit(citystate, ",", fixed = TRUE))),
                         ncol = 2, byrow = TRUE))
dat$City <- loc$X1
dat$State <- loc$X2

# Get GPS coords where available ------------------------------------------
# Each street-view link's href is reduced to its first run of digits, "-",
# "," and "." characters, which is then split on "," into two values per
# link and reshaped into one (lat, lon) row per link.
coords <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']") %>%
  html_attr("href") %>%
  str_extract("([[:digit:]-,\\.]+)") %>%
  str_split(",") %>%
  unlist() %>%
  matrix(ncol = 2, byrow = TRUE) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  rename(lat = X1, lon = X2) %>%
  mutate(lat = as.numeric(lat), lon = as.numeric(lon))

# Get GPS coordinates associated incident time for merge ------------------
# Uses the same parsed document as above (`citidata`); the text of the first
# <td> preceding each link's parent node is stored in a `Date` column.
coord_time <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']/../preceding-sibling::td[1]") %>%
  html_text() %>%
  tibble(Date = .)

# Merge the coordinates with the data.frame we built earlier --------------
# Left join keeps every row of `dat`; rows without a matching Date get NA
# for lat/lon. The key is named explicitly instead of being guessed.
left_join(dat, bind_cols(coords, coord_time), by = "Date")