rvest获取表中的html超链接

rvest获取表中的html超链接,r,web-scraping,rvest,R,Web Scraping,Rvest,我正在尝试删除超链接中的地理代码,并希望创建一个包含所有表以及地理代码的表 我现在所做的是使用以下代码获取一个表 library(rvest) url<-"http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html" citidata<- html(url) ta<- citidata %>% html_nodes("table") %>% .[1:29] %>% html_table(

我正在尝试抓取超链接中的地理编码(经纬度),并希望创建一个既包含所有表格数据、又包含地理编码的表

我现在所做的是使用以下代码获取一个表

# Scrape every incident table from the city-data accident page and tag each
# row with the city and state taken from the page heading.
library(rvest)

url <- "http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html"

# read_html() replaces html(), which was deprecated in rvest 0.3.0 and has
# since been removed from the package.
citidata <- read_html(url)

# The first 29 <table> nodes are taken as the incident tables; the count is
# hard-coded for this particular page.
ta <- citidata %>%
  html_nodes("table") %>%
  .[1:29] %>%
  html_table()

# Stack the individual tables into a single data.frame.
dat <- do.call(rbind, lapply(ta, data.frame, stringsAsFactors = FALSE))

# Heading text; the fixed prefix is stripped so only "City, State" remains.
citystate <- citidata %>%
  html_node("h1 span") %>%
  html_text()

citystate <- gsub("Fatal car crashes and road traffic accidents in ",
                  "", citystate)

# Split on the comma. trimws() removes the blank that follows the comma so
# State does not start with a space.
loc <- data.frame(
  matrix(trimws(unlist(strsplit(citystate, ",", fixed = TRUE))),
         ncol = 2, byrow = TRUE),
  stringsAsFactors = FALSE
)

# loc has a single row, so both values are recycled over every row of dat.
dat$City <- loc$X1
dat$State <- loc$X2
然后我尝试将地理编码添加到数据框中,但不知道该怎么做

下面是抓取超链接(hyperlink)中地理编码的代码

# Pull the hrefs that call showGoogleSView(...) and strip the function name.
# read_html() replaces html(), which was deprecated in rvest 0.3.0 and has
# since been removed from the package.
pg <- read_html("http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html")

# Links 31 to 60 are selected by position (hard-coded for this page).
# NOTE(review): positional selection breaks if the page layout changes.
# The expression inside data.frame() is kept unchanged so the automatically
# derived column name stays the same.
geo <- data.frame(
  gsub(
    "javascript:showGoogleSView", "",
    pg %>% html_nodes("a") %>% html_attr("href") %>% .[31:60]
  )
)
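pg <- read_html("http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html")
geo <- data.frame(gsub("javascript:showGoogleSView", "", pg %>% html_nodes("a") %>% html_attr("href") %>% .[31:60]))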

并非所有事件都有对应的经纬度(lat/lon)坐标对。以下代码利用事件日期(显然)唯一这一事实,按日期将坐标与先前构建的主数据框 `dat` 合并:

# Scrape the incident tables, tag them with city/state, then attach GPS
# coordinates to the incidents that have a street-view link.
library(rvest)
library(stringr)
library(dplyr)

url <- "http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html"

# Get all incident tables -------------------------------------------------

# read_html() replaces html(), which was deprecated in rvest 0.3.0 and has
# since been removed from the package.
citidata <- read_html(url)

# The first 29 <table> nodes are taken as the incident tables; the count is
# hard-coded for this particular page.
ta <- citidata %>%
  html_nodes("table") %>%
  .[1:29] %>%
  html_table()

# rbind them together -----------------------------------------------------

dat <- do.call(rbind, lapply(ta, data.frame, stringsAsFactors = FALSE))

# Get city/state and add it to the data.frame -----------------------------

citystate <- citidata %>%
  html_node("h1 span") %>%
  html_text()

citystate <- gsub("Fatal car crashes and road traffic accidents in ",
                  "", citystate)

# trimws() removes the blank that follows the comma so State does not start
# with a space.
loc <- data.frame(
  matrix(trimws(unlist(strsplit(citystate, ",", fixed = TRUE))),
         ncol = 2, byrow = TRUE),
  stringsAsFactors = FALSE
)

# loc has a single row, so both values are recycled over every row of dat.
dat$City <- loc$X1
dat$State <- loc$X2

# Get GPS coords where available ------------------------------------------

# Each street-view href yields one "lat,lon" string, which is split into two
# numeric columns. byrow must be a logical: the original passed 2.
coords <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']") %>%
  html_attr("href") %>%
  str_extract("([[:digit:]-,\\.]+)") %>%
  str_split(",") %>%
  unlist() %>%
  matrix(ncol = 2, byrow = TRUE) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  rename(lat = X1, lon = X2) %>%
  mutate(lat = as.numeric(lat), lon = as.numeric(lon))

# Get GPS coordinates associated incident time for merge ------------------

# Uses the same link selection as coords, so rows line up one-to-one. Reads
# from citidata: the original piped from `pg`, which this script never
# defines. data.frame() stands in for the deprecated data_frame().
coord_time <- citidata %>%
  html_nodes(
    xpath = "//a[@class='showStreetViewLink']/../preceding-sibling::td[1]"
  ) %>%
  html_text() %>%
  data.frame(Date = ., stringsAsFactors = FALSE)

# Merge the coordinates with the data.frame we built earlier --------------

# Incidents without a street-view link keep NA for lat/lon.
# NOTE(review): assumes the scraped tables expose the incident date in a
# column named "Date" - confirm against names(dat).
left_join(dat, bind_cols(coords, coord_time), by = "Date")
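library(rvest)
library(stringr)
library(dplyr)
url <- "http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html"
# Get all incident tables -------------------------------------------------
citidata <- read_html(url)
ta <- citidata %>%
  html_nodes("table") %>%
  .[1:29] %>%
  html_table()
# rbind them together -----------------------------------------------------
dat <- do.call(rbind, lapply(ta, data.frame, stringsAsFactors = FALSE))
citystate <- citidata %>%
  html_node("h1 span") %>%
  html_text()
# Get city/state and add it to the data.frame -----------------------------
citystate <- gsub("Fatal car crashes and road traffic accidents in ",
                  "", citystate)
loc <- data.frame(matrix(unlist(strsplit(citystate, ",", fixed = TRUE)),
                         ncol = 2, byrow = TRUE))
dat$City <- loc$X1
dat$State <- loc$X2
# Get GPS coords where available ------------------------------------------
coords <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']") %>%
  html_attr("href") %>%
  str_extract("([[:digit:]-,\\.]+)") %>%
  str_split(",") %>%
  unlist() %>%
  matrix(ncol = 2, byrow = TRUE) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  rename(lat = X1, lon = X2) %>%
  mutate(lat = as.numeric(lat), lon = as.numeric(lon))
# Get GPS coordinates associated incident time for merge ------------------
coord_time <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']/../preceding-sibling::td[1]") %>%
  html_text() %>%
  data.frame(Date = ., stringsAsFactors = FALSE)
# Merge the coordinates with the data.frame we built earlier --------------
left_join(dat, bind_cols(coords, coord_time), by = "Date")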

一个问题(最初)是:`dat` 有 98 行,而 `geo` 只有 30 行。

是的,并非所有数据都带有地理编码。

是的,有一些坐标不可用,我想我可以在合并之前先把它们分开。但另一个问题是,如果坐标不是连续的(中间有缺失),我该如何将时间与坐标匹配?
# Scrape the incident tables, tag them with city/state, then attach GPS
# coordinates to the incidents that have a street-view link.
library(rvest)
library(stringr)
library(dplyr)

url <- "http://www.city-data.com/accidents/acc-Nashua-New-Hampshire.html"

# Get all incident tables -------------------------------------------------

# read_html() replaces html(), which was deprecated in rvest 0.3.0 and has
# since been removed from the package.
citidata <- read_html(url)

# The first 29 <table> nodes are taken as the incident tables; the count is
# hard-coded for this particular page.
ta <- citidata %>%
  html_nodes("table") %>%
  .[1:29] %>%
  html_table()

# rbind them together -----------------------------------------------------

dat <- do.call(rbind, lapply(ta, data.frame, stringsAsFactors = FALSE))

# Get city/state and add it to the data.frame -----------------------------

citystate <- citidata %>%
  html_node("h1 span") %>%
  html_text()

citystate <- gsub("Fatal car crashes and road traffic accidents in ",
                  "", citystate)

# trimws() removes the blank that follows the comma so State does not start
# with a space.
loc <- data.frame(
  matrix(trimws(unlist(strsplit(citystate, ",", fixed = TRUE))),
         ncol = 2, byrow = TRUE),
  stringsAsFactors = FALSE
)

# loc has a single row, so both values are recycled over every row of dat.
dat$City <- loc$X1
dat$State <- loc$X2

# Get GPS coords where available ------------------------------------------

# Each street-view href yields one "lat,lon" string, which is split into two
# numeric columns. byrow must be a logical: the original passed 2.
coords <- citidata %>%
  html_nodes(xpath = "//a[@class='showStreetViewLink']") %>%
  html_attr("href") %>%
  str_extract("([[:digit:]-,\\.]+)") %>%
  str_split(",") %>%
  unlist() %>%
  matrix(ncol = 2, byrow = TRUE) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  rename(lat = X1, lon = X2) %>%
  mutate(lat = as.numeric(lat), lon = as.numeric(lon))

# Get GPS coordinates associated incident time for merge ------------------

# Uses the same link selection as coords, so rows line up one-to-one. Reads
# from citidata: the original piped from `pg`, which this script never
# defines. data.frame() stands in for the deprecated data_frame().
coord_time <- citidata %>%
  html_nodes(
    xpath = "//a[@class='showStreetViewLink']/../preceding-sibling::td[1]"
  ) %>%
  html_text() %>%
  data.frame(Date = ., stringsAsFactors = FALSE)

# Merge the coordinates with the data.frame we built earlier --------------

# Incidents without a street-view link keep NA for lat/lon.
# NOTE(review): assumes the scraped tables expose the incident date in a
# column named "Date" - confirm against names(dat).
left_join(dat, bind_cols(coords, coord_time), by = "Date")