rvest |将数据拖入长格式

rvest |将数据拖入长格式,r,google-maps,data.table,dplyr,transformation,R,Google Maps,Data.table,Dplyr,Transformation,在抓取网页时,我遇到了以下问题,我认为可能有更好的解决方案: 拥有这些数据: dat <- data.frame(query = c("Washington, USA", "Frankfurt, Germany")) query 1 Washington, USA 2 Frankfurt, Germany 我现在要做的是: require(RCurl) require(rvest) require(magrittr) build_url <-

在抓取网页时,我遇到了以下问题,我认为可能有更好的解决方案:

拥有这些数据:

dat <- data.frame(query = c("Washington, USA", "Frankfurt, Germany"))

               query
1    Washington, USA
2 Frankfurt, Germany
我现在要做的是:

require(RCurl)
require(rvest)
require(magrittr)

# Build a geocoding request URL for the query `x`.
#
# x:        the address query; it is escaped with RCurl::curlEscape() so it can
#           be placed safely in a URL.
# base_url: prefix the escaped query is appended to (defaults to the Google
#           Maps geocoding XML endpoint).
# Returns the concatenation of `base_url` and the escaped query.
build_url <- function(x, base_url = "https://maps.googleapis.com/maps/api/geocode/xml?address=") {
  escaped_query <- RCurl::curlEscape(x)
  paste0(base_url, escaped_query)
}

# For every query, request the geocoding XML and collect the text of each
# <formatted_address> node into a data.frame with one row per address.
l <- lapply(dat$query, function(q) {
  request_url <- build_url(q)
  response <- read_xml(request_url)
  address_nodes <- xml_nodes(response, "formatted_address")
  data.frame(query = q, formatted_address = xml_text(address_nodes))
})

# Stack the per-query data.frames into a single long-format table
# (data.table::rbindlist would work here as well).
do.call(rbind, l)
require(RCurl)
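require(rvest)
require(magrittr)

build_url <- function(x, base_url = "https://maps.googleapis.com/maps/api/geocode/xml?address="){
  paste0(base_url, RCurl::curlEscape(x))
}

l <- lapply(dat$query, function(q){
  formatted_address <- q %>% build_url %>% read_xml %>% xml_nodes("formatted_address") %>% xml_text
  data.frame(query = q, formatted_address)
})
do.call(rbind, l) # This can be done via data.table::rbindlist as well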
有更好的解决办法吗?也许可以用更偏向 data.table / dplyr 风格的写法?

我已经编写了使用有效API密钥访问google maps API的软件包(因此,如果您的数据超过2500项,您可以购买API密钥)

要获取地址详细信息,请使用
google_geocode()

library(googleway)
key <- "your_api_key"

我编写了 googleway 包,它使用有效的 API 密钥访问 Google Maps API(因此,如果您的数据超过 2500 项,您可以为 API 密钥付费)

要获取地址详细信息,请使用
google_geocode()

library(googleway)

key <- "your_api_key"

请包括 library/require 调用,以确保代码的可重现性。
刚刚添加了 require 语句,并在创建 data.frame 时加上了 stringsAsFactors=FALSE。
我建议在 lapply 中添加 sleep,并确保将调用次数限制在 2500 次或更少(如果我没记错的话,IIRC)(信息)。

请包括 library/require 调用,以确保代码的可重现性。
刚刚添加了 require 语句,并在创建 data.frame 时加上了 stringsAsFactors=FALSE。
我建议在 lapply 中添加 sleep,并确保将调用次数限制在 2500 次或更少(如果我没记错的话,IIRC)(信息)。
require(RCurl)
require(rvest)
require(magrittr)

# Compose the request URL for one geocoding query.
#
# x:        the address query, escaped via RCurl::curlEscape() before use.
# base_url: URL prefix that the escaped query is appended to; defaults to the
#           Google Maps geocoding XML endpoint.
# Returns `base_url` followed by the escaped query.
build_url <- function(x, base_url = "https://maps.googleapis.com/maps/api/geocode/xml?address=") {
  query_part <- RCurl::curlEscape(x)
  paste0(base_url, query_part)
}

# Geocode each query: fetch the XML response, pull out every
# <formatted_address> node and pair its text with the originating query.
l <- lapply(dat$query, function(q) {
  xml_doc <- read_xml(build_url(q))
  matches <- xml_nodes(xml_doc, "formatted_address")
  data.frame(query = q, formatted_address = xml_text(matches))
})

# Bind the per-query results row-wise into one long table
# (data.table::rbindlist is an alternative).
do.call(rbind, l)
library(googleway)

## Placeholder: replace with a valid Google Maps API key.
key <- "your_api_key"

## stringsAsFactors = FALSE keeps `query` a character column on R < 4.0 too,
## so each address is handed to the geocoder as a plain string.
dat <- data.frame(query = c("Washington, USA", "Frankfurt, Germany"),
                  stringsAsFactors = FALSE)

## To get all the data: one geocoding request per query.
## lapply() over the column replaces apply() over the data.frame, which would
## first coerce the whole frame to a character matrix and pass a named vector
## as the address. The result is a list with one element per query.
res <- lapply(dat$query, function(address) {
  google_geocode(address = address,
                 key = key)  ## use simplify = FALSE to return JSON
})

## The formatted addresses of the first query live under
## results$formatted_address of its response:
res[[1]]$results$formatted_address
# [1] "Washington, DC, USA"       "Washington, UT, USA"       "Washington, VA 22747, USA" "Washington, IA 52353, USA"
# [5] "Washington, GA 30673, USA" "Washington, PA 15301, USA"

## The same extraction for every response, returned as a list
## (one character vector per query):
lapply(res, function(response) {
  matches <- response$results
  matches$formatted_address
})

# [[1]]
# [1] "Washington, DC, USA"       "Washington, UT, USA"       "Washington, VA 22747, USA" "Washington, IA 52353, USA"
# [5] "Washington, GA 30673, USA" "Washington, PA 15301, USA"
# 
# [[2]]
# [1] "Frankfurt, Germany"

## Put the addresses back next to the original queries: one data.frame per
## query, with one row per formatted address.
## seq_along() is used instead of 1:length(res) so that an empty `res`
## produces no iterations (1:0 would iterate over the indices 1 and 0).
lst <- lapply(seq_along(res), function(i) {
  addresses <- res[[i]]$results$formatted_address
  if (length(addresses) == 0) {
    ## No address came back for this query: keep the query with a missing
    ## address, since data.frame() cannot pair one query with zero rows.
    addresses <- NA_character_
  }
  data.frame(query = dat[i, "query"],
             formatted_address = addresses)
})

## Stack the per-query pieces into a single long-format table.
data.table::rbindlist(lst)
#                 query         formatted_address
# 1:    Washington, USA       Washington, DC, USA
# 2:    Washington, USA       Washington, UT, USA
# 3:    Washington, USA Washington, VA 22747, USA
# 4:    Washington, USA Washington, IA 52353, USA
# 5:    Washington, USA Washington, GA 30673, USA
# 6:    Washington, USA Washington, PA 15301, USA
# 7: Frankfurt, Germany        Frankfurt, Germany