R 爬取网页数据时的错误处理(出错时返回 NA 行)
我正在从网上下载天气数据。为此,我创建了一个简单的 for 循环,它把包含数据的数据框追加到列表中(一个城市一个列表)。它工作正常,但如果某天没有数据(网页上没有包含该日期天气状况的表格),就会返回一个错误并中断循环,例如对某个 URL 会报:

Error in Lublin[i] <- url4 %>% read_html() %>% html_nodes(xpath = '//*[@id="obsTable"]') %>% html_table() : replacement has length zero(替换长度为零)

我的问题是:1)当错误发生时,如何加入判断逻辑,使其返回一行 NA(13 个观测值)并照常放入列表中?2)有没有比 for 循环更快的方法下载所有数据?

(标签:r、error-handling、web-scraping)我的代码:
library(rvest)

## Scrape the daily Weather Underground observation table for 16 Polish
## cities over 2015-2016. One list per city; element i holds the table for
## dates[i], or a single row of 13 NA values when no table exists that day
## (instead of aborting the whole loop, as the original code did).

## Dates to scrape. Renamed from `c`, which masked base::c.
dates <- seq(as.Date("2015/1/1"), as.Date("2016/12/31"), "days")
c <- dates  # kept for backward compatibility with code that used `c`; avoid masking base::c

## sprintf() templates, one per city: the three %d slots are filled with
## year/month/day; %% is a literal percent sign in the final URL.
url_templates <- c(
  Warszawa  = "https://www.wunderground.com/history/airport/EPWA/%d/%d/%d/DailyHistory.html?req_city=Warszawa&req_state=MZ&req_statename=Poland",
  Wroclaw   = "https://www.wunderground.com/history/airport/EPWR/%d/%d/%d/DailyHistory.html?req_city=Wrocław&req_statename=Poland",
  Bydgoszcz = "https://www.wunderground.com/history/airport/EPBY/%d/%d/%d/DailyHistory.html?req_city=Bydgoszcz&req_statename=Poland",
  Lublin    = "https://www.wunderground.com/history/airport/EPLB/%d/%d/%d/DailyHistory.html?req_city=Abramowice%%20Koscielne&req_statename=Poland",
  Gorzow    = "https://www.wunderground.com/history/airport/EPZG/%d/%d/%d/DailyHistory.html?req_city=Gorzow%%20Wielkopolski&req_statename=Poland",
  Lodz      = "https://www.wunderground.com/history/airport/EPLL/%d/%d/%d/DailyHistory.html?req_city=Lodz&req_statename=Poland",
  Krakow    = "https://www.wunderground.com/history/airport/EPKK/%d/%d/%d/DailyHistory.html?req_city=Krakow&req_statename=Poland",
  Opole     = "https://www.wunderground.com/history/airport/EPWR/%d/%d/%d/DailyHistory.html?req_city=Opole&req_statename=Poland",
  Rzeszow   = "https://www.wunderground.com/history/airport/EPRZ/%d/%d/%d/DailyHistory.html?req_city=Rzeszow&req_statename=Poland",
  Bialystok = "https://www.wunderground.com/history/airport/UMMG/%d/%d/%d/DailyHistory.html?req_city=Dojlidy&req_statename=Poland",
  Gdansk    = "https://www.wunderground.com/history/airport/EPGD/%d/%d/%d/DailyHistory.html?req_city=Gdansk&req_statename=Poland",
  Katowice  = "https://www.wunderground.com/history/airport/EPKM/%d/%d/%d/DailyHistory.html?req_city=Katowice&req_statename=Poland",
  Kielce    = "https://www.wunderground.com/history/airport/EPKT/%d/%d/%d/DailyHistory.html?req_city=Chorzow%%20Batory&req_statename=Poland",
  Olsztyn   = "https://www.wunderground.com/history/airport/EPSY/%d/%d/%d/DailyHistory.html",
  Poznan    = "https://www.wunderground.com/history/airport/EPPO/%d/%d/%d/DailyHistory.html?req_city=Poznan%%20Old%%20Town&req_statename=Poland",
  Szczecin  = "https://www.wunderground.com/history/airport/EPSC/%d/%d/%d/DailyHistory.html?req_city=Szczecin&req_statename=Poland"
)

## Fallback row: 13 NA observations, used when a day has no weather table.
na_row <- as.data.frame(matrix(NA, nrow = 1, ncol = 13))

## Fetch one day's observation table from one URL.
## @param url  Full DailyHistory URL.
## @return The scraped data frame, or `na_row` when the request fails or the
##   page contains no element with id "obsTable".
scrape_day <- function(url) {
  tables <- tryCatch(
    url %>%
      read_html() %>%
      html_nodes(xpath = '//*[@id="obsTable"]') %>%
      html_table(),
    error = function(e) list()  # network/parse failure -> treat as "no table"
  )
  if (length(tables) == 0) na_row else tables[[1]]
}

## Preallocate one result list per city (no growing inside the loop).
results <- lapply(url_templates, function(tpl) vector("list", length(dates)))
date <- as.list(dates)

for (i in seq_along(dates)) {
  y <- as.numeric(format(dates[i], "%Y"))
  m <- as.numeric(format(dates[i], "%m"))
  d <- as.numeric(format(dates[i], "%d"))
  for (city in names(url_templates)) {
    results[[city]][[i]] <- scrape_day(sprintf(url_templates[[city]], y, m, d))
  }
}

## Keep the original per-city variable names for downstream code.
Warszawa  <- results[["Warszawa"]]
Wroclaw   <- results[["Wroclaw"]]
Bydgoszcz <- results[["Bydgoszcz"]]
Lublin    <- results[["Lublin"]]
Gorzow    <- results[["Gorzow"]]
Lodz      <- results[["Lodz"]]
Krakow    <- results[["Krakow"]]
Opole     <- results[["Opole"]]
Rzeszow   <- results[["Rzeszow"]]
Bialystok <- results[["Bialystok"]]
Gdansk    <- results[["Gdansk"]]
Katowice  <- results[["Katowice"]]
Kielce    <- results[["Kielce"]]
Olsztyn   <- results[["Olsztyn"]]
Poznan    <- results[["Poznan"]]
Szczecin  <- results[["Szczecin"]]
既然所有这些 URL 本质上都是一样的,只有细微且非常可预测的差异,为什么不构造一个 URL 数组,把所有部分拼接在一起,再循环处理呢?
这是我所指的一个例子
library(rvest)
library(stringr)
#create a master dataframe to store all of the results
complete <- data.frame()
yearsVector <- c("2010", "2011", "2012", "2013", "2014", "2015")
#position is not needed since all of the info is stored on the page
#positionVector <- c("qb", "rb", "wr", "te", "ol", "dl", "lb", "cb", "s")
positionVector <- c("qb")
for (i in 1:length(yearsVector)) {
for (j in 1:length(positionVector)) {
# create a url template
URL.base <- "http://www.nfl.com/draft/"
URL.intermediate <- "/tracker?icampaign=draft-sub_nav_bar-drafteventpage-tracker#dt-tabs:dt-by-position/dt-by-position-input:"
#create the dataframe with the dynamic values
URL <- paste0(URL.base, yearsVector[i], URL.intermediate, positionVector[j])
#print(URL)
#read the page - store the page to make debugging easier
page <- read_html(URL)
library(rvest)
library(stringr)
# 创建一个主数据框来存储所有结果
complete <- data.frame()
(回答)首先,我有点忘乎所以,这个答案比最初计划的要长一些。我决定帮你解决三个问题:识别有效 URL 时的代码重复问题;获取这些 URL 的相关信息时的代码重复问题;以及抓取时的错误处理问题。
现在,我们开始,您希望以一种更简单的方式获取您想要获取的链接:
library(httr)
library(rvest)
## Every day in 2015-2016, rendered as "YYYY/MM/DD" for the URL path.
day_seq <- seq(as.Date("2015/1/1"), as.Date("2016/12/31"), "days")
dates <- gsub(pattern = "-", replacement = "/", x = day_seq)

## ICAO station codes for the 16 cities (same order as the question).
abbreviations <- c("EPWA", "EPWR", "EPBY", "EPLB", "EPZG", "EPLL", "EPKK",
                   "EPWR", "EPRZ", "UMMG", "EPGD", "EPKM", "EPKT",
                   "EPSY", "EPPO", "EPSC")

## One sub-list per station: the full DailyHistory URL for every date.
station_roots <- paste0("https://www.wunderground.com/history/airport/",
                        abbreviations, "/")
links <- lapply(station_roots, function(prefix) {
  paste0(prefix, dates, "/DailyHistory.html")
})
运行此函数应该适用于我们确定的所有URL,包括您在问题中发布的错误URL
# Demo transcript from the answer. Requires `links` (built above) and
# `get_table` (defined further down the page) to be in scope, and performs
# live HTTP requests against wunderground.com.
# A little test-run, to see if your faulty URL works:
testlink <- "https://www.wunderground.com/history/airport/EPLB/2015/12/25/DailyHistory.html?req_city=Abramowice%20Koscielne&req_statename=Poland"
# NOTE(review): deliberately overwrites the 5th Warsaw link with the Lublin
# URL that has no weather table, to exercise the fallback branch.
links[[1]][5] <- testlink
tested <- sapply(links[[1]][1:6], get_table, USE.NAMES = FALSE)
# Console output observed by the answerer:
# [1] "Just scraped Warsaw, Poland Thursday, January 1, 2015"
# [1] "Just scraped Warsaw, Poland Friday, January 2, 2015"
# [1] "Just scraped Warsaw, Poland Saturday, January 3, 2015"
# [1] "Just scraped Warsaw, Poland Sunday, January 4, 2015"
# [1] "No weather table available for this day"
# [1] "Just scraped Warsaw, Poland Tuesday, January 6, 2015"
#进行一次小测试,以查看错误的URL是否有效:
(评论)您可以使用 tryCatch。
(评论)一个提示:不要把 c 用作变量名,因为它是 R 中用来创建向量的函数。你的代码也有相当多的重复;可以写一个函数,把需要变化的部分作为参数传入。至于错误处理,我同意 @docendo discimus 的建议。
(评论)@docendo discimus 我应该在所有 `url %>% read_html() %>% html_nodes(xpath = '//*[@id="obsTable"]') %>% html_table()`
这样的代码段里都使用 tryCatch 吗?
部分代码吗?@ErikSchutte谢谢我会尽量不重复代码:)谢谢!这是伟大的:)我设法摆脱了没有数据错误,并简化了我的代码,但有一些问题,找出哪些输出是哪个。。。这简直太棒了:)但是当循环从一个元素转到另一个元素时会出现一个小问题[1]“刚刚刮到波兰华沙,2016年12月28日星期三”[1]“刚刚刮到波兰华沙,2016年12月29日星期四”[1]“刚刚刮到波兰华沙,2016年12月30日星期五”[1]“2016年12月31日,星期六,波兰华沙”城市错误[[i]]是的,你说得对。我通过创建一个带有子列表的空列表来修复它
## Get the weather report & name.
## Scrape one Weather Underground "DailyHistory" page.
## @param link  URL of the page to scrape.
## @return A one-element list holding the day's observation table, named
##   "<city> <date>"; or a message string when the page or table is missing.
get_table <- function(link){
# Get the html from a link; try() keeps one bad URL from aborting the run.
html <- try(link %>%
read_html(), silent = TRUE)
# BUG FIX: the original tested `"try-error)" %in% class(html)` — note the
# stray ")" inside the string — which never matches, so a failed download
# was not skipped and the next pipe step errored instead.
if(inherits(html, "try-error")){
print("HTML not found, skipping to next link")
return("HTML not found, skipping to next link")
}
# Get the weather table from that page; html_table() returns an empty
# list when no node with id "obsTable" exists.
weather_table <- html %>%
html_nodes(xpath='//*[@id="obsTable"]') %>%
html_table()
if(length(weather_table) == 0){
print("No weather table available for this day")
return("No weather table available for this day")
}
# Use info from the html to get the city, for naming the list
region <- html %>%
html_nodes(xpath = '//*[@id="location"]') %>%
html_text()
# Keep only the text before the first digit, then strip layout whitespace.
region <- strsplit(region, "[1-9]")[[1]][1]
region <- gsub("\n", "", region)
region <- gsub("\t\t", "", region)
# Use info from the html to get the date, and name the list
which_date <- html %>%
html_nodes(xpath = '//*[@class="history-date"]') %>%
html_text()
city_date <- paste0(region, which_date)
# Name the output
names(weather_table) <- city_date
print(paste0("Just scraped ", city_date))
return(weather_table)
}
# Demo transcript (repeated on the page). Requires `links` and `get_table`
# from above, and performs live HTTP requests against wunderground.com.
# A little test-run, to see if your faulty URL works:
testlink <- "https://www.wunderground.com/history/airport/EPLB/2015/12/25/DailyHistory.html?req_city=Abramowice%20Koscielne&req_statename=Poland"
# NOTE(review): deliberately replaces the 5th Warsaw link with the Lublin
# URL that has no weather table, to exercise the fallback branch.
links[[1]][5] <- testlink
tested <- sapply(links[[1]][1:6], get_table, USE.NAMES = FALSE)
# Console output observed by the answerer:
# [1] "Just scraped Warsaw, Poland Thursday, January 1, 2015"
# [1] "Just scraped Warsaw, Poland Friday, January 2, 2015"
# [1] "Just scraped Warsaw, Poland Saturday, January 3, 2015"
# [1] "Just scraped Warsaw, Poland Sunday, January 4, 2015"
# [1] "No weather table available for this day"
# [1] "Just scraped Warsaw, Poland Tuesday, January 6, 2015"
# For all sublists in links (corresponding to cities)
# scrape all links (corresponding to days).
# Preallocate one empty sub-list per station code so results keep their slot.
city <- rep(list(list()), length(abbreviations))
# seq_along() is safe if `links` were empty; 1:length(links) would yield 1:0
# and run the loop body twice with invalid indices.
for(i in seq_along(links)){
city[[i]] <- sapply(links[[i]], get_table, USE.NAMES = FALSE)
}