Javascript 使用 R 抓取网页(rvest)时如何接受 JS 网页上的条款和条件
我还有一个关于网页抓取的问题。我正在使用Rvest试图从警察报告网站上抓取一些数据。我一直在四处寻找,但似乎找不到绕过网站“我同意”按钮的“接受条款和条件”的方法。我如何提交“我同意”才能访问该网站? 网站=Javascript 使用R-to-Scrape(rvest)接受JS网页上的条款和条件,javascript,r,web-scraping,rvest,Javascript,R,Web Scraping,Rvest,我还有一个关于网页抓取的问题。我正在使用Rvest试图从警察报告网站上抓取一些数据。我一直在四处寻找,但似乎找不到绕过网站“我同意”按钮的“接受条款和条件”的方法。我如何提交“我同意”才能访问该网站? 网站= require(httr) 需要(XML) 图书馆(RCurl) 图书馆(rvest) url您需要了解如何让selenium在您的系统上运行,以及如何让remoteDr(…)调用继续进行。之后,这将帮助您开始: library(seleniumPipes) library(rvest) l
# Packages loaded by the question's original attempt.
# library() is used instead of require() so that a missing package raises an
# error immediately rather than returning FALSE and failing later.
library(httr)
library(XML)
library(RCurl)
library(rvest)
url 您需要先让 Selenium 在您的系统上运行起来,并让 remoteDr(…) 调用能够成功连接。
完成之后,下面的代码可以帮助您入门:
library(seleniumPipes)
library(rvest)
library(dplyr)
library(stringi)
library(purrr)
# Drive a browser through Selenium so the disclaimer page can be accepted by
# clicking its submit button, then run a date-range search and scrape the
# results table into a tibble.
#
# `remoteDr(...)` is a placeholder: supply the connection arguments for your
# own running Selenium server.
remDr <- remoteDr(...)
remDr %>% go("http://www.wspdp2c.org/Summary_Disclaimer.aspx")

# Accept the terms and conditions by clicking the page's submit input.
submit <- remDr %>% findElement("xpath", ".//input[@type='submit']")
submit %>% elementClick()

# Fill in the "from" date.
from_date <- remDr %>%
  findElement("xpath", ".//input[@name='MasterPage$mainContent$txtDateFrom2']")
from_date %>% elementClear()
from_date %>% elementSendKeys("12/22/2016")

# Fill in the "to" date. The element must be located before it is used; the
# original script sent keys to `to_date` one line before assigning it, which
# fails with "object 'to_date' not found" in a fresh session.
to_date <- remDr %>%
  findElement("xpath", ".//input[@name='MasterPage$mainContent$txtDateTo2']")
to_date %>% elementClear()
# esc clears the popup calendar
to_date %>% elementSendKeys("12/23/2016", selKeys$escape)

# Run the search.
search <- remDr %>% findElement("class name", "ui-icon-search")
search %>% elementClick()

# Grab the rendered page and locate the results table.
pg <- remDr %>% getPageSource()
tab <- html_nodes(pg, "table.DataGridText")

# NOTE(review): `[1:9]` keeps only the first nine cells of each column; it is
# hard-coded for this example's result set -- adjust for other searches.
occurred <- html_nodes(tab, xpath = ".//td[2]")[1:9] %>%
  html_text() %>%
  as.POSIXct(format = "%m/%d/%Y %H:%M")

incident_or_arrest <- html_nodes(tab, xpath = ".//td[3]")[1:9] %>%
  html_text()

case_text <- html_nodes(tab, xpath = ".//td[4]")[1:9] %>%
  html_text() %>%
  stri_trim_both()

# The fourth column is matched against four alternative patterns
# (case number / primary offense / arrestee / charge). For each cell, the
# capture-group columns are kept, NA (non-matching) groups are discarded, and
# the surviving values are named according to whether the first one contains
# letters (arrestee + charge) or not (case number + primary offense).
case_or_arrestee <- stri_match_all_regex(
  case_text,
  paste0(c("Case #: ([[:digit:]]+)",
           "Primary Offense: ([[:print:]]+)",
           "Arrestee: ([[:print:]]+)",
           "Charge: ([[:print:]]+)"), collapse = "|")
) %>%
  map(~apply(.[, 2:5], 1, discard, is.na)) %>%
  map_df(function(x) {
    x <- as.list(x)
    if (stri_detect_regex(x[[1]], "[[:alpha:]]")) {
      setNames(x, c("arrestee", "charge"))
    } else {
      setNames(x, c("case_number", "primary_offense"))
    }
  })

location <- html_nodes(tab, xpath = ".//td[5]")[1:9] %>%
  html_text()

# tibble() replaces the deprecated data_frame(); dplyr re-exports it.
tibble(occurred, incident_or_arrest, location) %>%
  bind_cols(case_or_arrestee) %>%
  glimpse()
## Observations: 9
## Variables: 7
## $ occurred <dttm> 2016-12-22 00:00:00, 2016-12-22 00:00:00, 2016-12-22 00:0...
## $ incident_or_arrest <chr> "Incident", "Incident", "Arrest", "Incident", "Incident", ...
## $ location <chr> "2600-BLK TODDLER PLACE DR", "300-BLK ALSPAUGH DR", ...
## $ case_number <chr> "1667276", "1667273", NA, "1667249", "1667248", NA, NA, "1...
## $ primary_offense <chr> "BREAKING & ENTERING WITH FORCE", "MALICIOUS INJURY TO PRO...
## $ arrestee <chr> NA, NA, "THOMAS, KERRY MARTIN", NA, NA, "LOZANO, MIGUEL AR...
## $ charge <chr> NA, NA, "PANHANDLING W/ NO PRIVLEDGE LICENSE", NA, NA, "AN...
库(seleniumPipes)
图书馆(rvest)
图书馆(dplyr)
图书馆(stringi)
图书馆(purrr)
remDr%go(“http://www.wspdp2c.org/Summary_Disclaimer.aspx")
提交%findElement(“xpath”,“//input[@type='submit']”)
提交%>%elementClick()
from_date%findElement(“xpath”,“//input[@name='MasterPage$mainContent$txtDateFrom2']”)
from_date%>%elementClear()
自年月日起%>%elementSendKeys(“12/22/2016”)
截止日期%>%elementSendKeys(“12/23/2016”,selKeys$escape)#esc清除弹出的calednar
截止日期%findElement(“xpath”,“//input[@name='MasterPage$mainContent$txtDateTo2']”)
截止日期%>%elementClear()
截止日期%>%elementSendKeys(“2016年12月23日”,selKeys$escape)
搜索%findElement(“类名”,“ui图标搜索”)
搜索%>%element单击()
remDr%>%getPageSource()->pg
html_节点(pg,“table.DataGridText”)->tab
html_节点(选项卡,xpath=“../td[2]”[1:9]]>%
html_text()%>%
as.POSIXct(format=“%m/%d/%Y%H:%m”)->发生
html_节点(选项卡,xpath=“../td[3]”[1:9]]>%
html_text()->事件或逮捕
html_节点(选项卡,xpath=“../td[4]”[1:9]]>%
html_text()%>%
stri_trim_both()->案例或被捕者
严格匹配所有规则(案例或被捕者),
paste0(c(“大小写:([:数字:]]+)”,
“主犯:([:print:][]+)”,
“被逮捕者:([:打印:]+)”,
“费用:([:打印:]+)”,折叠=“|”)%>%
映射(~apply([2:5],1,discard,is.na))%>%
地图测向(功能(x){
x案件或被逮捕者
html_节点(选项卡,xpath=“../td[5]”[1:9]]>%
html_text()->位置
数据帧(发生、事件或停止、位置)%>%
捆绑(案件或被逮捕人)%>%
一瞥
##意见:9
##变量:7
##$2016-12-22 00:00:00,2016-12-22 00:00:00,2016-12-22 00:00。。。
##$incident_或_逮捕“事件”、“事件”、“逮捕”、“事件”、“事件”。。。
##$location“2600-BLK幼儿区DR”、“300-BLK Alspagh DR”、。。。
##$case_编号“1667276”,“1667273”,NA,“1667249”,“1667248”,NA,NA,“1。。。
##$primary_违法行为“强行闯入”、“恶意伤害专业人士”。。。
##$arrestee不,不,“托马斯,克里·马丁”,不,不,“洛扎诺,米格尔·阿尔。。。
##$charge不适用,不适用,“无特权许可证的乞讨”,不适用,不适用,“和。。。
这是一个由 SharePoint 驱动的网站。只需使用 Selenium 或 seleniumPipes 即可。这非常完美。谢谢!
> wspd.form
[[1]]
<form> 'Form1' (POST ./Summary_Disclaimer.aspx)
<input hidden> '_popupBlockerExists': true
<input hidden> '__VIEWSTATE': /wEPDwUKLTUwMDM5Nzk4OA9....
<input hidden> '__VIEWSTATEGENERATOR': 27903AD3
<input hidden> '__EVENTVALIDATION': /wEdAAky7XCY2Cjbe0DHcJ....
<select> 'ctl00$MasterPage$DDLSiteMap1$ddlQuickLinks' [1/7]
<input submit> 'ctl00$MasterPage$mainContent$CenterColumnContent$btnContinue': I Agree
library(seleniumPipes)
library(rvest)
library(dplyr)
library(stringi)
library(purrr)
# Drive a browser through Selenium so the disclaimer page can be accepted by
# clicking its submit button, then run a date-range search and scrape the
# results table into a tibble.
#
# `remoteDr(...)` is a placeholder: supply the connection arguments for your
# own running Selenium server.
remDr <- remoteDr(...)
remDr %>% go("http://www.wspdp2c.org/Summary_Disclaimer.aspx")

# Accept the terms and conditions by clicking the page's submit input.
submit <- remDr %>% findElement("xpath", ".//input[@type='submit']")
submit %>% elementClick()

# Fill in the "from" date.
from_date <- remDr %>%
  findElement("xpath", ".//input[@name='MasterPage$mainContent$txtDateFrom2']")
from_date %>% elementClear()
from_date %>% elementSendKeys("12/22/2016")

# Fill in the "to" date. The element must be located before it is used; the
# original script sent keys to `to_date` one line before assigning it, which
# fails with "object 'to_date' not found" in a fresh session.
to_date <- remDr %>%
  findElement("xpath", ".//input[@name='MasterPage$mainContent$txtDateTo2']")
to_date %>% elementClear()
# esc clears the popup calendar
to_date %>% elementSendKeys("12/23/2016", selKeys$escape)

# Run the search.
search <- remDr %>% findElement("class name", "ui-icon-search")
search %>% elementClick()

# Grab the rendered page and locate the results table.
pg <- remDr %>% getPageSource()
tab <- html_nodes(pg, "table.DataGridText")

# NOTE(review): `[1:9]` keeps only the first nine cells of each column; it is
# hard-coded for this example's result set -- adjust for other searches.
occurred <- html_nodes(tab, xpath = ".//td[2]")[1:9] %>%
  html_text() %>%
  as.POSIXct(format = "%m/%d/%Y %H:%M")

incident_or_arrest <- html_nodes(tab, xpath = ".//td[3]")[1:9] %>%
  html_text()

case_text <- html_nodes(tab, xpath = ".//td[4]")[1:9] %>%
  html_text() %>%
  stri_trim_both()

# The fourth column is matched against four alternative patterns
# (case number / primary offense / arrestee / charge). For each cell, the
# capture-group columns are kept, NA (non-matching) groups are discarded, and
# the surviving values are named according to whether the first one contains
# letters (arrestee + charge) or not (case number + primary offense).
case_or_arrestee <- stri_match_all_regex(
  case_text,
  paste0(c("Case #: ([[:digit:]]+)",
           "Primary Offense: ([[:print:]]+)",
           "Arrestee: ([[:print:]]+)",
           "Charge: ([[:print:]]+)"), collapse = "|")
) %>%
  map(~apply(.[, 2:5], 1, discard, is.na)) %>%
  map_df(function(x) {
    x <- as.list(x)
    if (stri_detect_regex(x[[1]], "[[:alpha:]]")) {
      setNames(x, c("arrestee", "charge"))
    } else {
      setNames(x, c("case_number", "primary_offense"))
    }
  })

location <- html_nodes(tab, xpath = ".//td[5]")[1:9] %>%
  html_text()

# tibble() replaces the deprecated data_frame(); dplyr re-exports it.
tibble(occurred, incident_or_arrest, location) %>%
  bind_cols(case_or_arrestee) %>%
  glimpse()
## Observations: 9
## Variables: 7
## $ occurred <dttm> 2016-12-22 00:00:00, 2016-12-22 00:00:00, 2016-12-22 00:0...
## $ incident_or_arrest <chr> "Incident", "Incident", "Arrest", "Incident", "Incident", ...
## $ location <chr> "2600-BLK TODDLER PLACE DR", "300-BLK ALSPAUGH DR", ...
## $ case_number <chr> "1667276", "1667273", NA, "1667249", "1667248", NA, NA, "1...
## $ primary_offense <chr> "BREAKING & ENTERING WITH FORCE", "MALICIOUS INJURY TO PRO...
## $ arrestee <chr> NA, NA, "THOMAS, KERRY MARTIN", NA, NA, "LOZANO, MIGUEL AR...
## $ charge <chr> NA, NA, "PANHANDLING W/ NO PRIVLEDGE LICENSE", NA, NA, "AN...