Scraping a set of Google+ URLs in R with the purrr package
I am working on a web-scraping project that aims to extract Google+ reviews from a set of children's hospitals. My approach is as follows:

1) Define a list of Google+ URLs to navigate to for scraping. The URLs sit in a dataframe along with other variables that identify each hospital.
2) Scrape the review text, number of stars, and post time for every review associated with a given URL.
3) Save these elements in a dataframe, and name that dataframe after another variable in the same dataframe that corresponds to the URL.
4) Move on to the next URL... and so on, until all URLs have been scraped.

Currently, the code can extract from a single URL. I tried to create a function using map from the purrr package. However, it does not seem to work. What am I doing wrong?
Here is my attempt, with comments explaining the purpose of each step:
#Load the necessary libraries
devtools::install_github("ropensci/RSelenium")
library(purrr)
library(dplyr)
library(stringr)
library(rvest)
library(xml2)
library(RSelenium)
#To avoid any SSL error messages
library(httr)
set_config( config( ssl_verifypeer = 0L ) )
Defining the URL dataframe
#Now to define the dataframe with the urls
urls_df =data.frame(Name=c("CHKD","AIDHC")
,ID=c("AAWZ12","AAWZ13")
,GooglePlus_URL=c("https://www.google.co.uk/search?ei=fJUKW9DcJuqSgAbPsZ3gDQ&q=Childrens+Hospital+of+the+Kings+Daughter+&oq=Childrens+Hospital+of+the+Kings+Daughter+&gs_l=psy-ab.3..0i13k1j0i22i10i30k1j0i22i30k1l7.8445.8445.0.9118.1.1.0.0.0.0.144.144.0j1.1.0....0...1c.1.64.psy-ab..0.1.143....0.qDMr7IDA-uA#lrd=0x89ba9869b87f1a69:0x384861b1e3a4efd3,1,,,",
"https://www.google.co.uk/search?q=Alfred+I+DuPont+Hospital+for+Children&oq=Alfred+I+DuPont+Hospital+for+Children&aqs=chrome..69i57.341j0j8&sourceid=chrome&ie=UTF-8#lrd=0x89c6fce9425c92bd:0x80e502f2175fb19c,1,,,"
))
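As a side note on the dataframe-driven design: instead of passing only the URL and re-filtering urls_df inside the function, purrr::map2() can walk the URL and the hospital name in parallel, so each call already knows which hospital it is working on. A minimal sketch with hypothetical placeholder values (no scraping involved):

```r
library(purrr)

# Toy stand-ins for the Name and GooglePlus_URL columns of urls_df
hospital_names <- c("CHKD", "AIDHC")
hospital_urls  <- c("https://example.com/chkd", "https://example.com/aidhc")

# map2() iterates both vectors together, so each call receives the URL
# and its matching hospital name -- no filter() lookup needed
labelled <- map2(hospital_urls, hospital_names,
                 function(url, name) paste0(name, ": ", url))

labelled[[1]]   # "CHKD: https://example.com/chkd"
```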
Creating the function
extract_google_review=function(googleplus_urls) {
#Opens a Chrome session
rmDr = rsDriver(browser = "chrome", check = FALSE)
myclient= rmDr$client
#Creates a sub-dataframe for the filtered hospital, which I will later use to name the dataframe
urls_df_sub=urls_df %>% filter(GooglePlus_URL %in% googleplus_urls)
#Navigate to the url
myclient$navigate(googleplus_urls)
#click on the snippet to switch focus----------
webEle <- myclient$findElement(using = "css",value = ".review-snippet")
webEle$clickElement()
# Save page source
pagesource= myclient$getPageSource()[[1]]
#simulate scroll down for several times-------------
count=read_html(pagesource) %>%
html_nodes(".p13zmc") %>%
html_text()
#Stores the number of reviews for the url, so we know how many times to scroll down
scroll_down_times=count %>%
str_sub(1,nchar(count)-5) %>%
as.numeric()
for (i in 1:scroll_down_times) {
webEle$sendKeysToActiveElement(sendKeys = list(key="page_down"))
#the content needs time to load; wait 1.2 seconds every 5 scroll-downs
if(i%%5==0){
Sys.sleep(1.2)
}
}
#loop and simulate clicking on all "click on more" elements-------------
webEles <- myclient$findElements(using = "css",value = ".review-more-link")
for(webEle in webEles){
tryCatch(webEle$clickElement(),error=function(e){print(e)})
}
pagesource= myclient$getPageSource()[[1]]
#this should get the full review, including translation and original text
reviews=read_html(pagesource) %>%
html_nodes(".review-full-text") %>%
html_text()
#number of stars
stars <- read_html(pagesource) %>%
html_node(".review-dialog-list") %>%
html_nodes("g-review-stars > span") %>%
html_attr("aria-label")
#time posted
post_time <- read_html(pagesource) %>%
html_node(".review-dialog-list") %>%
html_nodes(".dehysf") %>%
html_text()
#Consolidating everything into a dataframe
n_keep = min(length(reviews), length(stars), length(post_time))
reviews = head(reviews, n_keep)
stars = head(stars, n_keep)
post_time = head(post_time, n_keep)
reviews_df=data.frame(review=reviews,rating=stars,time=post_time)
#Assign the dataframe a name based on the value in column 'Name' of the dataframe urls_df, defined above
df_name <- tolower(urls_df_sub$Name)
if(exists(df_name)) {
assign(df_name, unique(rbind(get(df_name), reviews_df)))
} else {
assign(df_name, reviews_df)
}
} #End function
#Now that the function is defined, it is time to create a vector of urls and feed this vector into the function
googleplus_urls=urls_df$GooglePlus_URL
googleplus_urls %>% map(extract_google_review)
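One likely culprit, independent of the Selenium parts: assign(df_name, reviews_df) inside a function writes to the function's local environment, so the dataframes vanish when the function returns. A minimal sketch (no scraping; scrape_one() is a hypothetical stand-in for extract_google_review()) showing the problem and the purrr-idiomatic alternative of returning the dataframe and collecting the results in a named list:

```r
library(purrr)

broken <- function(name) {
  df <- data.frame(review = "stub", rating = "5 stars", time = "a week ago")
  assign(name, df)   # assigns in the function's own environment only
}
broken("chkd")
exists("chkd")       # FALSE: "chkd" never reached the calling environment

# Idiomatic alternative: return the dataframe and let map() collect results
scrape_one <- function(name) {
  data.frame(review = paste("review for", name),
             rating = "5 stars",
             time   = "a week ago")
}
results <- set_names(map(c("chkd", "aidhc"), scrape_one),
                     c("chkd", "aidhc"))
results$chkd   # one 3-column dataframe per hospital
```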
There seems to be an error in the function that prevents it from scraping the data and storing it in separate dataframes as expected.

My expected output:

Two dataframes, each with 3 columns.
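For concreteness, this is the shape I expect for each hospital (the values here are placeholders, not real reviews):

```r
# Placeholder illustration of one expected per-hospital dataframe
chkd <- data.frame(review = c("Great staff", "Long wait"),
                   rating = c("5 stars", "3 stars"),
                   time   = c("a week ago", "2 months ago"))
dim(chkd)   # 2 rows, 3 columns
```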
Any suggestions on how to improve this would be greatly appreciated.

You would be better off using a for loop instead of map, because you need Selenium to open separate windows for the web scraping. You could look into Selenium Grid for parallel scraping, although I don't think there is an R version of that at the moment.

OK, so this isn't currently possible in R?

Check this: I think I was wrong; it can be done with other packages. The link above should work.

I tried using foreach, as you suggested in the link above, but it still doesn't seem to work.
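For reference, the plain for-loop pattern the first comment suggests could look roughly like this, again with a hypothetical scrape_one() standing in for the real Selenium scraper. Each iteration's dataframe is kept in a list and stacked at the end:

```r
# Hypothetical stand-in for the Selenium scraper (base R only)
scrape_one <- function(url) {
  data.frame(review = paste("stub review from", url),
             rating = "4 stars",
             time   = "a month ago")
}

urls <- c("https://example.com/a", "https://example.com/b")

# Collect one dataframe per URL, then rbind them into a single table
results <- vector("list", length(urls))
for (i in seq_along(urls)) {
  results[[i]] <- scrape_one(urls[i])
}
all_reviews <- do.call(rbind, results)
nrow(all_reviews)   # 2
```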