使用带有可变标记的rvest进行刮取_R_Xml_Web Scraping_Rvest

使用rvest抓取带有可变标签的网页

r xml web-scraping

使用带有可变标记的rvest进行刮取,r,xml,web-scraping,rvest,R,Xml,Web Scraping,Rvest,我的问题我正在尝试从此URL中删除文档： url <- https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php?lang=de&type=simple_query&query_words=&lang=de&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017&x=0&y=0

我的问题

我正在尝试从以下URL中抓取文档：

# Address of the search-results page to scrape. It must be a quoted string:
# a bare URL is not valid R syntax. The query string carries the search
# parameters, including from_date=01.01.2017 and to_date=05.01.2017.
url <- paste0(
  "https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php",
  "?lang=de&type=simple_query&query_words=&lang=de",
  "&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017",
  "&x=0&y=0"
)

但是，我不知道如何编写代码来处理可变的节点（并非每个条目都包含相同的节点）。

找到了解决方案：

# Scrape the result list of the search page into a data frame (`xdf`).
library(rvest) # read_html(), html_nodes(), html_attr(), html_text(), %>%
library(purrr) # map_df()

# Read the HTML. The argument must be the bare URL: with a leading "url <- "
# inside the string it is no longer a URL and the page is never fetched.
pg <- read_html(paste0(
  "https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php",
  "?lang=de&type=simple_query&query_words=&lang=de",
  "&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017",
  "&x=0&y=0"
))

# Replace a length-0 result (the selector matched nothing) with one missing
# string, so every field contributes a value to its row. A typed NA keeps the
# column character even when no entry has the field.
na_if_empty <- function(x) {
  if (length(x) == 0) NA_character_ else x
}

# Value of attribute `name` on the nodes matching `css` inside `node`.
pick_attr <- function(node, css, name) {
  na_if_empty(html_attr(html_nodes(node, css), name))
}

# Text of the nodes matching `css` inside `node`; `trim` is passed on to
# html_text().
pick_text <- function(node, css, trim = FALSE) {
  na_if_empty(html_text(html_nodes(node, css), trim = trim))
}

xdf <- pg %>%
  html_nodes("div.ranklist_content ol li") %>% # select enclosing nodes
  # iterate over each entry, pull out the desired parts and row-bind them
  map_df(~ list(
    link = pick_attr(.x, ".rank_title a", "href"),
    title = pick_text(.x, ".rank_title a"),
    publication_link = pick_attr(.x, ".published_info a", "href"),
    publication = pick_text(.x, ".published_info a"),
    court = pick_text(.x, ".rank_data .court", trim = TRUE),
    subject = pick_text(.x, ".rank_data .subject", trim = TRUE),
    object = pick_text(.x, ".rank_data .object", trim = TRUE)
  ))

# Scrape the result list of the search page into a data frame (`xdf`).
# This span had been corrupted by machine translation (translated function
# names, full-width punctuation, missing lines); it is restored to valid R.
library(rvest) # read_html(), html_nodes(), html_attr(), html_text(), %>%
library(purrr) # map_df()

# Read the HTML from the bare URL.
pg <- read_html(paste0(
  "https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php",
  "?lang=de&type=simple_query&query_words=&lang=de",
  "&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017",
  "&x=0&y=0"
))

# Replace a length-0 result (the selector matched nothing) with one missing
# string, so every field contributes a value to its row.
na_if_empty <- function(x) {
  if (length(x) == 0) NA_character_ else x
}

# Value of attribute `name` on the nodes matching `css` inside `node`.
pick_attr <- function(node, css, name) {
  na_if_empty(html_attr(html_nodes(node, css), name))
}

# Text of the nodes matching `css` inside `node`; `trim` is passed on to
# html_text().
pick_text <- function(node, css, trim = FALSE) {
  na_if_empty(html_text(html_nodes(node, css), trim = trim))
}

xdf <- pg %>%
  html_nodes("div.ranklist_content ol li") %>% # select enclosing nodes
  # iterate over each entry, pull out the desired parts and row-bind them
  map_df(~ list(
    link = pick_attr(.x, ".rank_title a", "href"),
    title = pick_text(.x, ".rank_title a"),
    publication_link = pick_attr(.x, ".published_info a", "href"),
    publication = pick_text(.x, ".published_info a"),
    court = pick_text(.x, ".rank_data .court", trim = TRUE),
    subject = pick_text(.x, ".rank_data .subject", trim = TRUE),
    object = pick_text(.x, ".rank_data .object", trim = TRUE)
  ))

如果有人能帮我提取

class="published_info small normal"

的标题，那就太好了

library(XML)

# `asText = TRUE` tells the parser that its first argument is the document
# text itself, so passing the URL string there parses the address instead of
# the page. Download the page text first, then parse that text. The page is
# HTML, so the HTML parser is used; htmlParse() returns an internal document.
page_text <- paste(readLines(url, warn = FALSE), collapse = "\n")
doc <- htmlParse(page_text, asText = TRUE)

# Scrape the result list of the search page into a data frame (`xdf`).
library(rvest) # read_html(), html_nodes(), html_attr(), html_text(), %>%
library(purrr) # map_df()

# Read the HTML. The argument must be the bare URL: with a leading "url <- "
# inside the string it is no longer a URL and the page is never fetched.
pg <- read_html(paste0(
  "https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php",
  "?lang=de&type=simple_query&query_words=&lang=de",
  "&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017",
  "&x=0&y=0"
))

# Replace a length-0 result (the selector matched nothing) with one missing
# string, so every field contributes a value to its row. A typed NA keeps the
# column character even when no entry has the field.
na_if_empty <- function(x) {
  if (length(x) == 0) NA_character_ else x
}

# Value of attribute `name` on the nodes matching `css` inside `node`.
pick_attr <- function(node, css, name) {
  na_if_empty(html_attr(html_nodes(node, css), name))
}

# Text of the nodes matching `css` inside `node`; `trim` is passed on to
# html_text().
pick_text <- function(node, css, trim = FALSE) {
  na_if_empty(html_text(html_nodes(node, css), trim = trim))
}

xdf <- pg %>%
  html_nodes("div.ranklist_content ol li") %>% # select enclosing nodes
  # iterate over each entry, pull out the desired parts and row-bind them
  map_df(~ list(
    link = pick_attr(.x, ".rank_title a", "href"),
    title = pick_text(.x, ".rank_title a"),
    publication_link = pick_attr(.x, ".published_info a", "href"),
    publication = pick_text(.x, ".published_info a"),
    court = pick_text(.x, ".rank_data .court", trim = TRUE),
    subject = pick_text(.x, ".rank_data .subject", trim = TRUE),
    object = pick_text(.x, ".rank_data .object", trim = TRUE)
  ))