使用rvest抓取带有可变标签的页面

使用rvest抓取带有可变标签的页面,r,xml,web-scraping,rvest,R,Xml,Web Scraping,Rvest,我的问题 我正在尝试从此URL中抓取文档: url <- https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php?lang=de&type=simple_query&query_words=&lang=de&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017&x=0&y=0

我的问题

我正在尝试从此URL中抓取文档:

# Address of the search-result page to scrape. It must be a quoted string:
# a bare URL is not valid R syntax.
url <- "https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php?lang=de&type=simple_query&query_words=&lang=de&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017&x=0&y=0"
但是,我不知道如何编写能处理可变节点的代码

找到了解决方案:

# Scrape the bger.ch search-result list into a data frame, one row per hit.
#
# rvest supplies read_html(), html_nodes(), html_text() and html_attr() and
# re-exports %>%; purrr supplies map_df(). Load them explicitly so the
# snippet runs on its own.
library(rvest)
library(purrr)

# Return `x` unchanged, or a character NA when the selector matched nothing.
# Without this a missing field would be a length-0 vector and the row could
# not be built. NA_character_ (rather than logical NA) keeps every column
# character even when a field is absent from all hits.
or_na <- function(x) {
  if (length(x) == 0) NA_character_ else x
}

# read the html
# The argument must be the bare address: the original string began with
# "url <- ", so it was not a fetchable URL.
pg <- read_html("https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php?lang=de&type=simple_query&query_words=&lang=de&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017&x=0&y=0")

xdf <- pg %>%
  html_nodes("div.ranklist_content ol li") %>% # select enclosing nodes
  # iterate over each, pulling out desired parts and coerce to data.frame
  map_df(~ list(
    link = html_nodes(.x, ".rank_title a") %>%
      html_attr("href") %>%
      or_na(),
    title = html_nodes(.x, ".rank_title a") %>%
      html_text() %>%
      or_na(),
    publication_link = html_nodes(.x, ".published_info a") %>%
      html_attr("href") %>%
      or_na(),
    publication = html_nodes(.x, ".published_info a") %>%
      html_text() %>%
      or_na(),
    court = html_nodes(.x, ".rank_data .court") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    subject = html_nodes(.x, ".rank_data .subject") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    object = html_nodes(.x, ".rank_data .object") %>%
      html_text(trim = TRUE) %>%
      or_na()
  ))
# Scrape the bger.ch search-result list into a data frame, one row per hit.
# (Restored from a machine-translated copy whose identifiers, quotes and
# pipe operators had been mangled into invalid R.)
#
# rvest supplies read_html(), html_nodes(), html_text() and html_attr() and
# re-exports %>%; purrr supplies map_df().
library(rvest)
library(purrr)

# Return `x` unchanged, or a character NA when the selector matched nothing.
# Without this a missing field would be a length-0 vector and the row could
# not be built.
or_na <- function(x) {
  if (length(x) == 0) NA_character_ else x
}

# read the html
pg <- read_html("https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php?lang=de&type=simple_query&query_words=&lang=de&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017&x=0&y=0")

xdf <- pg %>%
  html_nodes("div.ranklist_content ol li") %>% # select enclosing nodes
  # iterate over each, pulling out desired parts and coerce to data.frame
  map_df(~ list(
    link = html_nodes(.x, ".rank_title a") %>%
      html_attr("href") %>%
      or_na(),
    title = html_nodes(.x, ".rank_title a") %>%
      html_text() %>%
      or_na(),
    publication_link = html_nodes(.x, ".published_info a") %>%
      html_attr("href") %>%
      or_na(),
    publication = html_nodes(.x, ".published_info a") %>%
      html_text() %>%
      or_na(),
    court = html_nodes(.x, ".rank_data .court") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    subject = html_nodes(.x, ".rank_data .subject") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    object = html_nodes(.x, ".rank_data .object") %>%
      html_text(trim = TRUE) %>%
      or_na()
  ))
如果有人能帮我提取
class="published_info small normal"
的标题,那就太好了

library(XML)

# Parse the result page with the XML package.
# With asText = TRUE the parser treats its first argument as markup, so the
# page content has to be downloaded first; passing the address itself would
# make the parser try to parse the URL string.
# Assumes `url` is a length-one character vector holding the page address.
page_lines <- readLines(url, warn = FALSE)

# The target is an HTML page, so use the lenient HTML parser. htmlParse()
# returns an internal-node document, as useInternalNodes = TRUE did.
doc <- htmlParse(paste(page_lines, collapse = "\n"), asText = TRUE)
# Scrape the bger.ch search-result list into a data frame, one row per hit.
#
# rvest supplies read_html(), html_nodes(), html_text() and html_attr() and
# re-exports %>%; purrr supplies map_df(). Load them explicitly so the
# snippet runs on its own.
library(rvest)
library(purrr)

# Return `x` unchanged, or a character NA when the selector matched nothing.
# Without this a missing field would be a length-0 vector and the row could
# not be built. NA_character_ (rather than logical NA) keeps every column
# character even when a field is absent from all hits.
or_na <- function(x) {
  if (length(x) == 0) NA_character_ else x
}

# read the html
# The argument must be the bare address: the original string began with
# "url <- ", so it was not a fetchable URL.
pg <- read_html("https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php?lang=de&type=simple_query&query_words=&lang=de&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017&x=0&y=0")

xdf <- pg %>%
  html_nodes("div.ranklist_content ol li") %>% # select enclosing nodes
  # iterate over each, pulling out desired parts and coerce to data.frame
  map_df(~ list(
    link = html_nodes(.x, ".rank_title a") %>%
      html_attr("href") %>%
      or_na(),
    title = html_nodes(.x, ".rank_title a") %>%
      html_text() %>%
      or_na(),
    publication_link = html_nodes(.x, ".published_info a") %>%
      html_attr("href") %>%
      or_na(),
    publication = html_nodes(.x, ".published_info a") %>%
      html_text() %>%
      or_na(),
    court = html_nodes(.x, ".rank_data .court") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    subject = html_nodes(.x, ".rank_data .subject") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    object = html_nodes(.x, ".rank_data .object") %>%
      html_text(trim = TRUE) %>%
      or_na()
  ))