使用rvest从表中的列提取超文本和超链接_R_Web Scraping_Rvest

使用rvest从表中的列提取超文本和超链接

r web-scraping

使用rvest从表中的列提取超文本和超链接,r,web-scraping,rvest,R,Web Scraping,Rvest,我想从下表中的“名称”列中提取超文本和超链接：。我的目标是创建一个数据框架，其中一列作为名称，另一列作为链接。使用下面的代码，我可以收集超链接，但我不知道如何将链接与实际名称匹配 library(rvest) library(dplyr) page <- read_html('http://www.ema.europa.eu/ema/index.jsp?curl=pages/medicines/landing/smop_search.jsp&mid=WC0b01ac05800

我想从下表的“Name”列中提取链接文本和超链接。我的目标是创建一个数据框（data frame），其中一列是名称，另一列是对应的链接。使用下面的代码，我可以收集到超链接，但不知道如何将链接与实际名称对应起来。

library(rvest)
library(dplyr)

# Collect the `href` attribute of every anchor located under a <tbody>.
# Only the link targets are kept here; the anchor text is not captured.
page <- html_attr(
  html_nodes(
    read_html('http://www.ema.europa.eu/ema/index.jsp?curl=pages/medicines/landing/smop_search.jsp&mid=WC0b01ac058001d127&startLetter=View%20all&applicationType=Initial%20authorisation&applicationType=Post%20authorisation&keyword=Enter%20keywords&keyword=Enter%20keywords&searchkwByEnter=false&searchType=Name&alreadyLoaded=true&status=Positive&status=Negative&jsenabled=false&orderBy=opinionDate&pageNo=1'),
    'tbody a'
  ),
  'href'
)

# Wrap the vector of links in a one-column data frame (the column is `page`).
dfpage <- data.frame(page)

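library(rvest)
library(dplyr)

page <- read_html('http://www.ema.europa.eu/ema/index.jsp?curl=pages/medicines/landing/smop_search.jsp&mid=WC0b01ac058001d127&startLetter=View%20all&applicationType=Initial%20authorisation&applicationType=Post%20authorisation&keyword=Enter%20keywords&keyword=Enter%20keywords&searchkwByEnter=false&searchType=Name&alreadyLoaded=true&status=Positive&status=Negative&jsenabled=false&orderBy=opinionDate&pageNo=1') %>%
  html_nodes('tbody a') %>% html_attr('href')

dfpage <- data.frame(page)

我将使用以下代码：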
library(rvest)
library(tidyverse)

# Address of the search-results page to scrape.
search_url <- 'http://www.ema.europa.eu/ema/index.jsp?curl=pages/medicines/landing/smop_search.jsp&mid=WC0b01ac058001d127&startLetter=View%20all&applicationType=Initial%20authorisation&applicationType=Post%20authorisation&keyword=Enter%20keywords&keyword=Enter%20keywords&searchkwByEnter=false&searchType=Name&alreadyLoaded=true&status=Positive&status=Negative&jsenabled=false&orderBy=opinionDate&pageNo=1'

# Download and parse the page once. The table and the links are both read
# from this single document, so they always describe the same snapshot and
# the server is only hit one time.
doc <- read_html(search_url)

# Every <table> in the document converted to a data frame; the first is used.
page <- doc %>%
  html_nodes("table") %>%
  rvest::html_table()

data <- as_tibble(page[[1]])

# `href` of every node matched by the selector; matched nodes that carry no
# `href` attribute come back as NA.
page_link <- doc %>%
  html_nodes(".key-detail a , .alt~ .alt th") %>%
  html_attr('href')

# One-column tibbles (column `value`), built with tibble() instead of the
# deprecated as_data_frame().
link <- tibble(value = page_link)
# The first match is discarded -- presumably it does not belong to a data row
# (TODO confirm against the live page).
links <- tibble(value = link$value[-1])

# cbind() pairs rows purely by position, so refuse to continue when the number
# of links does not equal the number of table rows.
if (nrow(data) != nrow(links)) {
  stop(
    "Found ", nrow(links), " links for ", nrow(data), " table rows; ",
    "cannot match links to names by position.",
    call. = FALSE
  )
}

result <- cbind(data, links)

# Keep only the name and its link.
final <- result[, c("Name", "value")]


希望这对你有帮助。顺便说一下，我是借助 Chrome 的 SelectorGadget 扩展程序找到正确的 CSS 选择器的。
库（rvest）
library(rvest)
library(tidyverse)

# URL template for one page of search results, meant to be filled in with
# sprintf(): the trailing `%s` receives the page number, and each literal
# percent sign of the URL-encoded path is doubled (`%%2F`) so that sprintf()
# emits `%2F` instead of treating it as a format specifier.
url_template <- "http://www.ema.europa.eu/ema/index.jsp?searchType=Name&applicationType=Initial+authorisation&applicationType=Post+authorisation&searchkwByEnter=false&mid=WC0b01ac058001d127&status=Positive&status=Negative&keyword=Enter+keywords&keyword=Enter+keywords&alreadyLoaded=true&curl=pages%%2Fmedicines%%2Flanding%%2Fsmop_search.jsp&startLetter=View+all&pageNo=%s"

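first <- sprintf(url_template, 1)

pg <- read_html(first)

html_nodes(pg, "div.pagination > ul > li:not([class])") %>%
  tail(1) %>%
  html_text(trim = TRUE) %>%
  as.numeric() -> total_pages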

目前只有 3 页，但将来可能会有更多。因此先设置一个进度条，然后逐页抓取表格并整理列名，再提取链接并将其添加到表中：
# `total_pages` must already hold the number of result pages. Validate it up
# front: `1:total_pages` would silently count down (1, 0) for zero pages and
# fail with an obscure message for NA or empty input.
if (length(total_pages) != 1 || is.na(total_pages) || total_pages < 0) {
  stop("`total_pages` must be a single non-negative number.", call. = FALSE)
}

pb <- progress_estimated(total_pages)

# Scrape one results page.
#
# URL: address of a single results page.
# Returns a tibble with the five table columns plus `url`, the `href` of the
# link inside each row header. mutate() errors if the number of links does
# not match the number of table rows, so names and links cannot silently
# fall out of step.
scrape_page <- function(URL) {

  pb$tick()$print()

  pg <- read_html(URL)

  html_table(pg, trim = TRUE) %>%
    .[[1]] %>%
    set_names(c("name", "active_substance", "inn", "adopted", "outcome")) %>%
    as_tibble() %>%
    mutate(url = html_nodes(pg, "th[scope='row'] > a") %>% html_attr("href"))

}

# seq_len() yields an empty sequence for zero pages, so no bogus page is
# requested; the per-page tibbles are row-bound into one data frame.
pending_df <- sprintf(url_template, seq_len(total_pages)) %>%
  map_df(scrape_page)

glimpse(pending_df)
## Observations: 67
## Variables: 6
## $ name             <chr> "Lifmior", "Tamiflu", "Jylamvo", "Terrosa", "...
## $ active_substance <chr> "etanercept", "oseltamivir", "methotrexate", ...
## $ inn              <chr> "etanercept", "oseltamivir", "methotrexate", ...
## $ adopted          <chr> "2016-12-15", "2015-03-26", "2017-01-26", "20...
## $ outcome          <chr> "Positive", "Positive", "Positive", "Positive...
## $ url              <chr> "index.jsp?curl=pages/medicines/human/medicin...

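pb <- progress_estimated(total_pages)

sprintf(url_template, 1:total_pages) %>% 
  map_df(function(URL) {

    pb$tick()$print()

    pg <- read_html(URL)

    html_table(pg, trim = TRUE) %>%
      .[[1]] %>%
      set_names(c("name", "active_substance", "inn", "adopted", "outcome")) %>%
      as_tibble() %>%
      mutate(url = html_nodes(pg, "th[scope='row'] > a") %>% html_attr("href"))

  }) -> pending_df

glimpse(pending_df)
## Observations: 67
## Variables: 6
## $ name             <chr> "Lifmior", "Tamiflu", "Jylamvo", "Terrosa", "...
## $ active_substance <chr> "etanercept", "oseltamivir", "methotrexate", ...
## $ inn              <chr> "etanercept", "oseltamivir", "methotrexate", ...
## $ adopted          <chr> "2016-12-15", "2015-03-26", "2017-01-26", "20...
## $ outcome          <chr> "Positive", "Positive", "Positive", "Positive...
## $ url              <chr> "index.jsp?curl=pages/medicines/human/medicin...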

library(rvest)
library(tidyverse)

# URL template for one page of search results, filled in with sprintf():
# the trailing `%s` receives the page number, and each literal percent sign
# of the URL-encoded path is doubled (`%%2F`) so sprintf() emits `%2F`.
url_template <- "http://www.ema.europa.eu/ema/index.jsp?searchType=Name&applicationType=Initial+authorisation&applicationType=Post+authorisation&searchkwByEnter=false&mid=WC0b01ac058001d127&status=Positive&status=Negative&keyword=Enter+keywords&keyword=Enter+keywords&alreadyLoaded=true&curl=pages%%2Fmedicines%%2Flanding%%2Fsmop_search.jsp&startLetter=View+all&pageNo=%s"

# Fetch the first page to find out how many pages there are.
first <- sprintf(url_template, 1)

pg <- read_html(first)

# The text of the last pagination item without a class attribute is read as
# the page count.
total_pages <- html_nodes(pg, "div.pagination > ul > li:not([class])") %>%
  tail(1) %>%
  html_text(trim = TRUE) %>%
  as.numeric()

if (length(total_pages) == 0) {
  # No pagination items matched: treat the results as fitting on one page
  # instead of failing on an empty page count.
  total_pages <- 1
} else if (is.na(total_pages) || total_pages < 1) {
  stop("Could not determine the number of result pages.", call. = FALSE)
}

pb <- progress_estimated(total_pages)

# Scrape one results page.
#
# URL: address of a single results page.
# Returns a tibble with the five table columns plus `url`, the `href` of the
# link inside each row header. mutate() errors if the number of links does
# not match the number of table rows, so names and links cannot silently
# fall out of step.
scrape_page <- function(URL) {

  pb$tick()$print()

  pg <- read_html(URL)

  html_table(pg, trim = TRUE) %>%
    .[[1]] %>%
    set_names(c("name", "active_substance", "inn", "adopted", "outcome")) %>%
    as_tibble() %>%
    mutate(url = html_nodes(pg, "th[scope='row'] > a") %>% html_attr("href"))

}

# seq_len() instead of `1:total_pages`, which counts down when the upper
# bound is below 1; the per-page tibbles are row-bound into one data frame.
pending_df <- sprintf(url_template, seq_len(total_pages)) %>%
  map_df(scrape_page)

glimpse(pending_df)
## Observations: 67
## Variables: 6
## $ name             <chr> "Lifmior", "Tamiflu", "Jylamvo", "Terrosa", "...
## $ active_substance <chr> "etanercept", "oseltamivir", "methotrexate", ...
## $ inn              <chr> "etanercept", "oseltamivir", "methotrexate", ...
## $ adopted          <chr> "2016-12-15", "2015-03-26", "2017-01-26", "20...
## $ outcome          <chr> "Positive", "Positive", "Positive", "Positive...
## $ url              <chr> "index.jsp?curl=pages/medicines/human/medicin...