Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/75.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Html 从R中具有某些缺失值的列表中提取HREF_Html_R_List_Rvest_Purrr - Fatal编程技术网

Html 从R中具有某些缺失值的列表中提取HREF

Html 从R中具有某些缺失值的列表中提取HREF,html,r,list,rvest,purrr,Html,R,List,Rvest,Purrr,我热衷于将一组jekyll主题的源URL和演示URL提取到data.frame中 library(rvest) info <- read_html("https://github.com/jekyll/jekyll/wiki/themes") data <- info %>% html_nodes(" #wiki-body li") data {xml_nodeset (115)} [11] <li>Typewriter - (<a href="h

我热衷于将一组jekyll主题的源URL和演示URL提取到data.frame中

library(rvest)

# Download and parse the wiki page that lists the Jekyll themes.
info <- read_html("https://github.com/jekyll/jekyll/wiki/themes")

# Select the <li> elements located under the element with id "wiki-body".
data <- html_nodes(info, " #wiki-body li")

data
{xml_nodeset (115)}


[11] <li>Typewriter - (<a href="https://github.com/alixedi/typewriter">source</a>, <a href="http://alixedi.github.io/typewriter">demo</a>)</li>
[12] <li>block-log - (<a href="https://github.com/anandubajith/block-log">source</a>), <a href="https://anandu.net/demo/block-log/">demo</a>)</li>
[13] <li>Otter Pop - (<a href="https://github.com/tybenz/otter-pop">source</a>)</li>
我能够将所有的 HREF 提取为一个向量,但正如您在 [13] 中所看到的,某些站点没有演示链接,因此我遇到了一些困难。


有没有一种简单的方法可以从数据创建df?可能使用purrr库

您可以使用 XPath 分别收集有演示链接的组和没有演示链接的组:

# Build a data frame with one row per <li> node in `data`: the item's text
# plus the markup of its first two child nodes (expected to be the "source"
# and "demo" links; the second is NA when the item has a single child).
# Rows are collected in a list and bound once instead of growing the data
# frame with rbind() on every iteration, and seq_along() avoids iterating
# over c(1, 0) when `data` is empty.
rows <- lapply(seq_along(data), function(i) {
  # Serialise the children once and reuse the result for both link columns.
  links <- as.character(html_children(data[[i]]))
  data.frame(
    name = html_text(data[i]),
    source = links[1],
    demo = links[2],
    stringsAsFactors = FALSE
  )
})
data_out <- do.call(rbind, rows)

# Strip the trailing link labels from the name and the anchor markup from the
# links, leaving the bare theme name and URLs.
data_out$name <- gsub(" - [(]source, demo[)]", "", data_out$name)
data_out$source <- gsub("<a href=\"|\">source</a>", "", data_out$source)
data_out$demo <- gsub("<a href=\"|\">demo</a>", "", data_out$demo)
# List items whose text contains both "source" and "demo".
withDemo <- html_nodes(
  info,
  xpath = "//li[contains(., 'source') and contains(., 'demo')]"
)

# List items whose text contains "source" but not "demo".
withoutDemo <- html_nodes(
  info,
  xpath = "//li[contains(., 'source') and not(contains(.,'demo'))]"
)
“name”列现在包含 “Jalpc - (source, demo)” 和 “Bitwiser Material (source, demo)” 等条目。您可以使用 sub()(或 gsub())去掉多余的 “(source, demo)” 部分:

# Drop everything from the first " (" or " - (" to the end of each name.
allInfo$name <- sub(
  pattern = "\\s(-\\s)?\\(.+$",
  replacement = "",
  x = allInfo$name,
  perl = TRUE
)

以下是您要的 purrr 风格的答案:

library(rvest)
library(purrr)
library(dplyr)

# Fetch and parse the wiki page that lists the Jekyll themes.
info <- "https://github.com/jekyll/jekyll/wiki/themes" %>%
  read_html()

# Pick the <li> elements that are grandchildren of the div whose class is
# "markdown-body".
themes <- info %>%
  html_nodes(xpath = ".//div[@class='markdown-body']/*/li")

# Return the first element of `x`, or NA when `x` has length zero, so the
# result is always a single value.
# A plain if/else replaces ifelse(), which is intended for vectorised tests
# rather than a scalar length check.
zero_to_na <- function(x) {
  if (length(x) == 0) NA else x[[1]]
}

# One row per theme node: the name is the item text with everything from the
# first " (" / " - (" onwards removed; the URLs are the href attributes of the
# links whose text contains "source" / "demo".
# zero_to_na() is applied to both link columns so that an item lacking either
# link yields NA instead of making map_chr() fail on a zero-length result.
# tibble() replaces the deprecated data_frame().
df <- tibble(
  name = gsub(" [- ]*\\(.*$", "", html_text(themes)),
  source = map_chr(themes, function(node) {
    links <- html_nodes(node, xpath = ".//a[contains(., 'source')]")
    zero_to_na(html_attr(links, "href"))
  }),
  demo = map_chr(themes, function(node) {
    links <- html_nodes(node, xpath = ".//a[contains(., 'demo')]")
    zero_to_na(html_attr(links, "href"))
  })
)

glimpse(df)
## Observations: 115
## Variables: 3
## $ name   <chr> "Jalpc", "Pixyll", "Jekyll Metro", "Midnight", "Leap Day", "F...
## $ source <chr> "https://github.com/Jack614/jalpc_jekyll_theme", "https://git...
## $ demo   <chr> "http://www.jack003.com", "http://pixyll.com/", "http://blog-...

用 gsub / sub 等函数去掉“name”中您不想要的任何部分。

感谢您提供了另一种替代方法。这似乎是最快的方法,可能更适合较大的输入。感谢您加入了 purrr 函数。这是一条学习曲线。
# Collect the href attribute of every child node of the items in `withoutDemo`.
source <- html_attr(html_children(withoutDemo), "href")

# set demo = NA for easy rbind-ing
source <- data.frame(
  name = html_text(withoutDemo),
  source = source,
  demo = NA
)

# Stack these rows under `sourceNdemo`.
# NOTE(review): `sourceNdemo` is not defined in this snippet; presumably it
# holds the rows for items that do have a demo link. Confirm before running.
allInfo <- rbind(sourceNdemo, source)

# Drop everything from the first " (" or " - (" to the end of each name.
allInfo$name <- sub(
  pattern = "\\s(-\\s)?\\(.+$",
  replacement = "",
  x = allInfo$name,
  perl = TRUE
)
library(rvest)
library(purrr)
library(dplyr)

# Fetch and parse the wiki page that lists the Jekyll themes.
info <- "https://github.com/jekyll/jekyll/wiki/themes" %>%
  read_html()

# Pick the <li> elements that are grandchildren of the div whose class is
# "markdown-body".
themes <- info %>%
  html_nodes(xpath = ".//div[@class='markdown-body']/*/li")

# Return the first element of `x`, or NA when `x` has length zero, so the
# result is always a single value.
# A plain if/else replaces ifelse(), which is intended for vectorised tests
# rather than a scalar length check.
zero_to_na <- function(x) {
  if (length(x) == 0) NA else x[[1]]
}

# One row per theme node: the name is the item text with everything from the
# first " (" / " - (" onwards removed; the URLs are the href attributes of the
# links whose text contains "source" / "demo".
# zero_to_na() is applied to both link columns so that an item lacking either
# link yields NA instead of making map_chr() fail on a zero-length result.
# tibble() replaces the deprecated data_frame().
df <- tibble(
  name = gsub(" [- ]*\\(.*$", "", html_text(themes)),
  source = map_chr(themes, function(node) {
    links <- html_nodes(node, xpath = ".//a[contains(., 'source')]")
    zero_to_na(html_attr(links, "href"))
  }),
  demo = map_chr(themes, function(node) {
    links <- html_nodes(node, xpath = ".//a[contains(., 'demo')]")
    zero_to_na(html_attr(links, "href"))
  })
)

glimpse(df)
## Observations: 115
## Variables: 3
## $ name   <chr> "Jalpc", "Pixyll", "Jekyll Metro", "Midnight", "Leap Day", "F...
## $ source <chr> "https://github.com/Jack614/jalpc_jekyll_theme", "https://git...
## $ demo   <chr> "http://www.jack003.com", "http://pixyll.com/", "http://blog-...
# Row-wise variant: build a one-row tibble per theme node and let map_df()
# bind the rows together.
# Both link columns go through zero_to_na() so that an item without a matching
# link still contributes one row (a zero-length column would otherwise produce
# a zero-row tibble and silently drop the theme). as.character() keeps the
# missing value typed as character so every per-node tibble has the same
# column types. tibble() replaces the deprecated data_frame().
map_df(themes, function(x) {
  source_links <- html_nodes(x, xpath = ".//a[contains(., 'source')]")
  demo_links <- html_nodes(x, xpath = ".//a[contains(., 'demo')]")
  tibble(
    name = gsub(" [- ]*\\(.*$", "", html_text(x)),
    source = as.character(zero_to_na(html_attr(source_links, "href"))),
    demo = as.character(zero_to_na(html_attr(demo_links, "href")))
  )
})