如何创建一个for循环,从html_nodes中提取数据并填充数据表
我从RePEc数据库中获得了一系列发布标识符。我需要从数据库中获取参考列表,我可以这样做:如何创建从html_节点提取数据并填充表的for循环,r,R,我从RePEc数据库中获得了一系列发布标识符。我需要从数据库中获取参考列表,我可以这样做: identifier <- "RePEc:imf:imfwpa:01/191" url_base <- "http://citec.repec.org/api/amf/" url <- paste0(url_base, identifier) get_data <- read_html(url) references <- html_nodes(get_data,'refer
# Fetch the CitEc AMF record for one RePEc handle and list the works it cites.
library(rvest)  # supplies read_html(), html_nodes() and the %>% pipe used below

identifier <- "RePEc:imf:imfwpa:01/191"
url_base <- "http://citec.repec.org/api/amf/"
url <- paste0(url_base, identifier)
get_data <- read_html(url)
# Each cited work is a <text ref="..."> element nested under <references>.
references <- html_nodes(get_data, "references") %>% html_nodes("text")
print(references)
{xml_nodeset (6)}
[1] <text ref="RePEc:rio:texdis:400"></text>
[2] <text ref="RePEc:fip:fednrp:9608"></text>
[3] <text ref="RePEc:nbr:nberwo:1172"></text>
[4] <text ref="RePEc:bla:ecnote:v:28:y:1999:i:3:p:335-355"></text>
[5] <text ref="RePEc:imf:imfwpa:00/69"></text>
[6] <text ref="RePEc:eee:jbfina:v:24:y:2000:i:1-2:p:203-227"></text>
# Target shape: one row per cited work, paired with the handle of the citing paper.
references <- c(
  "RePEc:rio:texdis:400",
  "RePEc:fip:fednrp:9608",
  "RePEc:nbr:nberwo:1172",
  "RePEc:bla:ecnote:v:28:y:1999:i:3:p:335-355",
  "RePEc:imf:imfwpa:00/69",
  "RePEc:eee:jbfina:v:24:y:2000:i:1-2:p:203-227"
)
# The citing handle is repeated once for every reference it contains.
identifier <- rep("RePEc:imf:imfwpa:01/191", times = length(references))
df <- data.frame(identifier, references)
我尝试使用html_text(references)
,但它只给了我一系列空字符串
一旦我有了这些数据,我想用这些值中的每一个在原始标识符旁边创建一个数据框。换句话说,我需要这样的东西:
# Fetch the CitEc AMF record for one RePEc handle and list the works it cites.
library(rvest)  # supplies read_html(), html_nodes() and the %>% pipe used below

identifier <- "RePEc:imf:imfwpa:01/191"
url_base <- "http://citec.repec.org/api/amf/"
url <- paste0(url_base, identifier)
get_data <- read_html(url)
# Each cited work is a <text ref="..."> element nested under <references>.
references <- html_nodes(get_data, "references") %>% html_nodes("text")
print(references)
{xml_nodeset (6)}
[1] <text ref="RePEc:rio:texdis:400"></text>
[2] <text ref="RePEc:fip:fednrp:9608"></text>
[3] <text ref="RePEc:nbr:nberwo:1172"></text>
[4] <text ref="RePEc:bla:ecnote:v:28:y:1999:i:3:p:335-355"></text>
[5] <text ref="RePEc:imf:imfwpa:00/69"></text>
[6] <text ref="RePEc:eee:jbfina:v:24:y:2000:i:1-2:p:203-227"></text>
# Target shape: one row per cited work, paired with the handle of the citing paper.
references <- c(
  "RePEc:rio:texdis:400",
  "RePEc:fip:fednrp:9608",
  "RePEc:nbr:nberwo:1172",
  "RePEc:bla:ecnote:v:28:y:1999:i:3:p:335-355",
  "RePEc:imf:imfwpa:00/69",
  "RePEc:eee:jbfina:v:24:y:2000:i:1-2:p:203-227"
)
# The citing handle is repeated once for every reference it contains.
identifier <- rep("RePEc:imf:imfwpa:01/191", times = length(references))
df <- data.frame(identifier, references)
标识符文档是XML。我认为使用xml2
更合适
# Read the CitEc AMF record as XML and collect the "ref" attribute of every
# cited work.
library(xml2)

identifier <- "RePEc:imf:imfwpa:01/191"
url_base <- "http://citec.repec.org/api/amf/"
url <- paste0(url_base, identifier)
# Plain function calls are used instead of %>%: xml2 does not export the pipe,
# so a chain would fail unless magrittr or rvest were also attached.
amf_doc <- read_xml(url)
# "d1" is the prefix used in the XPath for the document's default namespace.
reference_nodes <- xml_find_all(amf_doc, "//d1:references/d1:text")
references <- xml_attr(reference_nodes, "ref")
您需要安装xml2
软件包才能使其正常工作
# Installs the xml2 package; this only needs to be run once per R library.
install.packages("xml2")
或者,正如Ben所提到的,使用rvest
,只需将html_attr("ref")
添加到脚本中即可
# rvest variant: walk down to the <text> nodes step by step, then read each
# node's "ref" attribute (the nodes are empty, so their text is blank).
get_data <- read_html(url)
reference_block <- html_nodes(get_data, "references")
text_nodes <- html_nodes(reference_block, "text")
references <- html_attr(text_nodes, "ref")
输出
# [1] "RePEc:rio:texdis:400"
# [2] "RePEc:fip:fednrp:9608"
# [3] "RePEc:nbr:nberwo:1172"
# [4] "RePEc:bla:ecnote:v:28:y:1999:i:3:p:335-355"
# [5] "RePEc:imf:imfwpa:00/69"
# [6] "RePEc:eee:jbfina:v:24:y:2000:i:1-2:p:203-227"
# identifier references
# 1 RePEc:imf:imfwpa:01/191 RePEc:rio:texdis:400
# 2 RePEc:imf:imfwpa:01/191 RePEc:fip:fednrp:9608
# 3 RePEc:imf:imfwpa:01/191 RePEc:nbr:nberwo:1172
# 4 RePEc:imf:imfwpa:01/191 RePEc:bla:ecnote:v:28:y:1999:i:3:p:335-355
# 5 RePEc:imf:imfwpa:01/191 RePEc:imf:imfwpa:00/69
# 6 RePEc:imf:imfwpa:01/191 RePEc:eee:jbfina:v:24:y:2000:i:1-2:p:203-227
# 7 RePEc:imf:imfwpa:02/191 RePEc:wck:wckewp:34/99
# 8 RePEc:imf:imfwpa:02/191 RePEc:nbr:nberwo:7018
# 9 RePEc:imf:imfwpa:02/191 RePEc:wop:wispod:1132-97
# 10 RePEc:imf:imfwpa:02/191 RePEc:aea:aecrev:v:88:y:1998:i:3:p:478-94
# 11 RePEc:imf:imfwpa:02/191 RePEc:mie:wpaper:341
# 12 RePEc:imf:imfwpa:02/191 RePEc:eee:inecon:v:4:y:1974:i:2:p:177-185
# 13 RePEc:imf:imfwpa:02/191 RePEc:imf:imfwpa:97/116
# 14 RePEc:imf:imfwpa:02/191 RePEc:nbr:nberwo:7539
# 15 RePEc:imf:imfwpa:02/191 RePEc:aea:aecrev:v:90:y:2000:i:2:p:161-167
# 16 RePEc:imf:imfwpa:02/191 RePEc:eee:inecon:v:50:y:2000:i:1:p:51-71
# 17 RePEc:imf:imfwpa:02/191 RePEc:nbr:nberwo:5427
# 18 RePEc:imf:imfwpa:02/191 RePEc:eee:ecochp:5-58
# 19 RePEc:imf:imfwpa:02/191 RePEc:nbr:nberwo:6591
在html_nodes("text")
之后添加 %>% html_attr("ref")
,您应该会得到所需的文本。谢谢!您知道如何将此输出转换为数据框(即问题的第二部分)吗?