如何在R中获取XML中某个节点的子节点数量

如何在R中获取XML中的子级数,r,xml,R,Xml,我是R方面的新手。现在我想解析一个XML文件,除了每篇文章的作者数量之外,其他的每一部分都会完成。我采用了以下代码: 但它返回XML格式的作者总数。其余的解析由 library(tidyverse) library(XML) library(methods) xmldata <- xmlParse("pubmedsample.jun18.xml", useInternalNodes = TRUE) publication <- tibble(PMID = as.numeric(xpa

我是R新手。现在我想解析一个XML文件,除了每篇文章的作者数量之外,其他部分都已经完成。我尝试了以下代码:

但它返回的是整个XML文件中的作者总数,而不是每篇文章的作者数。其余部分的解析由以下代码完成:

library(tidyverse)
library(XML)
library(methods)
# Parse the PubMed export into an internal (C-level) XML document.
xmldata <- xmlParse("pubmedsample.jun18.xml", useInternalNodes = TRUE)

# One node per article. Every column below is derived from these nodes, so all
# columns have exactly one entry per article and stay row-aligned even when an
# article lacks a field.
citations <- getNodeSet(xmldata, "//PubmedArticle/MedlineCitation")

# Text of the first node matching `path` (relative to each node in `nodes`),
# or NA when a node has no match. Always returns a character vector with one
# element per input node.
first_value <- function(nodes, path) {
  vapply(nodes, function(node) {
    hits <- xpathSApply(node, path, xmlValue)
    if (length(hits) > 0) hits[[1]] else NA_character_
  }, character(1))
}

# Build a "YYYY-MM-DD" string per node from the Year/Month/Day children found
# under `path`; the result is NA when any of the three parts is missing.
citation_date <- function(nodes, path) {
  part <- function(unit) {
    as.numeric(first_value(nodes, paste0(path, "/", unit)))
  }
  # Namespace-qualified because library(tidyverse) does not attach lubridate
  # in every tidyverse version.
  as.character(lubridate::make_date(part("Year"), part("Month"), part("Day")))
}

# Note: a per-article author count can be obtained the same way, e.g.
# length(getNodeSet(node, "./Article/AuthorList/Author")) for each citation.
publication <- tibble(
  PMID = as.numeric(first_value(citations, "./PMID")),
  ISSN = first_value(citations, "./Article/Journal/ISSN"),
  completed_date = citation_date(citations, "./DateCompleted"),
  revised_date = citation_date(citations, "./DateRevised"),
  # First publication type of *each* article (previously the first article's
  # value was recycled to every row).
  publication_type = first_value(
    citations, ".//PublicationTypeList/PublicationType"
  ),
  article_title = first_value(citations, ".//ArticleTitle")
)

有人能教我如何获得每篇文章的作者数量吗?非常感谢

为了让您开始,我会这样做:

# Convert the parsed document to nested lists and keep each top-level entry's
# MedlineCitation element. `[[` is used instead of `$` so that element names
# are matched exactly (no partial matching).
lst <- lapply(xmlToList(xmldata), function(x) x[["MedlineCitation"]])

# Extract the AuthorList element of each article (NULL when it has none).
lst.author <- lapply(lst, function(x) x[["Article"]][["AuthorList"]])

# Count the entries named "Author" in each AuthorList. vapply() guarantees an
# integer vector even for empty input; a missing AuthorList counts as 0.
n.author <- vapply(
  lst.author,
  function(x) sum(names(x) == "Author"),
  integer(1)
)
#PubmedArticle PubmedArticle PubmedArticle PubmedArticle PubmedArticle
#            1             0            10             1             6
#PubmedArticle
#            2
我发现操作列表要比操作XMLInternalDocument容易得多,因此先把XML转换成了列表。这样一来,任务就归结为遍历嵌套列表并提取所需的信息。

对于使用xml2和purrr的用户:


XPath解决方案是计算每个PubmedArticle中AuthorList节点的xmlChildren数:

library(XML)
library(tidyverse)
library(plyr)

# Parse the PubMed export and collect one <PubmedArticle> node per article.
doc <- xmlParse("pubmedsample.jun18.xml")
articles <- getNodeSet(doc, "//PubmedArticle")

# Text of the first node matching `path` below `node`, or NA when nothing
# matches, so an article with a missing field still yields exactly one row.
first_value <- function(node, path) {
  hits <- xpathSApply(node, path, xmlValue)
  if (length(hits) > 0) hits[[1]] else NA_character_
}

# Build one row per article and stack the rows into a single tibble.
output <- ldply(articles, function(x) {
  # Count the <Author> elements directly rather than relying on how sapply()
  # simplifies the children of <AuthorList>.
  author_count <- length(getNodeSet(x, ".//AuthorList/Author"))

  # ... extract further fields here ...

  tibble(
    ISSN = first_value(x, ".//Article/Journal/ISSN"),
    data_completed_year = first_value(x, ".//DateCompleted/Year"),
    # Articles without any author are reported as NA rather than 0.
    authors = if (author_count > 0L) author_count else NA_integer_
  )
}) %>%
  as_tibble()

head(output)
输出:

# A tibble: 161 x 3
  ISSN      data_completed_year authors
  <chr>     <chr>                 <int>
1 0095-3814 1976                      1
2 0377-8231 1993                     NA
3 0022-2623 1991                     10
4 0021-8820 1987                      6
5 0014-2956 1994                      2
6 1051-0443 1993                      4
7 0017-0011 1996                      2
8 0026-895X 1996                      5
9 1059-2725 1996                      4
10 0009-7322 1997                     11
# ... with 151 more rows

非常感谢你!剩下的部分我自己能搞定。——不客气,@Vincent;祝你今后工作顺利。
library(XML)
library(tidyverse)
library(plyr)

# Parse the PubMed export and collect one <PubmedArticle> node per article.
doc <- xmlParse("pubmedsample.jun18.xml")
articles <- getNodeSet(doc, "//PubmedArticle")

# Text of the first node matching `path` below `node`, or NA when nothing
# matches, so an article with a missing field still yields exactly one row.
first_value <- function(node, path) {
  hits <- xpathSApply(node, path, xmlValue)
  if (length(hits) > 0) hits[[1]] else NA_character_
}

# Build one row per article and stack the rows into a single tibble.
output <- ldply(articles, function(x) {
  # Count the <Author> elements directly rather than relying on how sapply()
  # simplifies the children of <AuthorList>.
  author_count <- length(getNodeSet(x, ".//AuthorList/Author"))

  # ... extract further fields here ...

  tibble(
    ISSN = first_value(x, ".//Article/Journal/ISSN"),
    data_completed_year = first_value(x, ".//DateCompleted/Year"),
    # Articles without any author are reported as NA rather than 0.
    authors = if (author_count > 0L) author_count else NA_integer_
  )
}) %>%
  as_tibble()

head(output)
# A tibble: 161 x 3
  ISSN      data_completed_year authors
  <chr>     <chr>                 <int>
1 0095-3814 1976                      1
2 0377-8231 1993                     NA
3 0022-2623 1991                     10
4 0021-8820 1987                      6
5 0014-2956 1994                      2
6 1051-0443 1993                      4
7 0017-0011 1996                      2
8 0026-895X 1996                      5
9 1059-2725 1996                      4
10 0009-7322 1997                     11
# ... with 151 more rows