使用R解析xml内容以提取标题信息_R_Xml

使用R解析xml内容以提取标题信息

r xml

使用R解析xml内容以提取标题信息,r,xml,R,Xml,我有一个xml数据 <?xml version="1.0" encoding="UTF-8"?> <ClinVarResult-Set> <ClinVarSet ID="95075"> <RecordStatus>not current</RecordStatus> <Title>MPV17, 26-BP DEL, NT116 AND Navajo neurohepatopathy</Title> ……

我有一个xml数据

<?xml version="1.0" encoding="UTF-8"?>
<ClinVarResult-Set>
  <ClinVarSet ID="95075">
    <RecordStatus>not current</RecordStatus>
    <Title>MPV17, 26-BP DEL, NT116 AND Navajo neurohepatopathy</Title>
    <ReferenceClinVarAssertion DateCreated="2012-08-13" DateLastUpdated="2013-04-03" ID="75049">
      <ClinVarAccession Acc="RCV000017546" Version="1" Type="RCV" DateUpdated="2013-04-08"/>
      <RecordStatus>current</RecordStatus>
      <ClinicalSignificance DateLastEvaluated="2011-11-17">
        <ReviewStatus>classified by single submitter</ReviewStatus>
        <Description>pathogenic</Description>
      </ClinicalSignificance>
      <Assertion Type="variation to disease"/>
      <ExternalID DB="NCBI"/>
      <ObservedIn>
        <Sample>
          <Origin>germline</Origin>
          <Species TaxonomyId="9606">human</Species>
          <AffectedStatus>not provided</AffectedStatus>
        </Sample>
        <Method>
          <MethodType>curation</MethodType>
        </Method>
        <ObservedData ID="208542">
          <Attribute Type="Description">See 137960.0003 and Spinazzola et al. (2006).</Attribute>
          <Citation Type="general">
            <ID Source="PubMed">16582910</ID>
          </Citation>
        </ObservedData>
      </ObservedIn>
      <MeasureSet Type="Variant" ID="16163">
        <Measure Type="Deletion" ID="31202">
          <Name>
            <ElementValue Type="Alternate">MPV17, 26-BP DEL, NT116</ElementValue>
            <XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
          </Name>
          <AttributeSet>
            <Attribute Type="nucleotide change">26-BP DEL, NT116</Attribute>
            <XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
          </AttributeSet>
          <MeasureRelationship Type="variant in gene">
            <Name>
              <ElementValue Type="Preferred">MpV17 mitochondrial inner membrane protein</ElementValue>
            </Name>
            <Symbol>
              <ElementValue Type="Preferred">MPV17</ElementValue>
            </Symbol>
            <XRef ID="4358" DB="Gene"/>
            <XRef ID="137960" DB="OMIM" Type="MIM"/>
          </MeasureRelationship>
          <XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
        </Measure>
      </MeasureSet>
      <TraitSet Type="Disease" ID="5245">
        <Trait ID="3439" Type="Disease">
          <Name>
            <ElementValue Type="Preferred">Navajo neurohepatopathy</ElementValue>
            <XRef ID="3972" DB="Office of Rare Diseases"/>
          </Name>
          <Name>
            <ElementValue Type="Alternate">Navajo neuropathy</ElementValue>
          </Name>
          <Name>
            <ElementValue Type="Alternate">MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)</ElementValue>
            <XRef Type="MIM" ID="256810" DB="OMIM"/>
            <XRef Type="Allelic variant" ID="137960.0002" DB="OMIM"/>
            <XRef Type="Allelic variant" ID="137960.0003" DB="OMIM"/>
            <XRef Type="Allelic variant" ID="137960.0005" DB="OMIM"/>
            <XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
            <XRef Type="Allelic variant" ID="137960.0001" DB="OMIM"/>
            <XRef Type="Allelic variant" ID="137960.0006" DB="OMIM"/>
            <XRef Type="Allelic variant" ID="137960.0007" DB="OMIM"/>
          </Name>
          <Name>
            <ElementValue Type="Alternate">MPV17- Related Hepatocerebral Mitochondrial DNA Depletion Syndrome</ElementValue>
            <XRef ID="NBK92947" DB="GeneReviews"/>
          </Name>
          <Symbol>
            <ElementValue Type="Preferred">MTDPS6</ElementValue>
            <XRef Type="MIM" ID="256810" DB="OMIM"/>
          </Symbol>
          <Symbol>
            <ElementValue Type="Alternate">NN</ElementValue>
            <XRef Type="MIM" ID="256810" DB="OMIM"/>
            <XRef ID="3972" DB="Office of Rare Diseases"/>
          </Symbol>
          <Symbol>
            <ElementValue Type="Alternate">NNH</ElementValue>
            <XRef Type="MIM" ID="256810" DB="OMIM"/>
          </Symbol>
          <AttributeSet>
            <Attribute Type="age of onset">Childhood</Attribute>
          </AttributeSet>
          <Citation Type="review" Abbrev="GeneReviews">
            <ID Source="PubMed">22593919</ID>
          </Citation>
          <XRef ID="255229" DB="Orphanet"/>
          <XRef ID="C1850406" DB="MedGen"/>
          <XRef ID="NBK92947" DB="GeneReviews"/>
          <XRef Type="MIM" ID="256810" DB="OMIM"/>
        </Trait>
      </TraitSet>
    </ReferenceClinVarAssertion>
    <ClinVarAssertion ID="37818">
      <ClinVarSubmissionID localKey="137960.0004_MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)" title="MPV17, 26-BP DEL, NT116 _MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)" submitterDate="2011-11-17" submitter="OMIM"/>
      <ClinVarAccession Acc="SCV000037818" OrgID="3" Version="1" Type="SCV" DateUpdated="2013-04-08"/>
      <RecordStatus>current</RecordStatus>
      <ClinicalSignificance DateLastEvaluated="2011-11-17">
        <Description>pathogenic</Description>
      </ClinicalSignificance>
      <Assertion Type="variation to disease"/>
      <ObservedIn>
        <Sample>
          <Origin>germline</Origin>
          <Species>human</Species>
          <AffectedStatus>not provided</AffectedStatus>
        </Sample>
        <Method>
          <MethodType>curation</MethodType>
        </Method>
        <ObservedData>
          <Attribute Type="Description">See 137960.0003 and Spinazzola et al. (2006).</Attribute>
          <Citation>
            <ID Source="PubMed">16582910</ID>
          </Citation>
        </ObservedData>
      </ObservedIn>
      <MeasureSet Type="Variant">
        <Measure Type="Variation">
          <Name>
            <ElementValue Type="Preferred">MPV17, 26-BP DEL, NT116 </ElementValue>
          </Name>
          <AttributeSet>
            <Attribute Type="NonHGVS">26-BP DEL, NT116</Attribute>
          </AttributeSet>
          <MeasureRelationship Type="variant in gene">
            <Symbol>
              <ElementValue Type="Preferred">MPV17</ElementValue>
            </Symbol>
          </MeasureRelationship>
          <XRef DB="OMIM" Type="Allelic variant" ID="137960.0004"/>
        </Measure>
      </MeasureSet>
      <TraitSet Type="Disease">
        <Trait Type="Disease">
          <Name>
            <ElementValue Type="Preferred">MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)</ElementValue>
          </Name>
        </Trait>
      </TraitSet>
    </ClinVarAssertion>
  </ClinVarSet>
</ClinVarResult-Set>

或

但是这些方法得到的都是标题的全部内容，而我只需要登录号（accession number）。我怎样才能只取出这个编号呢？我尝试了多种方法，也查看了许多示例，但始终找不到答案。

通过以下方式检索rcv_数据：

# Fetch one ClinVar record from the NCBI E-utilities efetch endpoint and parse
# the returned XML with the XML package. The parsed document ends up in
# `rcv_data`.
library(XML)
library(httr)

# User-agent string sent with the request.
UA <- "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

# ClinVarSet ID of the record to download.
id <- 95075

# paste0() concatenates without a separator, so no `sep = ""` is needed.
rcv_search <- paste0(
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=",
  id
)

rcv_doc <- GET(rcv_search, user_agent(UA))

# Stop on an HTTP error instead of trying to parse an error page as XML.
stop_for_status(rcv_doc)

# The sample response declares encoding="UTF-8"; stating it avoids the
# encoding guess in content(). asText = TRUE marks the input as XML text
# rather than a file name.
rcv_data <- xmlParse(
  content(rcv_doc, as = "text", encoding = "UTF-8"),
  asText = TRUE
)

library(XML)
library(httr)
UA <- …（其余代码同上）

这是您想要的结果吗？下面的代码有帮助吗？
# Download the ClinVar record with xml2 and list its <ClinVarAccession> nodes.
library(xml2)

# ClinVarSet ID of the record to download.
id <- 95075

# Build the efetch URL for that ID.
rcv_search <- paste0(
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=",
  id
)

# Read and parse the XML directly from the URL.
rcv_data <- read_xml(rcv_search)

# Select every ClinVarAccession element below the document root.
xml_find_all(rcv_data, ".//ClinVarAccession")

# {xml_nodeset (2)}
# [1] <ClinVarAccession Acc="RCV000017546" Version="1" Type="RCV" DateUpdated="2013-04-08"/>
# [2] <ClinVarAccession Acc="SCV000037818" OrgID="3" Version="1" Type="SCV" DateUpdated="2013-04-08"/>

library(xml2)
id=95075
rcv_search=paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=", id)
#读取数据
rcv_data <- xml2::read_xml(rcv_search)

下面是两组代码（一组使用 dplyr，另一组不使用），它们将返回以“R”开头的登录号（因为您提供的代码会返回两个登录号，一个以“R”开头，另一个以“S”开头，而您指定需要以“R”开头的那个）：
或者，如果不使用 dplyr 库，您可以执行以下操作：
# Pipe-free version. `doc` must be a document parsed with the XML package
# (for example the result of XML::xmlParse()).
library(XML)

## get the referenced nodes
nodes <- getNodeSet(doc, "//ClinVarAccession")
## get accession numbers from nodes; vapply() always returns a character
## vector, and a node without an "Acc" attribute yields NA instead of NULL
accs <- vapply(nodes, xmlGetAttr, character(1), "Acc", default = NA_character_)
## return accession numbers that begin with "R" (grepl() is FALSE for NA)
accs[grepl("^R", accs)]

## 获取引用的节点
nodes <- getNodeSet(doc, "//ClinVarAccession")
## 从节点获取登录号
accs <- sapply(nodes, xmlGetAttr, "Acc")
## 返回以“R”开头的登录号
accs[grep("^[R].*", accs)]

我希望这有帮助
 请注明示例代码中使用的包。另外，读取数据时我得到了两个相关节点；还有：您期望的输出是什么样的？——我的预期输出应该是“RCV00017546”。——我可能把编号写错了，它应该是“RCV000017546”。我已经补充了我使用的包。
# Download the ClinVar record with xml2 and list its <ClinVarAccession> nodes.
library(xml2)

# ClinVarSet ID of the record to download.
id <- 95075

# Build the efetch URL for that ID.
rcv_search <- paste0(
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=",
  id
)

# Read and parse the XML directly from the URL.
rcv_data <- read_xml(rcv_search)

# Select every ClinVarAccession element below the document root.
xml_find_all(rcv_data, ".//ClinVarAccession")

# {xml_nodeset (2)}
# [1] <ClinVarAccession Acc="RCV000017546" Version="1" Type="RCV" DateUpdated="2013-04-08"/>
# [2] <ClinVarAccession Acc="SCV000037818" OrgID="3" Version="1" Type="SCV" DateUpdated="2013-04-08"/>

## return accession numbers that begin with "R"
## `doc` must be a document parsed with the XML package (e.g. via xmlParse())
## XML supplies getNodeSet()/xmlGetAttr(); dplyr is loaded for the %>% pipe
library(XML)
library(dplyr)
## get the referenced nodes
getNodeSet(doc, "//ClinVarAccession") %>%
  ## get accession numbers from nodes; vapply() keeps the result a character
  ## vector even for an empty node set, and a missing "Acc" becomes NA
  vapply(xmlGetAttr, character(1), "Acc", default = NA_character_) %>%
  ## return codes that start with "R"; the dot marks where the piped vector
  ## goes, and value = TRUE returns the matching strings (NA never matches)
  grep("^R", ., value = TRUE)

# Pipe-free version. `doc` must be a document parsed with the XML package
# (for example the result of XML::xmlParse()).
library(XML)

## get the referenced nodes
nodes <- getNodeSet(doc, "//ClinVarAccession")
## get accession numbers from nodes; vapply() always returns a character
## vector, and a node without an "Acc" attribute yields NA instead of NULL
accs <- vapply(nodes, xmlGetAttr, character(1), "Acc", default = NA_character_)
## return accession numbers that begin with "R" (grepl() is FALSE for NA)
accs[grepl("^R", accs)]