使用R解析xml内容以提取标题信息
我有一个xml数据使用R解析xml内容以提取标题信息,r,xml,R,Xml,我有一个xml数据 <?xml version="1.0" encoding="UTF-8"?> <ClinVarResult-Set> <ClinVarSet ID="95075"> <RecordStatus>not current</RecordStatus> <Title>MPV17, 26-BP DEL, NT116 AND Navajo neurohepatopathy</Title&g
<?xml version="1.0" encoding="UTF-8"?>
<ClinVarResult-Set>
<ClinVarSet ID="95075">
<RecordStatus>not current</RecordStatus>
<Title>MPV17, 26-BP DEL, NT116 AND Navajo neurohepatopathy</Title>
<ReferenceClinVarAssertion DateCreated="2012-08-13" DateLastUpdated="2013-04-03" ID="75049">
<ClinVarAccession Acc="RCV000017546" Version="1" Type="RCV" DateUpdated="2013-04-08"/>
<RecordStatus>current</RecordStatus>
<ClinicalSignificance DateLastEvaluated="2011-11-17">
<ReviewStatus>classified by single submitter</ReviewStatus>
<Description>pathogenic</Description>
</ClinicalSignificance>
<Assertion Type="variation to disease"/>
<ExternalID DB="NCBI"/>
<ObservedIn>
<Sample>
<Origin>germline</Origin>
<Species TaxonomyId="9606">human</Species>
<AffectedStatus>not provided</AffectedStatus>
</Sample>
<Method>
<MethodType>curation</MethodType>
</Method>
<ObservedData ID="208542">
<Attribute Type="Description">See 137960.0003 and Spinazzola et al. (2006).</Attribute>
<Citation Type="general">
<ID Source="PubMed">16582910</ID>
</Citation>
</ObservedData>
</ObservedIn>
<MeasureSet Type="Variant" ID="16163">
<Measure Type="Deletion" ID="31202">
<Name>
<ElementValue Type="Alternate">MPV17, 26-BP DEL, NT116</ElementValue>
<XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
</Name>
<AttributeSet>
<Attribute Type="nucleotide change">26-BP DEL, NT116</Attribute>
<XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
</AttributeSet>
<MeasureRelationship Type="variant in gene">
<Name>
<ElementValue Type="Preferred">MpV17 mitochondrial inner membrane protein</ElementValue>
</Name>
<Symbol>
<ElementValue Type="Preferred">MPV17</ElementValue>
</Symbol>
<XRef ID="4358" DB="Gene"/>
<XRef ID="137960" DB="OMIM" Type="MIM"/>
</MeasureRelationship>
<XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
</Measure>
</MeasureSet>
<TraitSet Type="Disease" ID="5245">
<Trait ID="3439" Type="Disease">
<Name>
<ElementValue Type="Preferred">Navajo neurohepatopathy</ElementValue>
<XRef ID="3972" DB="Office of Rare Diseases"/>
</Name>
<Name>
<ElementValue Type="Alternate">Navajo neuropathy</ElementValue>
</Name>
<Name>
<ElementValue Type="Alternate">MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)</ElementValue>
<XRef Type="MIM" ID="256810" DB="OMIM"/>
<XRef Type="Allelic variant" ID="137960.0002" DB="OMIM"/>
<XRef Type="Allelic variant" ID="137960.0003" DB="OMIM"/>
<XRef Type="Allelic variant" ID="137960.0005" DB="OMIM"/>
<XRef Type="Allelic variant" ID="137960.0004" DB="OMIM"/>
<XRef Type="Allelic variant" ID="137960.0001" DB="OMIM"/>
<XRef Type="Allelic variant" ID="137960.0006" DB="OMIM"/>
<XRef Type="Allelic variant" ID="137960.0007" DB="OMIM"/>
</Name>
<Name>
<ElementValue Type="Alternate">MPV17- Related Hepatocerebral Mitochondrial DNA Depletion Syndrome</ElementValue>
<XRef ID="NBK92947" DB="GeneReviews"/>
</Name>
<Symbol>
<ElementValue Type="Preferred">MTDPS6</ElementValue>
<XRef Type="MIM" ID="256810" DB="OMIM"/>
</Symbol>
<Symbol>
<ElementValue Type="Alternate">NN</ElementValue>
<XRef Type="MIM" ID="256810" DB="OMIM"/>
<XRef ID="3972" DB="Office of Rare Diseases"/>
</Symbol>
<Symbol>
<ElementValue Type="Alternate">NNH</ElementValue>
<XRef Type="MIM" ID="256810" DB="OMIM"/>
</Symbol>
<AttributeSet>
<Attribute Type="age of onset">Childhood</Attribute>
</AttributeSet>
<Citation Type="review" Abbrev="GeneReviews">
<ID Source="PubMed">22593919</ID>
</Citation>
<XRef ID="255229" DB="Orphanet"/>
<XRef ID="C1850406" DB="MedGen"/>
<XRef ID="NBK92947" DB="GeneReviews"/>
<XRef Type="MIM" ID="256810" DB="OMIM"/>
</Trait>
</TraitSet>
</ReferenceClinVarAssertion>
<ClinVarAssertion ID="37818">
<ClinVarSubmissionID localKey="137960.0004_MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)" title="MPV17, 26-BP DEL, NT116 _MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)" submitterDate="2011-11-17" submitter="OMIM"/>
<ClinVarAccession Acc="SCV000037818" OrgID="3" Version="1" Type="SCV" DateUpdated="2013-04-08"/>
<RecordStatus>current</RecordStatus>
<ClinicalSignificance DateLastEvaluated="2011-11-17">
<Description>pathogenic</Description>
</ClinicalSignificance>
<Assertion Type="variation to disease"/>
<ObservedIn>
<Sample>
<Origin>germline</Origin>
<Species>human</Species>
<AffectedStatus>not provided</AffectedStatus>
</Sample>
<Method>
<MethodType>curation</MethodType>
</Method>
<ObservedData>
<Attribute Type="Description">See 137960.0003 and Spinazzola et al. (2006).</Attribute>
<Citation>
<ID Source="PubMed">16582910</ID>
</Citation>
</ObservedData>
</ObservedIn>
<MeasureSet Type="Variant">
<Measure Type="Variation">
<Name>
<ElementValue Type="Preferred">MPV17, 26-BP DEL, NT116 </ElementValue>
</Name>
<AttributeSet>
<Attribute Type="NonHGVS">26-BP DEL, NT116</Attribute>
</AttributeSet>
<MeasureRelationship Type="variant in gene">
<Symbol>
<ElementValue Type="Preferred">MPV17</ElementValue>
</Symbol>
</MeasureRelationship>
<XRef DB="OMIM" Type="Allelic variant" ID="137960.0004"/>
</Measure>
</MeasureSet>
<TraitSet Type="Disease">
<Trait Type="Disease">
<Name>
<ElementValue Type="Preferred">MITOCHONDRIAL DNA DEPLETION SYNDROME 6 (HEPATOCEREBRAL TYPE)</ElementValue>
</Name>
</Trait>
</TraitSet>
</ClinVarAssertion>
</ClinVarSet>
</ClinVarResult-Set>
或
但是这些方法得到的都是标题的全部内容,而我只需要登录号(accession number)。我怎样才能只得到这个编号呢?我尝试了多种方法,也查看了许多例子,但就是找不到答案。
通过以下方式检索rcv_数据:
# Download one ClinVar record set from NCBI E-utilities and parse it with the
# XML package. The parsed document is left in `rcv_data`.
library(XML)
library(httr)

# User-agent string sent along with the request.
UA <- "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

# ClinVar set ID to fetch.
id <- 95075
rcv_search <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=", id)
rcv_doc <- GET(rcv_search, user_agent(UA))

# Fail loudly on an HTTP error instead of handing an error page to the parser.
stop_for_status(rcv_doc)

# Decode the body explicitly (the sample response declares UTF-8 in its XML
# prolog) and tell xmlParse() the input is XML text, not a file name or URL.
rcv_data <- xmlParse(content(rcv_doc, "text", encoding = "UTF-8"), asText = TRUE)
库(XML)
图书馆(httr)
UA这是我的目标。。。这有用吗
# Read the ClinVar record set straight from its URL with xml2 and list every
# <ClinVarAccession> element found in it.
library(xml2)

# ClinVar set ID to fetch.
id <- 95075
rcv_search <- paste0(
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=",
  id
)

# Download and parse the remote XML document.
rcv_data <- read_xml(rcv_search)

# All ClinVarAccession nodes below the document root.
xml_find_all(rcv_data, ".//ClinVarAccession")
# Output reported in the original post:
# {xml_nodeset (2)}
# [1] <ClinVarAccession Acc="RCV000017546" Version="1" Type="RCV" DateUpdated="2013-04-08"/>
# [2] <ClinVarAccession Acc="SCV000037818" OrgID="3" Version="1" Type="SCV" DateUpdated="2013-04-08"/>
库(xml2)
id=95075
rcv_search=paste0(“https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=“,id)
#读取数据
rcv_data下面是两组代码(一组使用dplyr,另一组不使用),它们将返回以“R”开头的登录号(因为您提供的代码返回两个代码,一个以“R”开头,另一个以“S”开头,并且您指定要以“R”开头的代码):
或者,如果不使用 dplyr 这个 R 库,您可以执行以下操作:
## Pipe-free version: each step is a separate statement, so no `%>%` is needed.
## `doc` must already hold the document parsed with the XML package
## (presumably the result of xmlParse() -- confirm against the calling code).

## get the referenced nodes
nodes <- getNodeSet(doc, "//ClinVarAccession")
## get accession numbers from nodes; vapply() always yields a character
## vector, with NA for a node that lacks the "Acc" attribute
accs <- vapply(nodes, xmlGetAttr, character(1), name = "Acc", default = NA_character_)
## return accession numbers that begin with "R"
accs[grep("^[R].*", accs)]
##获取引用的节点
节点%
##从节点获取登录号
accs%
##返回以“R”开头的登录号
accs[grep(“^[R].*”,accs)]
我希望这有帮助。请包括示例代码中使用的包。另外,在读取数据时,我会得到两个相关节点。还有:您的输出应该是什么样的?我的预期输出应该是“RCV00017546”。我可能弄错了数字,它应该是“RCV000017546”。我已经更新了我使用的包。
# Fetch the ClinVar record set by URL with xml2, then show each
# <ClinVarAccession> node in the parsed document.
library(xml2)

# ClinVar set ID to fetch.
id <- 95075
rcv_search <- paste0(
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&rettype=clinvarset&id=",
  id
)

# Parse the XML served at that URL.
rcv_data <- read_xml(rcv_search)

# Locate the ClinVarAccession nodes anywhere below the root.
xml_find_all(rcv_data, ".//ClinVarAccession")
# Output reported in the original post:
# {xml_nodeset (2)}
# [1] <ClinVarAccession Acc="RCV000017546" Version="1" Type="RCV" DateUpdated="2013-04-08"/>
# [2] <ClinVarAccession Acc="SCV000037818" OrgID="3" Version="1" Type="SCV" DateUpdated="2013-04-08"/>
## Return the accession numbers that begin with "R", written as one pipeline.
## dplyr is attached here only to make the %>% pipe available.
library(dplyr)

doc %>%
  ## collect every ClinVarAccession node in the document
  getNodeSet("//ClinVarAccession") %>%
  ## pull the "Acc" attribute out of each node
  sapply(xmlGetAttr, "Acc") %>%
  ## keep only the codes that start with "R"
  (function(accs) accs[grep("^[R].*", accs)])
## Version without pipes: the stray `%>%` after each assignment is removed so
## that the three steps are valid stand-alone statements.
## `doc` must already hold the document parsed with the XML package
## (presumably the result of xmlParse() -- confirm against the calling code).

## get the referenced nodes
nodes <- getNodeSet(doc, "//ClinVarAccession")
## get accession numbers from nodes; vapply() keeps the result a character
## vector, with NA for a node that lacks the "Acc" attribute
accs <- vapply(nodes, xmlGetAttr, character(1), name = "Acc", default = NA_character_)
## return accession numbers that begin with "R"
accs[grep("^[R].*", accs)]