在R中解析10GB的XML文件
我有一个10GB的XML文件,需要在R中解析它(标签:r、xml)。XML的示例结构如下:
<?xml version="1.0" encoding="UTF-8"?>
<proteinAtlas xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://v18.proteinatlas.org/download/proteinatlas.xsd" schemaVersion="2.5">
<entry version="18" url="http://v18.proteinatlas.org/ENSG00000000003">
<name>TSPAN6</name>
<synonym>T245</synonym>
<synonym>TM4SF6</synonym>
<synonym>TSPAN-6</synonym>
<identifier id="ENSG00000000003" db="Ensembl" version="88.38">
<xref id="O43657" db="Uniprot/SWISSPROT"/>
</identifier>
<proteinClasses>
<proteinClass source="MDM" id="Ma" parent_id="" name="Predicted membrane proteins"/>
<proteinClass source="MDM" id="Md" parent_id="" name="Membrane proteins predicted by MDM"/>
<proteinClass source="MEMSAT3" id="Me" parent_id="" name="MEMSAT3 predicted membrane proteins"/>
<proteinClass source="MEMSAT-SVM" id="Mf" parent_id="" name="MEMSAT-SVM predicted membrane proteins"/>
<proteinClass source="Phobius" id="Mg" parent_id="" name="Phobius predicted membrane proteins"/>
<proteinClass source="SCAMPI" id="Mh" parent_id="" name="SCAMPI predicted membrane proteins"/>
<proteinClass source="SPOCTOPUS" id="Mi" parent_id="" name="SPOCTOPUS predicted membrane proteins"/>
<proteinClass source="THUMBUP" id="Mj" parent_id="" name="THUMBUP predicted membrane proteins"/>
<proteinClass source="TMHMM" id="Mk" parent_id="" name="TMHMM predicted membrane proteins"/>
<proteinClass source="MDM" id="M1" parent_id="" name="1TM proteins predicted by MDM"/>
<proteinClass source="MDM" id="M4" parent_id="" name="4TM proteins predicted by MDM"/>
<proteinClass source="SignalP" id="Sb" parent_id="Se" name="SignalP predicted secreted proteins"/>
<proteinClass source="HPA" id="Za" parent_id="" name="Predicted intracellular proteins"/>
<proteinClass source="UniProt" id="Ua" parent_id="" name="UniProt - Evidence at protein level"/>
<proteinClass source="Kim et al 2014" id="Ea" parent_id="" name="Protein evidence (Kim et al 2014)"/>
<proteinClass source="Ezkurdia et al 2014" id="Eb" parent_id="" name="Protein evidence (Ezkurdia et al 2014)"/>
</proteinClasses>
<proteinEvidence evidence="Evidence at protein level">
<evidence source="HPA" evidence="Evidence at transcript level"/>
<evidence source="MS" evidence="Evidence at protein level"/>
<evidence source="UniProt" evidence="Evidence at protein level"/>
</proteinEvidence>
<tissueExpression source="HPA" technology="IHC" assayType="tissue">
<summary type="tissue"><![CDATA[Cytoplasmic and membranous expression in most tissues.]]></summary>
<verification type="reliability" description="Antibody staining mainly consistent with RNA expression data. Pending external verification. ">approved</verification>
<image imageType="selected">
</tissueExpression>
</entry>
TSPAN6
T245
TM4SF6
TSPAN-6
不确定它是否更节省内存,但尝试一下也无妨:
# Load the XML parser and the table package.
library(xml2)
library(data.table)

# Parse the XML file into a document object.
doc <- read_xml(x = "./test.xml")

# Select every <entry> element in the document.
entry.nodes <- xml_find_all(x = doc, xpath = "//entry")

# Optionally remove 'doc' at this point to try to reduce memory use:
# rm(doc)

# Assemble one row per entry. When xml_find_first() finds no matching node
# for an entry, the corresponding value in the table comes out as NA, so
# missing nodes/attributes are tolerated.
data.table(
  name = xml_text(
    x = xml_find_first(x = entry.nodes, xpath = ".//name")
  ),
  eviden = xml_attr(
    x = xml_find_first(x = entry.nodes, xpath = ".//proteinEvidence"),
    attr = "evidence"
  )
)
# Expected output for the sample file:
#      name                    eviden
# 1: TSPAN6 Evidence at protein level
library(xml2)
library(data.table)
#首先,解析xml文档
doc <- read_xml( "./test.xml" )
不确定它是否更节省内存,但尝试一下也无妨:
# Load the XML parser and the table package.
library(xml2)
library(data.table)

# Parse the XML file into a document object.
doc <- read_xml(x = "./test.xml")

# Select every <entry> element in the document.
entry.nodes <- xml_find_all(x = doc, xpath = "//entry")

# Optionally remove 'doc' at this point to try to reduce memory use:
# rm(doc)

# Assemble one row per entry. When xml_find_first() finds no matching node
# for an entry, the corresponding value in the table comes out as NA, so
# missing nodes/attributes are tolerated.
data.table(
  name = xml_text(
    x = xml_find_first(x = entry.nodes, xpath = ".//name")
  ),
  eviden = xml_attr(
    x = xml_find_first(x = entry.nodes, xpath = ".//proteinEvidence"),
    attr = "evidence"
  )
)
# Expected output for the sample file:
#      name                    eviden
# 1: TSPAN6 Evidence at protein level
library(xml2)
library(data.table)
#首先,解析xml文档
doc <- read_xml( "./test.xml" )
我在网上发现,R可能不是解析如此大的XML文件的最佳语言。Python确实有这样的库,它们工作得很好。我试过其中一个,虽然没有深入研究,但似乎效果不错。