C#:在C#代码中解析(大型)XML的最佳方法是什么?
我正在用C#编写一个GIS客户端工具,用于从服务器检索符合基于GML的XML模式(示例见下文)的“要素”(feature)。每次提取最多100000个要素。我估计最大的extract.xml可能会达到150兆字节左右,所以DOM解析器显然不可行。我一直在尝试在“XmlSerializer加XSD.EXE生成的绑定”和“XmlReader加手工构建的对象图”之间做出选择。或者也许还有我尚未考虑到的更好的方法?比如XLINQ,或者其他什么?有人能给我指点一下吗?特别是关于各种方法的内存效率。如果没有,我就只能为这两种方案分别做“原型”,并将它们并排进行性能分析。我在.NET方面还是个新手。任何指导都将不胜感激。谢谢。基思 C#:在C#代码中解析(大型)XML的最佳方法是什么?,c#,xml,xml-serialization,xmlreader,C#,Xml,Xml Serialization,Xmlreader,我正在用C#编写一个GIS客户端工具,用于从服务器检索符合基于GML的XML模式(示例见下文)的“要素”(feature)。每次提取最多100000个要素。我估计最大的extract.xml可能会达到150兆字节左右,所以DOM解析器显然不可行。我一直在尝试在“XmlSerializer加XSD.EXE生成的绑定”和“XmlReader加手工构建的对象图”之间做出选择。或者也许还有我尚未考虑到的更好的方法?比如XLINQ,或者其他什么?有人能给我指点一下吗?特别是关于各种方法的内存效率。如果没有,我就只能为这两种方案分别做“原型”,并将它们并排进行性能分析。我在.NET方面还是个新手。
示例XML——最多100000个要素,每个要素最多234600个坐标
<feature featId="27168306" fType="vegetation" fTypeId="1129" fClass="vegetation" gType="Polygon" ID="0" cLockNr="51598" metadataId="51599" mdFileId="NRM/TIS/VEGETATION/9543_22_v3" dataScale="25000">
<MultiGeometry>
<geometryMember>
<Polygon>
<outerBoundaryIs>
<LinearRing>
<coordinates>153.505004,-27.42196 153.505044,-27.422015 153.503992 .... 172 coordinates omitted to save space ... 153.505004,-27.42196</coordinates>
</LinearRing>
</outerBoundaryIs>
</Polygon>
</geometryMember>
</MultiGeometry>
</feature>
153.505004,-27.42196 153.505044,-27.422015 153.503992 .... 省略172个坐标以节省空间。。。153.505004,-27.42196
使用XmlReader解析大型XML文档。
XmlReader提供对XML数据的快速、仅向前、非缓存的访问。(“仅向前”表示可以从头到尾读取XML文件,但不能在文件中向后移动。)
XmlReader只占用少量内存,相当于使用一个简单的SAX读取器。
// Walk the document node by node; leaving the using block closes the reader.
using (XmlReader reader = XmlReader.Create(@"c:\data\coords.xml"))
{
    while (reader.Read())
    {
        // Handle the current node here (its text is available as reader.Value)
        // ...
    }
}
您可以使用XmlReader来处理最大为2 GB的文件。参考:(原文此处的链接已丢失)您可能正在寻找一个SAX解析器。SAX不需要您将整个文档读入内存——它以增量方式解析文档,并允许您边解析边处理各个元素。我不知道.NET中是否提供了SAX解析器,但您可以看看以下几个开源选项(原文此处的链接列表已丢失):
祝好。基思。更新(截至2009年5月14日):我已经改用一种混合方法……请参阅下面的代码。
using System;
using System.Reflection;
using System.Xml;
using System.Xml.Serialization;
using System.IO;
using System.Collections.Generic;
using nrw_rime_extract.utils;
using nrw_rime_extract.xml.generated_bindings;
namespace nrw_rime_extract.xml
{
/// <summary>
/// Contract for loading a RIME extract XML file into a <c>rimeType</c> graph.
/// </summary>
internal interface ExtractXmlReader
{
    /// <summary>Reads the extract identified by <paramref name="xmlFilename"/>.</summary>
    /// <param name="xmlFilename">Location of the extract XML file to read.</param>
    /// <returns>The <c>rimeType</c> built from that file.</returns>
    rimeType read(string xmlFilename);
}
/// <summary>
/// Reads a complete RIME extract XML document with a single
/// <see cref="XmlSerializer"/> pass, using the bindings defined by
/// $/Release 2.7/Documentation/Technical/SCHEMA and DTDs/nrw-rime-extract.xsd
/// </summary>
/// <remarks>
/// Unknown nodes and attributes met during deserialization are written to the
/// log instead of aborting the read.
/// </remarks>
internal class ExtractXmlReader_XmlSerializerImpl : ExtractXmlReader
{
    private readonly Log log = Log.getInstance();

    /// <summary>Deserializes the extract stored in the given file.</summary>
    /// <param name="xmlFilename">Path of the extract XML file.</param>
    /// <returns>The deserialized <c>rimeType</c>.</returns>
    public rimeType read(string xmlFilename)
    {
        log.write(
            string.Format(
                "DEBUG: ExtractXmlReader_XmlSerializerImpl.read({0})",
                xmlFilename));

        // Ask for read access only. FileStream(path, FileMode.Open) on its own
        // requests read/write access, which fails for read-only files even
        // though this method never writes to the stream.
        using (Stream stream = new FileStream(
            xmlFilename, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            return read(stream);
        }
    }

    /// <summary>Deserializes an extract from an already opened stream.</summary>
    /// <param name="xmlInputStream">Stream positioned at the start of the XML.</param>
    /// <returns>The deserialized <c>rimeType</c>.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="xmlInputStream"/> is null.
    /// </exception>
    internal rimeType read(Stream xmlInputStream)
    {
        if (xmlInputStream == null)
        {
            throw new ArgumentNullException("xmlInputStream");
        }

        // The serializer is bound to the root binding type of the extract.
        XmlSerializer serializer = new XmlSerializer(typeof(rimeType));
        serializer.UnknownNode += new XmlNodeEventHandler(handleUnknownNode);
        serializer.UnknownAttribute +=
            new XmlAttributeEventHandler(handleUnknownAttribute);

        return (rimeType)serializer.Deserialize(xmlInputStream);
    }

    /// <summary>Logs a node the serializer could not map to the bindings.</summary>
    protected void handleUnknownNode(object sender, XmlNodeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Node at line {0} position {1} : {2}\t{3}",
                e.LineNumber, e.LinePosition, e.Name, e.Text));
    }

    /// <summary>Logs an attribute the serializer could not map to the bindings.</summary>
    protected void handleUnknownAttribute(object sender, XmlAttributeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Attribute at line {0} position {1} : {2}='{3}'",
                e.LineNumber, e.LinePosition, e.Attr.Name, e.Attr.Value));
    }
}
/// <summary>
/// ExtractXmlReader implementation that provides bindings to the extract.xml
/// returned by the RIME server; as defined by:
/// $/Release X/Documentation/Technical/SCHEMA and
/// DTDs/nrw-rime-extract.xsd
/// </summary>
/// <remarks>
/// Hybrid approach: an <see cref="XmlReader"/> walks the document, and each
/// "featureClass" / "feature" element is handed to an
/// <see cref="XmlSerializer"/> on its own, so the document is never
/// deserialized in one piece.
/// </remarks>
internal class ExtractXmlReader_XmlTextReaderXmlSerializerHybridImpl :
    ExtractXmlReader
{
    private readonly Log log = Log.getInstance();

    /// <summary>Reads the extract stored in the given file.</summary>
    /// <param name="xmlFilename">Location of the extract XML file.</param>
    /// <returns>A <c>rimeType</c> holding the feature classes and features found.</returns>
    public rimeType read(string xmlFilename)
    {
        log.write(
            string.Format(
                "DEBUG: ExtractXmlReader_XmlTextReaderXmlSerializerHybridImpl." +
                "read({0})",
                xmlFilename));
        using (XmlReader reader = XmlReader.Create(xmlFilename))
        {
            return read(reader);
        }
    }

    /// <summary>
    /// Walks <paramref name="reader"/> to its end, collecting every
    /// "featureClass" and "feature" element.
    /// </summary>
    /// <param name="reader">Reader over the extract; it is consumed up to EOF.</param>
    /// <returns>A <c>rimeType</c> holding the feature classes and features found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public rimeType read(XmlReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        rimeType result = new rimeType();

        // One serializer per "doclet" type, built once and reused for every element.
        XmlSerializer featureClassSerializer = newSerializer(typeof(featureClassType));
        XmlSerializer featureSerializer = newSerializer(typeof(featureType));

        List<featureClassType> featureClasses = new List<featureClassType>();
        List<featureType> features = new List<featureType>();

        while (!reader.EOF)
        {
            if (reader.MoveToContent() != XmlNodeType.Element)
            {
                reader.Read(); // step over end tags and other non-element nodes
                continue;
            }

            if (reader.Name.Equals("featureClass"))
            {
                featureClasses.Add(
                    (featureClassType)readDoclet(reader, featureClassSerializer));
                continue; // readDoclet already advanced the reader
            }

            if (reader.Name.Equals("feature"))
            {
                features.Add((featureType)readDoclet(reader, featureSerializer));
                continue; // readDoclet already advanced the reader
            }

            // Any other element is only logged; Read() then moves on to the next
            // node, so elements nested inside it are still visited.
            log.write(
                "WARNING: unknown element '" + reader.Name +
                "' was skipped during parsing.");
            reader.Read();
        }

        result.featureClasses = featureClasses.ToArray();
        result.features = features.ToArray();
        return result;
    }

    /// <summary>
    /// Deserializes the element <paramref name="reader"/> is positioned on and
    /// leaves the reader on the node that follows it.
    /// </summary>
    private static object readDoclet(XmlReader reader, XmlSerializer serializer)
    {
        object doclet;

        // Deserialize straight from a subtree reader rather than copying the
        // element into a string with ReadOuterXml() and parsing it again; a
        // single feature can carry a very large coordinate list.
        using (XmlReader subtree = reader.ReadSubtree())
        {
            doclet = serializer.Deserialize(subtree);
        }

        // Closing the subtree reader leaves the parent on the element's end tag
        // (or on the element itself when it is empty); step past it.
        reader.Read();
        return doclet;
    }

    /// <summary>
    /// Creates a serializer for <paramref name="elementType"/> with the
    /// unknown-node and unknown-attribute logging handlers attached.
    /// </summary>
    private XmlSerializer newSerializer(Type elementType)
    {
        XmlSerializer serializer = new XmlSerializer(elementType);
        serializer.UnknownNode += new XmlNodeEventHandler(handleUnknownNode);
        serializer.UnknownAttribute +=
            new XmlAttributeEventHandler(handleUnknownAttribute);
        return serializer;
    }

    /// <summary>Logs a node the serializer could not map to the bindings.</summary>
    protected void handleUnknownNode(object sender, XmlNodeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Node at line {0} position {1} : {2}\t{3}",
                e.LineNumber, e.LinePosition, e.Name, e.Text));
    }

    /// <summary>Logs an attribute the serializer could not map to the bindings.</summary>
    protected void handleUnknownAttribute(object sender, XmlAttributeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Attribute at line {0} position {1} : {2}='{3}'",
                e.LineNumber, e.LinePosition, e.Attr.Name, e.Attr.Value));
    }
}
}
此版本兼具两者的大部分优点:
* XmlReader/XmlTextReader(内存效率 --> 速度);以及
* XmlSerializer(代码生成 --> 开发效率和灵活性)。它使用XmlTextReader遍历文档,并创建“doclet”(文档片段),再使用XmlSerializer和由XSD.EXE生成的“XML绑定”类对其进行反序列化。我认为这个方法是普遍适用的,而且速度很快……我解析一个包含56000个GML要素的201MB XML文档只需7秒钟……而这个应用程序旧的VB6实现解析大型提取文件需要几分钟(甚至几个小时)……所以看来我可以放心使用了。再次非常感谢各位论坛网友贡献的宝贵时间。我真的很感激。祝好。基思
using System;
using System.Reflection;
using System.Xml;
using System.Xml.Serialization;
using System.IO;
using System.Collections.Generic;
using nrw_rime_extract.utils;
using nrw_rime_extract.xml.generated_bindings;
namespace nrw_rime_extract.xml
{
/// <summary>
/// Contract for loading a RIME extract XML file into a <c>rimeType</c> graph.
/// </summary>
internal interface ExtractXmlReader
{
    /// <summary>Reads the extract identified by <paramref name="xmlFilename"/>.</summary>
    /// <param name="xmlFilename">Location of the extract XML file to read.</param>
    /// <returns>The <c>rimeType</c> built from that file.</returns>
    rimeType read(string xmlFilename);
}
/// <summary>
/// Reads a complete RIME extract XML document with a single
/// <see cref="XmlSerializer"/> pass, using the bindings defined by
/// $/Release 2.7/Documentation/Technical/SCHEMA and DTDs/nrw-rime-extract.xsd
/// </summary>
/// <remarks>
/// Unknown nodes and attributes met during deserialization are written to the
/// log instead of aborting the read.
/// </remarks>
internal class ExtractXmlReader_XmlSerializerImpl : ExtractXmlReader
{
    private readonly Log log = Log.getInstance();

    /// <summary>Deserializes the extract stored in the given file.</summary>
    /// <param name="xmlFilename">Path of the extract XML file.</param>
    /// <returns>The deserialized <c>rimeType</c>.</returns>
    public rimeType read(string xmlFilename)
    {
        log.write(
            string.Format(
                "DEBUG: ExtractXmlReader_XmlSerializerImpl.read({0})",
                xmlFilename));

        // Ask for read access only. FileStream(path, FileMode.Open) on its own
        // requests read/write access, which fails for read-only files even
        // though this method never writes to the stream.
        using (Stream stream = new FileStream(
            xmlFilename, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            return read(stream);
        }
    }

    /// <summary>Deserializes an extract from an already opened stream.</summary>
    /// <param name="xmlInputStream">Stream positioned at the start of the XML.</param>
    /// <returns>The deserialized <c>rimeType</c>.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="xmlInputStream"/> is null.
    /// </exception>
    internal rimeType read(Stream xmlInputStream)
    {
        if (xmlInputStream == null)
        {
            throw new ArgumentNullException("xmlInputStream");
        }

        // The serializer is bound to the root binding type of the extract.
        XmlSerializer serializer = new XmlSerializer(typeof(rimeType));
        serializer.UnknownNode += new XmlNodeEventHandler(handleUnknownNode);
        serializer.UnknownAttribute +=
            new XmlAttributeEventHandler(handleUnknownAttribute);

        return (rimeType)serializer.Deserialize(xmlInputStream);
    }

    /// <summary>Logs a node the serializer could not map to the bindings.</summary>
    protected void handleUnknownNode(object sender, XmlNodeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Node at line {0} position {1} : {2}\t{3}",
                e.LineNumber, e.LinePosition, e.Name, e.Text));
    }

    /// <summary>Logs an attribute the serializer could not map to the bindings.</summary>
    protected void handleUnknownAttribute(object sender, XmlAttributeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Attribute at line {0} position {1} : {2}='{3}'",
                e.LineNumber, e.LinePosition, e.Attr.Name, e.Attr.Value));
    }
}
/// <summary>
/// ExtractXmlReader implementation that provides bindings to the extract.xml
/// returned by the RIME server; as defined by:
/// $/Release X/Documentation/Technical/SCHEMA and
/// DTDs/nrw-rime-extract.xsd
/// </summary>
/// <remarks>
/// Hybrid approach: an <see cref="XmlReader"/> walks the document, and each
/// "featureClass" / "feature" element is handed to an
/// <see cref="XmlSerializer"/> on its own, so the document is never
/// deserialized in one piece.
/// </remarks>
internal class ExtractXmlReader_XmlTextReaderXmlSerializerHybridImpl :
    ExtractXmlReader
{
    private readonly Log log = Log.getInstance();

    /// <summary>Reads the extract stored in the given file.</summary>
    /// <param name="xmlFilename">Location of the extract XML file.</param>
    /// <returns>A <c>rimeType</c> holding the feature classes and features found.</returns>
    public rimeType read(string xmlFilename)
    {
        log.write(
            string.Format(
                "DEBUG: ExtractXmlReader_XmlTextReaderXmlSerializerHybridImpl." +
                "read({0})",
                xmlFilename));
        using (XmlReader reader = XmlReader.Create(xmlFilename))
        {
            return read(reader);
        }
    }

    /// <summary>
    /// Walks <paramref name="reader"/> to its end, collecting every
    /// "featureClass" and "feature" element.
    /// </summary>
    /// <param name="reader">Reader over the extract; it is consumed up to EOF.</param>
    /// <returns>A <c>rimeType</c> holding the feature classes and features found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public rimeType read(XmlReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        rimeType result = new rimeType();

        // One serializer per "doclet" type, built once and reused for every element.
        XmlSerializer featureClassSerializer = newSerializer(typeof(featureClassType));
        XmlSerializer featureSerializer = newSerializer(typeof(featureType));

        List<featureClassType> featureClasses = new List<featureClassType>();
        List<featureType> features = new List<featureType>();

        while (!reader.EOF)
        {
            if (reader.MoveToContent() != XmlNodeType.Element)
            {
                reader.Read(); // step over end tags and other non-element nodes
                continue;
            }

            if (reader.Name.Equals("featureClass"))
            {
                featureClasses.Add(
                    (featureClassType)readDoclet(reader, featureClassSerializer));
                continue; // readDoclet already advanced the reader
            }

            if (reader.Name.Equals("feature"))
            {
                features.Add((featureType)readDoclet(reader, featureSerializer));
                continue; // readDoclet already advanced the reader
            }

            // Any other element is only logged; Read() then moves on to the next
            // node, so elements nested inside it are still visited.
            log.write(
                "WARNING: unknown element '" + reader.Name +
                "' was skipped during parsing.");
            reader.Read();
        }

        result.featureClasses = featureClasses.ToArray();
        result.features = features.ToArray();
        return result;
    }

    /// <summary>
    /// Deserializes the element <paramref name="reader"/> is positioned on and
    /// leaves the reader on the node that follows it.
    /// </summary>
    private static object readDoclet(XmlReader reader, XmlSerializer serializer)
    {
        object doclet;

        // Deserialize straight from a subtree reader rather than copying the
        // element into a string with ReadOuterXml() and parsing it again; a
        // single feature can carry a very large coordinate list.
        using (XmlReader subtree = reader.ReadSubtree())
        {
            doclet = serializer.Deserialize(subtree);
        }

        // Closing the subtree reader leaves the parent on the element's end tag
        // (or on the element itself when it is empty); step past it.
        reader.Read();
        return doclet;
    }

    /// <summary>
    /// Creates a serializer for <paramref name="elementType"/> with the
    /// unknown-node and unknown-attribute logging handlers attached.
    /// </summary>
    private XmlSerializer newSerializer(Type elementType)
    {
        XmlSerializer serializer = new XmlSerializer(elementType);
        serializer.UnknownNode += new XmlNodeEventHandler(handleUnknownNode);
        serializer.UnknownAttribute +=
            new XmlAttributeEventHandler(handleUnknownAttribute);
        return serializer;
    }

    /// <summary>Logs a node the serializer could not map to the bindings.</summary>
    protected void handleUnknownNode(object sender, XmlNodeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Node at line {0} position {1} : {2}\t{3}",
                e.LineNumber, e.LinePosition, e.Name, e.Text));
    }

    /// <summary>Logs an attribute the serializer could not map to the bindings.</summary>
    protected void handleUnknownAttribute(object sender, XmlAttributeEventArgs e)
    {
        log.write(
            string.Format(
                "XML_ERROR: Unknown Attribute at line {0} position {1} : {2}='{3}'",
                e.LineNumber, e.LinePosition, e.Attr.Name, e.Attr.Value));
    }
}
}
using System;
using System.Reflection;
using System.Xml;
using System.Xml.Serialization;
using System.IO;
using System.Collections.Generic;
using nrw_rime_extract.utils;
using nrw_rime_extract.xml.generated_bindings;
namespace nrw_rime_extract.xml
{
internal interface ExtractXmlReader
{
rimeType read(string xmlFilename);
}
/// <summary>
/// RimeExtractXml provides bindings to the RIME Extract XML as defined by
/// $/Release 2.7/Documentation/Technical/SCHEMA and DTDs/nrw-rime-extract.xsd
/// </summary>
internal class ExtractXmlReader_XmlSerializerImpl : ExtractXmlReader
{
private Log log = Log.getInstance();
public rimeType read(string xmlFilename)
{
log.write(
string.Format(
"DEBUG: ExtractXmlReader_XmlSerializerImpl.read({0})",
xmlFilename));
using (Stream stream = new FileStream(xmlFilename, FileMode.Open))
{
return read(stream);
}
}
internal rimeType read(Stream xmlInputStream)
{
// create an instance of the XmlSerializer class,
// specifying the type of object to be deserialized.
XmlSerializer serializer = new XmlSerializer(typeof(rimeType));
serializer.UnknownNode += new XmlNodeEventHandler(handleUnknownNode);
serializer.UnknownAttribute +=
new XmlAttributeEventHandler(handleUnknownAttribute);
// use the Deserialize method to restore the object's state
// with data from the XML document.
return (rimeType)serializer.Deserialize(xmlInputStream);
}
protected void handleUnknownNode(object sender, XmlNodeEventArgs e)
{
log.write(
string.Format(
"XML_ERROR: Unknown Node at line {0} position {1} : {2}\t{3}",
e.LineNumber, e.LinePosition, e.Name, e.Text));
}
protected void handleUnknownAttribute(object sender, XmlAttributeEventArgs e)
{
log.write(
string.Format(
"XML_ERROR: Unknown Attribute at line {0} position {1} : {2}='{3}'",
e.LineNumber, e.LinePosition, e.Attr.Name, e.Attr.Value));
}
}
/// <summary>
/// xtractXmlReader provides bindings to the extract.xml
/// returned by the RIME server; as defined by:
/// $/Release X/Documentation/Technical/SCHEMA and
/// DTDs/nrw-rime-extract.xsd
/// </summary>
// Open an XmlReader over this.source.Url; the using block disposes it on exit.
using (var xml_reader = XmlReader.Create (this.source.Url))
{
// Fail fast when the expected "Root" element cannot be reached.
// NOTE(review): SkipToElement is not shown in this excerpt; presumably it
// advances the reader to the named element and returns false when no such
// element is found - confirm against its definition.
if (!SkipToElement (xml_reader, "Root"))
throw new InvalidOperationException ("XML element \"Root\" was not found.");
// Then require "Users"; the error message implies it is expected under Root.
if (!SkipToElement (xml_reader, "Users"))
throw new InvalidOperationException ("XML element \"Root/Users\" was not found.");
// The rest of the example is elided in the original.
...
}