对大型XML文件使用Python Iterparse_Python_Xml_Lxml_Large Files_Elementtree

对大型XML文件使用Python Iterparse

python xml

对大型XML文件使用Python Iterparse,python,xml,lxml,large-files,elementtree,Python,Xml,Lxml,Large Files,Elementtree,我需要用Python编写一个解析器，它可以在没有太多内存（只有2GB）的计算机上处理一些非常大的文件（>2GB）。我想在lxml中使用iterparse来实现它我的文件的格式如下： <item> <title>Item 1</title> <desc>Description 1</desc> </item> <item> <title>Item 2</title> <

我需要用Python编写一个解析器，它可以在没有太多内存（只有2GB）的计算机上处理一些非常大的文件（>2GB）。我想在lxml中使用iterparse来实现它

我的文件的格式如下：

<item>
  <title>Item 1</title>
  <desc>Description 1</desc>
</item>
<item>
  <title>Item 2</title>
  <desc>Description 2</desc>
</item>

但不幸的是，这个解决方案仍在消耗大量内存。我认为问题在于，在处理完每一个 <item> 元素后，我需要做一些事情来清理空的子元素。有谁能给我一些建议，告诉我在处理完数据后应该怎么做才能正确地清理吗？

为什么不使用“回调”方法呢？

试试 Liza Daly 的 fast_iter。在处理元素

elem

之后，它调用

elem.clear（）

来删除子元素，并删除前面的同级元素

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element from ``context``, pruning the tree as it goes.

    Modified version of Liza Daly's fast_iter:
    http://lxml.de/parsing.html#modifying-the-tree
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm

    Args:
        context: Iterable of ``(event, elem)`` pairs, e.g. the object
            returned by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)`` for every
            element yielded by ``context``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    After ``func`` returns, the element is cleared and every sibling that
    precedes the element or any of its ancestors is deleted, so the
    already-processed part of the tree does not accumulate.
    """
    for _event, elem in context:
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because func has returned and no
        # descendants will be accessed again.
        elem.clear()
        # Also eliminate now-empty references from the root node to elem.
        for ancestor in elem.xpath('ancestor-or-self::*'):
            parent = ancestor.getparent()
            if parent is None:
                # The root has no parent to delete from. lxml can still
                # report a previous sibling for it (a comment or processing
                # instruction placed before the root element), which would
                # otherwise lead to ``del None[0]`` and a TypeError.
                continue
            while ancestor.getprevious() is not None:
                del parent[0]


def process_element(elem):
    """Print the result of the ``description/text( )`` XPath query on ``elem``.

    Args:
        elem: An element exposing an lxml-style ``xpath()`` method.
    """
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the bare ``print x`` statement is a SyntaxError on Python 3.
    # NOTE(review): the sample document in the question uses <desc>, not
    # <description> -- confirm the query matches the real data.
    print(elem.xpath('description/text( )'))

# Stream MYFILE, yielding only <item> elements (iterparse's tag filter).
context = etree.iterparse( MYFILE, tag='item' )
# Hand each <item> to process_element; fast_iter prunes the tree afterwards.
fast_iter(context,process_element)

Daly的文章读起来很好，尤其是在处理大型XML文件时

编辑：上面发布的

fast_iter

是Daly的

fast_iter

的修改版。在处理完一个元素后，它会更积极地删除不再需要的其他元素

下面的脚本显示了行为上的差异。请特别注意，

orig_fast_iter

不会删除

A1

元素，而

mod_fast_iter

会删除它，从而节省更多内存

import lxml.etree as ET
import textwrap
import io

def setup_ABC():
    """Return the sample XML document used by the demonstration, as text.

    The document has a <root> with two children, <A1> and <A2>; each holds
    a <B*>, a <C> (with text and one child) and an <E*> element. The text
    ends with a single trailing newline.
    """
    xml_lines = [
        '<root>',
        '  <A1>',
        '    <B1></B1>',
        '    <C>1<D1></D1></C>',
        '    <E1></E1>',
        '  </A1>',
        '  <A2>',
        '    <B2></B2>',
        '    <C>2<D></D></C>',
        '    <E2></E2>',
        '  </A2>',
        '</root>',
    ]
    return '\n'.join(xml_lines) + '\n'


def study_fast_iter():
    """Print a trace comparing the original and the modified fast_iter.

    The sample document from ``setup_ABC()`` is parsed twice with
    ``ET.iterparse`` (``end`` events, tag ``C``): once driven by
    ``orig_fast_iter`` and once by ``mod_fast_iter``. Each helper prints
    what it processes, clears and deletes, so the two traces can be
    compared. Returns None.
    """
    def orig_fast_iter(context, func, *args, **kwargs):
        """Clear each element and delete only that element's own preceding siblings."""
        for event, elem in context:
            print('Processing {e}'.format(e=ET.tostring(elem)))
            func(elem, *args, **kwargs)
            print('Clearing {e}'.format(e=ET.tostring(elem)))
            elem.clear()
            while elem.getprevious() is not None:
                print('Deleting {p}'.format(
                    p=(elem.getparent()[0]).tag))
                del elem.getparent()[0]
        del context

    def mod_fast_iter(context, func, *args, **kwargs):
        """
        http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
        Author: Liza Daly
        See also http://effbot.org/zone/element-iterparse.htm
        """
        for event, elem in context:
            print('Processing {e}'.format(e=ET.tostring(elem)))
            func(elem, *args, **kwargs)
            # It's safe to call clear() here because no descendants will be
            # accessed
            print('Clearing {e}'.format(e=ET.tostring(elem)))
            elem.clear()
            # Also eliminate now-empty references from the root node to elem
            for ancestor in elem.xpath('ancestor-or-self::*'):
                print('Checking ancestor: {a}'.format(a=ancestor.tag))
                while ancestor.getprevious() is not None:
                    print(
                        'Deleting {p}'.format(p=(ancestor.getparent()[0]).tag))
                    del ancestor.getparent()[0]
        del context

    def sample_stream():
        """Return a fresh binary stream over the sample document."""
        content = setup_ABC()
        # io.BytesIO only accepts bytes-like input; on Python 3 the sample
        # document is text and has to be encoded first. On Python 2 a str
        # is already bytes and is passed through untouched.
        if not isinstance(content, bytes):
            content = content.encode('utf-8')
        return io.BytesIO(content)

    # Expected-output comments below are the trace recorded by the original
    # author. NOTE(review): on Python 3 ET.tostring() returns bytes, so the
    # 'Processing'/'Clearing' lines should show a b'...' prefix -- confirm.
    context = ET.iterparse(sample_stream(), events=('end', ), tag='C')
    orig_fast_iter(context, lambda elem: None)
    # Processing <C>1<D1/></C>
    # Clearing <C>1<D1/></C>
    # Deleting B1
    # Processing <C>2<D/></C>
    # Clearing <C>2<D/></C>
    # Deleting B2

    print('-' * 80)
    # The improved fast_iter deletes A1. The original fast_iter does not.
    context = ET.iterparse(sample_stream(), events=('end', ), tag='C')
    mod_fast_iter(context, lambda elem: None)
    # Processing <C>1<D1/></C>
    # Clearing <C>1<D1/></C>
    # Checking ancestor: root
    # Checking ancestor: A1
    # Checking ancestor: C
    # Deleting B1
    # Processing <C>2<D/></C>
    # Clearing <C>2<D/></C>
    # Checking ancestor: root
    # Checking ancestor: A2
    # Deleting A1
    # Checking ancestor: C
    # Deleting B2

# Run the demonstration; both traces are printed to stdout.
study_fast_iter()

将lxml.etree作为ET导入
导入文本包装
输入io
def设置_ABC（）：
content=textwrap.dedent（“”）\
1.
2.
''')
返回内容
def study_fast_iter（）：
定义源代码（上下文、函数、*args、**kwargs）：
对于事件，上下文中的元素：
打印（'Processing{e}'。格式（e=ET.tostring（elem）））
func（元素，*args，**kwargs）
打印（'Clearing{e}'。格式（e=ET.tostring（elem）））
元素清除（）
虽然elem.getprevious（）不是无：
打印（'Deleting{p}'。格式(
p=（elem.getparent（）[0]）.tag）
del elem.getparent（）[0]
删除上下文
def mod_fast_iter（上下文、函数、*args、**kwargs）：
"""
http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
作者：莉莎·戴利
另见http://effbot.org/zone/element-iterparse.htm
"""
对于事件，上下文中的元素：
打印（'Processing{e}'。格式（e=ET.tostring（elem）））
func（元素，*args，**kwargs）
#在这里调用clear（）是安全的，因为不会有后代
#访问
打印（'Clearing{e}'。格式（e=ET.tostring（elem）））
元素清除（）
#现在还消除了从根节点到元素的空引用
对于elem.xpath中的祖先（“祖先或自身：：*”）：
打印（'检查祖先：{a}'。格式（a=祖先.tag））
虽然祖先.getprevious（）不是无：
印刷品(
'正在删除{p}'。格式（p=（祖先.getparent（）[0]）.tag））
del祖先.getparent（）[0]
删除上下文
内容=设置\u ABC（）
context=ET.iterparse（io.BytesIO（content），events=（'end'，），tag='C'）
原始快速iter（上下文，lambda元素：无）
#处理1
#清算1
#删除B1
#处理2
#清算2
#删除B2
打印（'-'*80）
"""
改进的fast_iter删除了A1，而原来的fast_iter没有删除A1。
"""
内容=设置\u ABC（）
context=ET.iterparse（io.BytesIO（content），events=（'end'，），tag='C'）
mod_fast_iter（上下文，lambda元素：无）
#处理1
#清算1
#检查祖先：根
#检查祖先：A1
#检查祖先：C
#删除B1
#处理2
#清算2
#检查祖先：根
#检查祖先：A2
#删除A1
#检查祖先：C
#删除B2
国际热核聚变实验堆（iter）研究

iterparse（）

允许您在构建树时执行一些操作，这意味着除非您删除不再需要的内容，否则最终仍然会得到整个树

更多信息：请阅读原始 ElementTree 实现的作者撰写的文章（其内容同样适用于 lxml）

注意，iterparse仍然构建树，就像parse一样，但您可以在解析时安全地重新排列或删除树的部分。例如，要解析大型文件，您可以在处理完元素后立即删除它们：

for event, elem in iterparse(source):
    if elem.tag == "record":
        ... process record elements ...
        elem.clear()

上述模式有一个缺点：它不会清除根元素，因此您最终会得到一个包含大量空子元素的根元素。如果您的文件是巨型的，而不仅仅是比较大，这可能会成为问题。要解决这个问题，您需要拿到根元素的引用。最简单的方法是启用 start 事件，并把对第一个元素的引用保存到变量中：

获取一个可迭代对象

context = iterparse(source, events=("start", "end"))

将其转换为迭代器

context=iter（context）

获取根元素

所以这是一个增量解析的问题。作为总结，您可以参考上面的答案。root.clear() 方法唯一的问题是它返回 NoneType（即 None）。例如，这意味着您不能使用诸如 replace() 或 title() 之类的字符串方法编辑您解析的数据。也就是说，如果您只是按原样解析数据，那么这是一种最佳方法。

根据我的经验，无论是否使用 element.clear（另请参见 L. Daly 的文章），iterparse 都无法始终处理非常大的XML文件：它在一段时间内运行良好，然后内存消耗突然超过上限，出现内存错误或系统崩溃。如果遇到相同的问题，也许可以使用相同的解决方案：expat解析器。另请参见下面使用OP的XML片段的示例：

import lxml.etree as ET
import textwrap
import io

def setup_ABC():
    """Return the sample XML document used by the demonstration, as text.

    The document has a <root> with two children, <A1> and <A2>; each holds
    a <B*>, a <C> (with text and one child) and an <E*> element. The text
    ends with a single trailing newline.
    """
    xml_lines = [
        '<root>',
        '  <A1>',
        '    <B1></B1>',
        '    <C>1<D1></D1></C>',
        '    <E1></E1>',
        '  </A1>',
        '  <A2>',
        '    <B2></B2>',
        '    <C>2<D></D></C>',
        '    <E2></E2>',
        '  </A2>',
        '</root>',
    ]
    return '\n'.join(xml_lines) + '\n'


def study_fast_iter():
    """Print a trace comparing the original and the modified fast_iter.

    The sample document from ``setup_ABC()`` is parsed twice with
    ``ET.iterparse`` (``end`` events, tag ``C``): once driven by
    ``orig_fast_iter`` and once by ``mod_fast_iter``. Each helper prints
    what it processes, clears and deletes, so the two traces can be
    compared. Returns None.
    """
    def orig_fast_iter(context, func, *args, **kwargs):
        """Clear each element and delete only that element's own preceding siblings."""
        for event, elem in context:
            print('Processing {e}'.format(e=ET.tostring(elem)))
            func(elem, *args, **kwargs)
            print('Clearing {e}'.format(e=ET.tostring(elem)))
            elem.clear()
            while elem.getprevious() is not None:
                print('Deleting {p}'.format(
                    p=(elem.getparent()[0]).tag))
                del elem.getparent()[0]
        del context

    def mod_fast_iter(context, func, *args, **kwargs):
        """
        http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
        Author: Liza Daly
        See also http://effbot.org/zone/element-iterparse.htm
        """
        for event, elem in context:
            print('Processing {e}'.format(e=ET.tostring(elem)))
            func(elem, *args, **kwargs)
            # It's safe to call clear() here because no descendants will be
            # accessed
            print('Clearing {e}'.format(e=ET.tostring(elem)))
            elem.clear()
            # Also eliminate now-empty references from the root node to elem
            for ancestor in elem.xpath('ancestor-or-self::*'):
                print('Checking ancestor: {a}'.format(a=ancestor.tag))
                while ancestor.getprevious() is not None:
                    print(
                        'Deleting {p}'.format(p=(ancestor.getparent()[0]).tag))
                    del ancestor.getparent()[0]
        del context

    def sample_stream():
        """Return a fresh binary stream over the sample document."""
        content = setup_ABC()
        # io.BytesIO only accepts bytes-like input; on Python 3 the sample
        # document is text and has to be encoded first. On Python 2 a str
        # is already bytes and is passed through untouched.
        if not isinstance(content, bytes):
            content = content.encode('utf-8')
        return io.BytesIO(content)

    # Expected-output comments below are the trace recorded by the original
    # author. NOTE(review): on Python 3 ET.tostring() returns bytes, so the
    # 'Processing'/'Clearing' lines should show a b'...' prefix -- confirm.
    context = ET.iterparse(sample_stream(), events=('end', ), tag='C')
    orig_fast_iter(context, lambda elem: None)
    # Processing <C>1<D1/></C>
    # Clearing <C>1<D1/></C>
    # Deleting B1
    # Processing <C>2<D/></C>
    # Clearing <C>2<D/></C>
    # Deleting B2

    print('-' * 80)
    # The improved fast_iter deletes A1. The original fast_iter does not.
    context = ET.iterparse(sample_stream(), events=('end', ), tag='C')
    mod_fast_iter(context, lambda elem: None)
    # Processing <C>1<D1/></C>
    # Clearing <C>1<D1/></C>
    # Checking ancestor: root
    # Checking ancestor: A1
    # Checking ancestor: C
    # Deleting B1
    # Processing <C>2<D/></C>
    # Clearing <C>2<D/></C>
    # Checking ancestor: root
    # Checking ancestor: A2
    # Deleting A1
    # Checking ancestor: C
    # Deleting B2

# Run the demonstration; both traces are printed to stdout.
study_fast_iter()

# Pull the first event off the stream to capture the root element.
# next(context) works on Python 2.6+ and Python 3; the context.next()
# method only exists on Python 2 iterators.
# NOTE(review): assumes context is an iterator that also yields "start"
# events, so the first element seen is the root -- confirm against the
# code that creates it.
event, root = next(context)

for event, elem in context:
    if event == "end" and elem.tag == "record":
        # ... process record elements ...
        # Clearing the root drops every child collected so far, so
        # finished records do not pile up under it.
        root.clear()

Description 1ä
Description 2ü