在Python中使用元素树合并xml文件_Python_Xml_Xpath_Merge_Elementtree

在Python中使用元素树合并xml文件

python xml xpath merge

在Python中使用元素树合并xml文件,python,xml,xpath,merge,elementtree,Python,Xml,Xpath,Merge,Elementtree,我正在尝试合并两个xml文件。这些文件包含相同的总体结构，但细节不同 file1.xml： <book> <chapter id="113"> <sentence id="1"> <word id="128160"> <POS Tag="V"/> <grammar type="STEM"/>

我正在尝试合并两个xml文件。这些文件包含相同的总体结构，但细节不同

file1.xml：

<book>
    <chapter id="113">
        <sentence id="1">
            <word id="128160">
                <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPV"/>
                <Number type="S"/>
            </word>
            <word id="128161">
                <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPF"/>
            </word>
             </sentence>
             <sentence id="2">
            <word id="128162">
                <POS Tag="P"/>
                <grammar type="PREFIX"/>
                <Tag Tag="bi+"/>
            </word>
             </sentence>
        </chapter>
</book>

请问有人能帮帮我吗？

我在过去做过类似的事情，就是创建一个xml文档，然后附加您要查找的值。我不相信有办法“合并”它们

xml = ET.fromstring("<book></book>")
document = ET.parse(tempFile)
childNodeList = document.findall(xpathQuery)
for node in childNodeList:
    xml.append(node)

这里有一个解决方案。从一个空的合并文档开始，然后在枚举文件时，将找不到的元素添加到合并文档中。你可以概括这一点，但这里有一个切入点：

import lxml.etree


def _child_by_id(parent, tag, node_id):
    """Return the first direct ``tag`` child of ``parent`` whose ``id``
    attribute equals ``node_id``, or ``None`` when there is no such child.

    The id is handed to XPath as a variable, so it is compared as a string
    (non-numeric ids work) and never needs quoting or escaping.
    """
    if node_id is None:
        # An element without an id can never be matched by id.
        return None
    matches = parent.xpath(tag + '[@id=$node_id]', node_id=node_id)
    return matches[0] if matches else None


# Start from an empty <book> and fold every input file into it:
# chapters, sentences and words are matched by their id, the data elements
# below a word are matched by tag name.  Anything not yet present in the
# merged document is moved over together with its whole subtree.
merged = lxml.etree.Element('book')
for xml_file in xml_files:
    source_root = lxml.etree.parse(xml_file).getroot()

    # NOTE: append() on an lxml element *moves* the node out of its old
    # parent.  Each loop therefore walks a list() snapshot of the children,
    # otherwise moving a node would cut the iteration short and silently
    # skip its following siblings.
    for merge_chapter in list(source_root):
        chapter = _child_by_id(merged, 'chapter', merge_chapter.get('id'))
        if chapter is None:
            # add newly discovered chapter
            merged.append(merge_chapter)
            continue

        for merge_sentence in list(merge_chapter):
            sentence = _child_by_id(chapter, 'sentence',
                                    merge_sentence.get('id'))
            if sentence is None:
                # add newly discovered sentence
                chapter.append(merge_sentence)
                continue

            for merge_word in list(merge_sentence):
                word = _child_by_id(sentence, 'word', merge_word.get('id'))
                if word is None:
                    # add newly discovered word
                    sentence.append(merge_word)
                    continue

                for data in list(merge_word):
                    if word.find(data.tag) is None:
                        # add newly discovered word data
                        word.append(data)

如果您希望将File2合并到File1中，那么可以循环File2中的所有元素，然后将属性从File2的元素复制到File1的元素中

我现在正在做一个项目，我必须做类似的事情。下面是我当前的解决方案，它应该可以在Python2.7下工作

请注意，我进一步添加了在公共节点之间复制属性的需求。您将看到我向 A 添加了以下属性：

drums='Neil'
bass='Geddy'

然后我向 B 添加了：

guitar='Alex'

最终合并的文件包含了power trio的三名成员

我还向 B 添加了 <sentence id="3">，以证明元素的顺序不再重要

#!/usr/bin/python
from lxml import etree 
from copy import deepcopy
import lxml

# Sample document A (the merge source in the demo below).  Sentence 1 carries
# drums='Neil' and word 128160 carries bass='Geddy'; mergeXml copies such
# non-id attributes onto the matching elements of the other document.
xmlA='''
<book>
    <chapter id="113">

        <sentence id="1" drums='Neil'>
            <word id="128160" bass='Geddy'>
                <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPV"/>
                <Number type="S"/>
            </word>
            <word id="128161">
                <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPF"/>
            </word>
        </sentence>

        <sentence id="2">
            <word id="128162">
                <POS Tag="P"/>
                <grammar type="PREFIX"/>
                <Tag Tag="bi+"/>
            </word>
        </sentence>

    </chapter>
</book>
'''

# Sample document B (the merge target in the demo below).  It holds the same
# chapter, but its sentences are listed in the order 3, 1, 2 and sentence 3
# has no counterpart in xmlA.  Words carry <concept> children and sentence 2
# carries guitar='Alex'.
xmlB='''
<book>
    <chapter id="113">

        <sentence id="3">
            <word id="128168">
                <concept English="sadness"/>
            </word>
        </sentence>

        <sentence id="1">
            <word id="128160">
                <concept English="joke"/>
            </word>
            <word id="128161">
                <concept English="romance"/>
            </word>
        </sentence>

        <sentence id="2" guitar='Alex'>
            <word id="128162">
                <concept English="happiness"/>
            </word>
        </sentence>


    </chapter>
</book>
'''

import re
from copy import deepcopy

##
#   @brief  Translates the relational xpath to an explicit xpath.
#   In the XML examples above, getpath will return the following for
#   <sentence id='1'/>:
#       - xmlA = /book/chapter/sentence[1]
#       - xmlB = /book/chapter/sentence[2]
#
#   A path that is explicit in both documents would be:
#       - xmlA = /book/chapter[@id='113']/sentence[@id='1']
#       - xmlB = /book/chapter[@id='113']/sentence[@id='1']
#
#   Every step below the root whose element has an 'id' attribute is keyed
#   by that id, whether or not getpath() gave the step a positional index.
#   (Keying only the indexed steps made two single siblings with different
#   ids look like the same node.)  The root step is never keyed: a document
#   has exactly one root.  A step whose element has no id, or whose id holds
#   both quote characters (XPath 1.0 literals have no escape syntax), keeps
#   the step exactly as getpath() produced it.
#
#   @param  element  lxml element that is attached to a tree
#   @return absolute XPath string addressing the element
#
def convertXpath(element):
    indexSuffix = re.compile(r'\[[0-9]*\]')
    tree        = element.getroottree()

    # getpath() yields one step per element on the way from the root down to
    # `element`; pair every step with the element it stands for.
    steps   = [s for s in tree.getpath(element).split('/') if s != '']
    lineage = [element] + list(element.iterancestors())
    lineage.reverse()

    newPath = ''
    for depth, (step, node) in enumerate(zip(steps, lineage)):
        nodeId = node.get('id')

        if depth == 0 or nodeId is None:
            predicate = None
        elif "'" not in nodeId:
            predicate = "[@id='" + nodeId + "']"
        elif '"' not in nodeId:
            predicate = '[@id="' + nodeId + '"]'
        else:
            predicate = None

        if predicate is None:
            # Nothing usable to key on: keep getpath()'s own step.
            newPath += '/' + step
        else:
            # Swap the positional index (if any) for the id predicate.
            newPath += '/' + indexSuffix.sub('', step) + predicate

    return newPath



##
#   @brief  Merges document a into document b; b is modified in place.
#   Every element of a is looked up in b through its explicit xpath.
#   Matching elements in b receive all of the element's attributes except
#   'id'; an element without a match is deep-copied (subtree included) and
#   appended to b's counterpart of its parent.
#
#   @param  a  XmlDoc supplying the elements and attributes
#   @param  b  XmlDoc receiving them
#
def mergeXml(a,b):

    for source in a.nodes():
        matches = b.root.xpath(convertXpath(source))

        if not matches:
            # Unknown to b: clone it before attaching so that a's own tree
            # stays untouched, then hang the clone below the parent's
            # counterpart in b.
            clone      = deepcopy(source)
            parentPath = convertXpath(source.getparent())
            b.root.xpath(parentPath)[0].append(clone)
            continue

        # Known to b: carry the attributes over, but leave the id alone.
        for target in matches:
            for attrName, attrValue in source.items():
                if attrName != 'id':
                    target.set(attrName, attrValue)

##
#   @brief  Thin wrapper around an lxml document parsed from a string.
#   Keeps the root element (self.root) and its element tree (self.tree).
#
class XmlDoc:
    def __init__(self, xml):
        # @param xml  XML source text handed to etree.fromstring
        self.root = etree.fromstring(xml)
        self.tree = self.root.getroottree()

    def __str__(self):
        # Pretty-printed serialisation of the whole document.
        serialized = etree.tostring(self.root, pretty_print=True)
        if not isinstance(serialized, str):
            # Python 3: tostring() hands back bytes (ASCII by default) and
            # __str__ must return text, otherwise str()/print() raise
            # TypeError.  On Python 2 the value already is a str and is
            # returned unchanged.
            serialized = serialized.decode('ascii')
        return serialized

    def nodes(self):
        # Iterator over the root and all descendant elements; the '*'
        # filter leaves out comments and processing instructions.
        return self.root.iter('*')



if __name__ == '__main__':
    # Demo: parse both sample documents, fold A into B and show the result.
    a = XmlDoc(xmlA)
    b = XmlDoc(xmlB)

    mergeXml(a,b)
    # Call form of print: same output on Python 2, and no longer a syntax
    # error on Python 3.
    print(b)

#/usr/bin/python
从lxml导入etree
从复制导入deepcopy
导入lxml
xmlA=''
'''
xmlB=“”
'''
进口稀土
从复制导入deepcopy
##
#@brief将关系xpath转换为显式xpath。
#在上面的XML示例中，getpath将为
#   :
#-xmlA=/book/chapter/sentance[1]
#-xmlb=/book/chapter/sentance[2]
#
#在两个文档中都是显式的路径是：
#-xmlA=/book/chapter/sentance[@id='1']
#-xmlb=/book/chapter/sentance[@id='1']
#
def convertXpath（元素）：
新路径=“”
tree=element.getroottree（）
path=tree.getpath（element.split（“/”）
root=tree.getroot（）
对于路径中的p：
如果p=''：
持续
如果重新搜索（'\[[0-9]*\]'，p）：
#获取此路径上的元素
#
node=root.xpath（newPath+'/'+p）[0]
id=node.get（'id'）
p=re.sub（'\[[0-9]*\]'，''，p）
newPath+='/'+p+“[@id=''+id+']””
其他：
新路径+='/'+p
返回新路径
def mergeXml（a，b）：
对于a.nodes（）中的节点：
path=convertXpath（节点）
#在其他文档中查找元素
#
elements=b.root.xpath（路径）
对于元素中的e：
节点.items（）中的值作为名称：
如果名称=='id'：
持续
e、 集合（名称、值）
如果len（元素）==0：
#将节点添加到其他文档
#
newElement=deepcopy（节点）
#找到父级的路径
#
parent=node.getparent（）
path=convertXpath（父级）
bParent=b.root.xpath（路径）[0]
bParent.append（新元素）
XmlDoc类：
定义初始化（self，xml）：
self.root=etree.fromstring（xml）
self.tree=self.root.getroottree（）
定义（自我）：
返回etree.tostring（self.root，pretty\u print=True）
def节点（自身）：
返回self.root.iter（“*”）
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu'：
a=XmlDoc（xmlA）
b=XmlDoc（xmlB）
合并XML（a，b）
打印b

这将产生以下输出：

<book>
    <chapter id="113">

        <sentence id="3">
            <word id="128168">
                <concept English="sadness"/>
            </word>
        </sentence>

        <sentence id="1" drums="Neil">
            <word id="128160" bass="Geddy">
                <concept English="joke"/>
            <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPV"/>
                <Number type="S"/>
            </word>
            <word id="128161">
                <concept English="romance"/>
            <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPF"/>
            </word>
        </sentence>

        <sentence id="2" guitar="Alex">
            <word id="128162">
                <concept English="happiness"/>
            <POS Tag="P"/>
                <grammar type="PREFIX"/>
                <Tag Tag="bi+"/>
            </word>
        </sentence>


    </chapter>
</book>

Ok，但是如何在我的文件中获得正确的xpath查询？如何比较这两个文件是否包含相同的单词id，然后复制并创建一个新的xml文件。这些是不同的问题。您询问了如何合并两个xml文件。对于xpath查询，我看这里：就word id比较而言，必须执行xpath查询以获得匹配节点的列表，对其进行迭代并比较word id，如果它不在新xml中，则添加它。这部分实际上是一个算法问题……您好，谢谢您的帮助，我尝试运行代码，但出现了以下错误：AttributeError:'ElementTree'对象没有属性'element'，我在xml元素树模型中工作。代码运行在哪个模型中？哦。。。我的错。我在那里切换了lxml和ElementTree。lxml有一个很棒的xpath解析器，我喜欢它胜过ElementTree。我做了一个编辑。使用异常作为控制流操作是一件好事吗？

# Empty <book> root that collects the selected nodes.
xml = ET.fromstring("<book></book>")
# Load the source document and select the nodes of interest.
document = ET.parse(tempFile)
childNodeList = document.findall(xpathQuery)
# Attach every match below the new root, in the order findall returned them.
xml.extend(childNodeList)

import lxml.etree


def _child_by_id(parent, tag, node_id):
    """Return the first direct ``tag`` child of ``parent`` whose ``id``
    attribute equals ``node_id``, or ``None`` when there is no such child.

    The id is handed to XPath as a variable, so it is compared as a string
    (non-numeric ids work) and never needs quoting or escaping.
    """
    if node_id is None:
        # An element without an id can never be matched by id.
        return None
    matches = parent.xpath(tag + '[@id=$node_id]', node_id=node_id)
    return matches[0] if matches else None


# Start from an empty <book> and fold every input file into it:
# chapters, sentences and words are matched by their id, the data elements
# below a word are matched by tag name.  Anything not yet present in the
# merged document is moved over together with its whole subtree.
merged = lxml.etree.Element('book')
for xml_file in xml_files:
    source_root = lxml.etree.parse(xml_file).getroot()

    # NOTE: append() on an lxml element *moves* the node out of its old
    # parent.  Each loop therefore walks a list() snapshot of the children,
    # otherwise moving a node would cut the iteration short and silently
    # skip its following siblings.
    for merge_chapter in list(source_root):
        chapter = _child_by_id(merged, 'chapter', merge_chapter.get('id'))
        if chapter is None:
            # add newly discovered chapter
            merged.append(merge_chapter)
            continue

        for merge_sentence in list(merge_chapter):
            sentence = _child_by_id(chapter, 'sentence',
                                    merge_sentence.get('id'))
            if sentence is None:
                # add newly discovered sentence
                chapter.append(merge_sentence)
                continue

            for merge_word in list(merge_sentence):
                word = _child_by_id(sentence, 'word', merge_word.get('id'))
                if word is None:
                    # add newly discovered word
                    sentence.append(merge_word)
                    continue

                for data in list(merge_word):
                    if word.find(data.tag) is None:
                        # add newly discovered word data
                        word.append(data)

#!/usr/bin/python
from lxml import etree 
from copy import deepcopy
import lxml

# Sample document A (the merge source in the demo below).  Sentence 1 carries
# drums='Neil' and word 128160 carries bass='Geddy'; mergeXml copies such
# non-id attributes onto the matching elements of the other document.
xmlA='''
<book>
    <chapter id="113">

        <sentence id="1" drums='Neil'>
            <word id="128160" bass='Geddy'>
                <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPV"/>
                <Number type="S"/>
            </word>
            <word id="128161">
                <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPF"/>
            </word>
        </sentence>

        <sentence id="2">
            <word id="128162">
                <POS Tag="P"/>
                <grammar type="PREFIX"/>
                <Tag Tag="bi+"/>
            </word>
        </sentence>

    </chapter>
</book>
'''

# Sample document B (the merge target in the demo below).  It holds the same
# chapter, but its sentences are listed in the order 3, 1, 2 and sentence 3
# has no counterpart in xmlA.  Words carry <concept> children and sentence 2
# carries guitar='Alex'.
xmlB='''
<book>
    <chapter id="113">

        <sentence id="3">
            <word id="128168">
                <concept English="sadness"/>
            </word>
        </sentence>

        <sentence id="1">
            <word id="128160">
                <concept English="joke"/>
            </word>
            <word id="128161">
                <concept English="romance"/>
            </word>
        </sentence>

        <sentence id="2" guitar='Alex'>
            <word id="128162">
                <concept English="happiness"/>
            </word>
        </sentence>


    </chapter>
</book>
'''

import re
from copy import deepcopy

##
#   @brief  Translates the relational xpath to an explicit xpath.
#   In the XML examples above, getpath will return the following for
#   <sentence id='1'/>:
#       - xmlA = /book/chapter/sentence[1]
#       - xmlB = /book/chapter/sentence[2]
#
#   A path that is explicit in both documents would be:
#       - xmlA = /book/chapter[@id='113']/sentence[@id='1']
#       - xmlB = /book/chapter[@id='113']/sentence[@id='1']
#
#   Every step below the root whose element has an 'id' attribute is keyed
#   by that id, whether or not getpath() gave the step a positional index.
#   (Keying only the indexed steps made two single siblings with different
#   ids look like the same node.)  The root step is never keyed: a document
#   has exactly one root.  A step whose element has no id, or whose id holds
#   both quote characters (XPath 1.0 literals have no escape syntax), keeps
#   the step exactly as getpath() produced it.
#
#   @param  element  lxml element that is attached to a tree
#   @return absolute XPath string addressing the element
#
def convertXpath(element):
    indexSuffix = re.compile(r'\[[0-9]*\]')
    tree        = element.getroottree()

    # getpath() yields one step per element on the way from the root down to
    # `element`; pair every step with the element it stands for.
    steps   = [s for s in tree.getpath(element).split('/') if s != '']
    lineage = [element] + list(element.iterancestors())
    lineage.reverse()

    newPath = ''
    for depth, (step, node) in enumerate(zip(steps, lineage)):
        nodeId = node.get('id')

        if depth == 0 or nodeId is None:
            predicate = None
        elif "'" not in nodeId:
            predicate = "[@id='" + nodeId + "']"
        elif '"' not in nodeId:
            predicate = '[@id="' + nodeId + '"]'
        else:
            predicate = None

        if predicate is None:
            # Nothing usable to key on: keep getpath()'s own step.
            newPath += '/' + step
        else:
            # Swap the positional index (if any) for the id predicate.
            newPath += '/' + indexSuffix.sub('', step) + predicate

    return newPath



##
#   @brief  Merges document a into document b; b is modified in place.
#   Every element of a is looked up in b through its explicit xpath.
#   Matching elements in b receive all of the element's attributes except
#   'id'; an element without a match is deep-copied (subtree included) and
#   appended to b's counterpart of its parent.
#
#   @param  a  XmlDoc supplying the elements and attributes
#   @param  b  XmlDoc receiving them
#
def mergeXml(a,b):

    for source in a.nodes():
        matches = b.root.xpath(convertXpath(source))

        if not matches:
            # Unknown to b: clone it before attaching so that a's own tree
            # stays untouched, then hang the clone below the parent's
            # counterpart in b.
            clone      = deepcopy(source)
            parentPath = convertXpath(source.getparent())
            b.root.xpath(parentPath)[0].append(clone)
            continue

        # Known to b: carry the attributes over, but leave the id alone.
        for target in matches:
            for attrName, attrValue in source.items():
                if attrName != 'id':
                    target.set(attrName, attrValue)

##
#   @brief  Thin wrapper around an lxml document parsed from a string.
#   Keeps the root element (self.root) and its element tree (self.tree).
#
class XmlDoc:
    def __init__(self, xml):
        # @param xml  XML source text handed to etree.fromstring
        self.root = etree.fromstring(xml)
        self.tree = self.root.getroottree()

    def __str__(self):
        # Pretty-printed serialisation of the whole document.
        serialized = etree.tostring(self.root, pretty_print=True)
        if not isinstance(serialized, str):
            # Python 3: tostring() hands back bytes (ASCII by default) and
            # __str__ must return text, otherwise str()/print() raise
            # TypeError.  On Python 2 the value already is a str and is
            # returned unchanged.
            serialized = serialized.decode('ascii')
        return serialized

    def nodes(self):
        # Iterator over the root and all descendant elements; the '*'
        # filter leaves out comments and processing instructions.
        return self.root.iter('*')



if __name__ == '__main__':
    # Demo: parse both sample documents, fold A into B and show the result.
    a = XmlDoc(xmlA)
    b = XmlDoc(xmlB)

    mergeXml(a,b)
    # Call form of print: same output on Python 2, and no longer a syntax
    # error on Python 3.
    print(b)

<book>
    <chapter id="113">

        <sentence id="3">
            <word id="128168">
                <concept English="sadness"/>
            </word>
        </sentence>

        <sentence id="1" drums="Neil">
            <word id="128160" bass="Geddy">
                <concept English="joke"/>
            <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPV"/>
                <Number type="S"/>
            </word>
            <word id="128161">
                <concept English="romance"/>
            <POS Tag="V"/>
                <grammar type="STEM"/>
                <Aspect type="IMPF"/>
            </word>
        </sentence>

        <sentence id="2" guitar="Alex">
            <word id="128162">
                <concept English="happiness"/>
            <POS Tag="P"/>
                <grammar type="PREFIX"/>
                <Tag Tag="bi+"/>
            </word>
        </sentence>


    </chapter>
</book>