在Python中使用元素树合并xml文件
我正在尝试合并两个xml文件。这些文件的总体结构相同,但细节不同。(标签:python, xml, xpath, merge, elementtree)
file1.xml:
<book>
<chapter id="113">
<sentence id="1">
<word id="128160">
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPV"/>
<Number type="S"/>
</word>
<word id="128161">
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPF"/>
</word>
</sentence>
<sentence id="2">
<word id="128162">
<POS Tag="P"/>
<grammar type="PREFIX"/>
<Tag Tag="bi+"/>
</word>
</sentence>
</chapter>
</book>
请提供任何帮助,谢谢。

我过去做过类似的事情:先创建一个新的xml文档,然后把您要查找的节点附加进去。我不认为有现成的办法可以直接“合并”它们。
xml = ET.fromstring("<book></book>")
document = ET.parse(tempFile)
childNodeList = document.findall(xpathQuery)
for node in childNodeList:
    xml.append(node)
这里有一个解决方案。从一个空的合并文档开始,然后在枚举文件时,将找不到的元素添加到合并文档中。你可以概括这一点,但这里有一个切入点:
import lxml.etree

# Number of nesting levels (chapter, sentence, word) whose elements are
# matched by tag *and* "id" attribute.  Anything nested below a word (the
# per-word data such as <POS/> or <grammar/>) is matched by tag name only.
_ID_LEVELS = 3


def _find_existing(parent, candidate, by_id):
    """Return the child of *parent* corresponding to *candidate*, or None.

    Children correspond when their tags are equal and, if *by_id* is true,
    their "id" attributes are equal as strings.  Comparing in Python avoids
    building an XPath expression from attribute text, so ids that are not
    plain numbers (or that contain quotes) are handled as well.
    """
    for existing in parent:
        if existing.tag != candidate.tag:
            continue
        if not by_id or existing.get('id') == candidate.get('id'):
            return existing
    return None


def _merge_children(target, source, depth=0):
    """Merge the children of *source* into *target* in place.

    A child with no counterpart in *target* is appended as a whole subtree;
    a child that already has one is merged recursively while *depth* is
    still one of the id-matched levels, and left alone otherwise.
    """
    by_id = depth < _ID_LEVELS
    # Iterate over a snapshot: append() below re-parents the child, which
    # takes it out of *source* while the loop is running.
    for child in list(source):
        existing = _find_existing(target, child, by_id)
        if existing is None:
            # add newly discovered chapter / sentence / word / word data
            target.append(child)
        elif by_id:
            _merge_children(existing, child, depth + 1)


# Start from an empty <book> and fold every input file into it.
# NOTE: xml_files is expected to be defined by the surrounding code.
merged = lxml.etree.Element('book')
for xml_file in xml_files:
    # Walk the children of the parsed document's root element explicitly
    # instead of iterating over the tree object returned by parse().
    _merge_children(merged, lxml.etree.parse(xml_file).getroot())
如果您希望将File2合并到File1中,可以遍历File2中的所有元素,然后把File2中元素的属性复制到File1中对应的元素上。
我现在正在做一个项目,需要做类似的事情。下面是我当前的解决方案,它应该可以在Python 2.7下工作。
请注意,我还额外增加了在公共节点之间复制属性的需求。您将看到我在示例中添加了以下属性:
- drums='Neil'
- bass='Geddy'
- guitar='Alex'
,以证明元素的顺序不再重要
#!/usr/bin/python
from lxml import etree
from copy import deepcopy
import lxml
# Sample document A: one chapter whose words carry grammatical annotations
# (<POS/>, <grammar/>, ...), plus the extra "drums" and "bass" attributes.
xmlA='''
<book>
<chapter id="113">
<sentence id="1" drums='Neil'>
<word id="128160" bass='Geddy'>
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPV"/>
<Number type="S"/>
</word>
<word id="128161">
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPF"/>
</word>
</sentence>
<sentence id="2">
<word id="128162">
<POS Tag="P"/>
<grammar type="PREFIX"/>
<Tag Tag="bi+"/>
</word>
</sentence>
</chapter>
</book>
'''
# Sample document B: the same chapter with its sentences listed in a
# different order (3, 1, 2), <concept/> children on each word, a "guitar"
# attribute, and a sentence/word pair (ids 3 / 128168) not present in A.
xmlB='''
<book>
<chapter id="113">
<sentence id="3">
<word id="128168">
<concept English="sadness"/>
</word>
</sentence>
<sentence id="1">
<word id="128160">
<concept English="joke"/>
</word>
<word id="128161">
<concept English="romance"/>
</word>
</sentence>
<sentence id="2" guitar='Alex'>
<word id="128162">
<concept English="happiness"/>
</word>
</sentence>
</chapter>
</book>
'''
import re
from copy import deepcopy
##
# @brief Translates the relational xpath to an explicit xpath.
#  In the XML examples above, getpath will return the following for
#  <sentence id='1'/>:
#   - xmlA = /book/chapter/sentence[1]
#   - xmlb = /book/chapter/sentence[2]
#
#  A path that is explicit in both document would be:
#   - xmlA = /book/chapter/sentence[@id='1']
#   - xmlb = /book/chapter/sentence[@id='1']
#
# Matches the positional predicate ("[1]", "[2]", ...) of a path step.
_POSITION_PREDICATE = re.compile(r'\[[0-9]*\]')


def _quoteXpathLiteral(value):
    """Return *value* as a quoted XPath string literal.

    XPath 1.0 has no escape character, so the quote style is chosen to
    avoid the quotes contained in *value*.  Returns None when *value*
    contains both quote characters and cannot be written as one literal.
    """
    if "'" not in value:
        return "'" + value + "'"
    if '"' not in value:
        return '"' + value + '"'
    return None


def convertXpath(element):
    """Return an absolute XPath for *element* keyed on "id" attributes.

    The path produced by ``getpath`` is walked step by step.  Every step
    carrying a positional predicate is resolved to its element, and the
    predicate is replaced by ``[@id='...']`` built from that element's
    "id" attribute.

    A positional step is kept unchanged when the element has no "id"
    attribute, or when the id contains both quote characters; previously
    a missing id raised TypeError while concatenating None.

    :param element: an lxml element that belongs to a tree.
    :returns: the converted path as a string.
    """
    tree = element.getroottree()
    root = tree.getroot()
    newPath = ''
    for step in tree.getpath(element).split('/'):
        if step == '':
            # Leading "/" of the absolute path yields an empty first item.
            continue
        if _POSITION_PREDICATE.search(step):
            # Get the element at this path; the prefix built so far is
            # already id-based, so it selects the same ancestor chain.
            node = root.xpath(newPath + '/' + step)[0]
            ident = node.get('id')
            literal = None if ident is None else _quoteXpathLiteral(ident)
            if literal is not None:
                step = _POSITION_PREDICATE.sub('', step) + '[@id=' + literal + ']'
        newPath += '/' + step
    return newPath
def mergeXml(a, b):
    """Merge document *a* into document *b*, modifying *b* in place.

    Every element yielded by ``a.nodes()`` is looked up in *b* through the
    id-based path returned by ``convertXpath``:

    - if it exists in *b*, all of its attributes except "id" are copied
      onto each matching element (overwriting attributes of the same name);
    - otherwise a deep copy of the element, including its subtree, is
      appended to the element of *b* that corresponds to its parent.

    :param a: source document; must provide ``nodes()``.
    :param b: destination document; must provide ``root``.
    :raises ValueError: if the root element of *a* has no counterpart in
        *b*, so there is nowhere to attach it.
    """
    for node in a.nodes():
        # find the element in the other document
        matches = b.root.xpath(convertXpath(node))
        if matches:
            for match in matches:
                for name, value in node.items():
                    if name != 'id':
                        match.set(name, value)
            continue

        # Add the node to other document, below the counterpart of its
        # parent.
        parent = node.getparent()
        if parent is None:
            # Only the root has no parent; fail with a clear message rather
            # than an AttributeError raised from inside convertXpath(None).
            raise ValueError(
                'root element %r of the source document has no counterpart '
                'in the destination document' % (node.tag,))
        bParent = b.root.xpath(convertXpath(parent))[0]
        bParent.append(deepcopy(node))
class XmlDoc:
    """Small wrapper around an XML document parsed from a string.

    Attributes:
        root: the root element returned by ``etree.fromstring``.
        tree: the element tree that ``root`` belongs to.
    """

    def __init__(self, xml):
        """Parse *xml*, a string holding one XML document."""
        self.root = etree.fromstring(xml)
        self.tree = self.root.getroottree()

    def __str__(self):
        """Return the pretty-printed document as a native string."""
        serialized = etree.tostring(self.root, pretty_print=True)
        # tostring() returns a byte string unless told otherwise.  Python 2
        # accepts that from __str__, but Python 3 raises TypeError, so
        # decode whenever the result is not already a native str.
        if not isinstance(serialized, str):
            serialized = serialized.decode('utf-8')
        return serialized

    def nodes(self):
        """Return an iterator over the root element and its descendants."""
        return self.root.iter('*')
if __name__ == '__main__':
    # Merge sample document A into sample document B and show the result.
    a = XmlDoc(xmlA)
    b = XmlDoc(xmlB)
    mergeXml(a, b)
    # print(...) with one argument behaves the same on Python 2 and 3,
    # whereas the statement form "print b" is a syntax error on Python 3.
    print(b)
#!/usr/bin/python
from lxml import etree
from copy import deepcopy
import lxml
xmlA='''
'''
xmlB='''
'''
import re
from copy import deepcopy
##
# @brief Translates the relational xpath to an explicit xpath.
# In the XML examples above, getpath will return the following for
# <sentence id='1'/>:
# - xmlA = /book/chapter/sentence[1]
# - xmlb = /book/chapter/sentence[2]
#
# A path that is explicit in both document would be:
# - xmlA = /book/chapter/sentence[@id='1']
# - xmlb = /book/chapter/sentence[@id='1']
#
def convertXpath(element):
newPath = ''
tree = element.getroottree()
path = tree.getpath(element).split('/')
root = tree.getroot()
for p in path:
if p == '':
continue
if re.search('\[[0-9]*\]', p):
# Get the element at this path
#
node = root.xpath(newPath+'/'+p)[0]
id=node.get('id')
p=re.sub('\[[0-9]*\]','', p)
newPath += '/'+p+"[@id='"+id+"']"
else:
newPath+='/'+p
return newPath
def mergeXml(a,b):
for node in a.nodes():
path = convertXpath(node)
# find the element in the other document
#
elements = b.root.xpath(path)
for e in elements:
for name, value in node.items():
if name == 'id':
continue
e.set(name,value)
if len(elements) == 0:
# Add the node to other document
#
newElement = deepcopy(node)
# Find the path to the parent
#
parent = node.getparent()
path = convertXpath(parent)
bParent = b.root.xpath(path)[0]
bParent.append(newElement)
class XmlDoc:
def __init__(self, xml):
self.root = etree.fromstring(xml)
self.tree = self.root.getroottree()
def __str__(self):
return etree.tostring(self.root, pretty_print=True)
def nodes(self):
return self.root.iter('*')
if __name__ == '__main__':
a = XmlDoc(xmlA)
b = XmlDoc(xmlB)
mergeXml(a,b)
print b
这将产生以下输出:
<book>
<chapter id="113">
<sentence id="3">
<word id="128168">
<concept English="sadness"/>
</word>
</sentence>
<sentence id="1" drums="Neil">
<word id="128160" bass="Geddy">
<concept English="joke"/>
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPV"/>
<Number type="S"/>
</word>
<word id="128161">
<concept English="romance"/>
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPF"/>
</word>
</sentence>
<sentence id="2" guitar="Alex">
<word id="128162">
<concept English="happiness"/>
<POS Tag="P"/>
<grammar type="PREFIX"/>
<Tag Tag="bi+"/>
</word>
</sentence>
</chapter>
</book>
好的,但是如何在我的文件中得到正确的xpath查询?如何比较这两个文件是否包含相同的word id,然后复制并创建一个新的xml文件?
这些是不同的问题。您问的是如何合并两个xml文件。关于xpath查询,请参考相关文档(原链接已丢失);至于word id的比较,需要执行xpath查询以获得匹配节点的列表,对其进行迭代并比较word id,如果它不在新的xml中,则添加它。这部分实际上是一个算法问题……
您好,谢谢您的帮助。我尝试运行代码,但出现了以下错误:AttributeError: 'ElementTree' object has no attribute 'element'。我使用的是xml的ElementTree模块,这段代码是针对哪个模块写的?
哦……我的错。我在那里把lxml和ElementTree弄混了。lxml有一个很棒的xpath解析器,比起ElementTree我更喜欢它。我已经做了编辑。
用异常来做控制流是好的做法吗?
# Build an empty <book> element, then attach to it every element of the
# parsed file that matches xpathQuery.
# NOTE: ET, tempFile and xpathQuery are expected to be defined by the
# surrounding code.
xml = ET.fromstring("<book></book>")
document = ET.parse(tempFile)
childNodeList = document.findall(xpathQuery)
xml.extend(childNodeList)
import lxml.etree

# Number of nesting levels (chapter, sentence, word) whose elements are
# matched by tag *and* "id" attribute.  Anything nested below a word (the
# per-word data such as <POS/> or <grammar/>) is matched by tag name only.
_ID_LEVELS = 3


def _find_existing(parent, candidate, by_id):
    """Return the child of *parent* corresponding to *candidate*, or None.

    Children correspond when their tags are equal and, if *by_id* is true,
    their "id" attributes are equal as strings.  Comparing in Python avoids
    building an XPath expression from attribute text, so ids that are not
    plain numbers (or that contain quotes) are handled as well.
    """
    for existing in parent:
        if existing.tag != candidate.tag:
            continue
        if not by_id or existing.get('id') == candidate.get('id'):
            return existing
    return None


def _merge_children(target, source, depth=0):
    """Merge the children of *source* into *target* in place.

    A child with no counterpart in *target* is appended as a whole subtree;
    a child that already has one is merged recursively while *depth* is
    still one of the id-matched levels, and left alone otherwise.
    """
    by_id = depth < _ID_LEVELS
    # Iterate over a snapshot: append() below re-parents the child, which
    # takes it out of *source* while the loop is running.
    for child in list(source):
        existing = _find_existing(target, child, by_id)
        if existing is None:
            # add newly discovered chapter / sentence / word / word data
            target.append(child)
        elif by_id:
            _merge_children(existing, child, depth + 1)


# Start from an empty <book> and fold every input file into it.
# NOTE: xml_files is expected to be defined by the surrounding code.
merged = lxml.etree.Element('book')
for xml_file in xml_files:
    # Walk the children of the parsed document's root element explicitly
    # instead of iterating over the tree object returned by parse().
    _merge_children(merged, lxml.etree.parse(xml_file).getroot())
#!/usr/bin/python
from lxml import etree
from copy import deepcopy
import lxml
# Sample document A: one chapter whose words carry grammatical annotations
# (<POS/>, <grammar/>, ...), plus the extra "drums" and "bass" attributes.
xmlA='''
<book>
<chapter id="113">
<sentence id="1" drums='Neil'>
<word id="128160" bass='Geddy'>
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPV"/>
<Number type="S"/>
</word>
<word id="128161">
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPF"/>
</word>
</sentence>
<sentence id="2">
<word id="128162">
<POS Tag="P"/>
<grammar type="PREFIX"/>
<Tag Tag="bi+"/>
</word>
</sentence>
</chapter>
</book>
'''
# Sample document B: the same chapter with its sentences listed in a
# different order (3, 1, 2), <concept/> children on each word, a "guitar"
# attribute, and a sentence/word pair (ids 3 / 128168) not present in A.
xmlB='''
<book>
<chapter id="113">
<sentence id="3">
<word id="128168">
<concept English="sadness"/>
</word>
</sentence>
<sentence id="1">
<word id="128160">
<concept English="joke"/>
</word>
<word id="128161">
<concept English="romance"/>
</word>
</sentence>
<sentence id="2" guitar='Alex'>
<word id="128162">
<concept English="happiness"/>
</word>
</sentence>
</chapter>
</book>
'''
import re
from copy import deepcopy
##
# @brief Translates the relational xpath to an explicit xpath.
#  In the XML examples above, getpath will return the following for
#  <sentence id='1'/>:
#   - xmlA = /book/chapter/sentence[1]
#   - xmlb = /book/chapter/sentence[2]
#
#  A path that is explicit in both document would be:
#   - xmlA = /book/chapter/sentence[@id='1']
#   - xmlb = /book/chapter/sentence[@id='1']
#
# Matches the positional predicate ("[1]", "[2]", ...) of a path step.
_POSITION_PREDICATE = re.compile(r'\[[0-9]*\]')


def _quoteXpathLiteral(value):
    """Return *value* as a quoted XPath string literal.

    XPath 1.0 has no escape character, so the quote style is chosen to
    avoid the quotes contained in *value*.  Returns None when *value*
    contains both quote characters and cannot be written as one literal.
    """
    if "'" not in value:
        return "'" + value + "'"
    if '"' not in value:
        return '"' + value + '"'
    return None


def convertXpath(element):
    """Return an absolute XPath for *element* keyed on "id" attributes.

    The path produced by ``getpath`` is walked step by step.  Every step
    carrying a positional predicate is resolved to its element, and the
    predicate is replaced by ``[@id='...']`` built from that element's
    "id" attribute.

    A positional step is kept unchanged when the element has no "id"
    attribute, or when the id contains both quote characters; previously
    a missing id raised TypeError while concatenating None.

    :param element: an lxml element that belongs to a tree.
    :returns: the converted path as a string.
    """
    tree = element.getroottree()
    root = tree.getroot()
    newPath = ''
    for step in tree.getpath(element).split('/'):
        if step == '':
            # Leading "/" of the absolute path yields an empty first item.
            continue
        if _POSITION_PREDICATE.search(step):
            # Get the element at this path; the prefix built so far is
            # already id-based, so it selects the same ancestor chain.
            node = root.xpath(newPath + '/' + step)[0]
            ident = node.get('id')
            literal = None if ident is None else _quoteXpathLiteral(ident)
            if literal is not None:
                step = _POSITION_PREDICATE.sub('', step) + '[@id=' + literal + ']'
        newPath += '/' + step
    return newPath
def mergeXml(a, b):
    """Merge document *a* into document *b*, modifying *b* in place.

    Every element yielded by ``a.nodes()`` is looked up in *b* through the
    id-based path returned by ``convertXpath``:

    - if it exists in *b*, all of its attributes except "id" are copied
      onto each matching element (overwriting attributes of the same name);
    - otherwise a deep copy of the element, including its subtree, is
      appended to the element of *b* that corresponds to its parent.

    :param a: source document; must provide ``nodes()``.
    :param b: destination document; must provide ``root``.
    :raises ValueError: if the root element of *a* has no counterpart in
        *b*, so there is nowhere to attach it.
    """
    for node in a.nodes():
        # find the element in the other document
        matches = b.root.xpath(convertXpath(node))
        if matches:
            for match in matches:
                for name, value in node.items():
                    if name != 'id':
                        match.set(name, value)
            continue

        # Add the node to other document, below the counterpart of its
        # parent.
        parent = node.getparent()
        if parent is None:
            # Only the root has no parent; fail with a clear message rather
            # than an AttributeError raised from inside convertXpath(None).
            raise ValueError(
                'root element %r of the source document has no counterpart '
                'in the destination document' % (node.tag,))
        bParent = b.root.xpath(convertXpath(parent))[0]
        bParent.append(deepcopy(node))
class XmlDoc:
    """Small wrapper around an XML document parsed from a string.

    Attributes:
        root: the root element returned by ``etree.fromstring``.
        tree: the element tree that ``root`` belongs to.
    """

    def __init__(self, xml):
        """Parse *xml*, a string holding one XML document."""
        self.root = etree.fromstring(xml)
        self.tree = self.root.getroottree()

    def __str__(self):
        """Return the pretty-printed document as a native string."""
        serialized = etree.tostring(self.root, pretty_print=True)
        # tostring() returns a byte string unless told otherwise.  Python 2
        # accepts that from __str__, but Python 3 raises TypeError, so
        # decode whenever the result is not already a native str.
        if not isinstance(serialized, str):
            serialized = serialized.decode('utf-8')
        return serialized

    def nodes(self):
        """Return an iterator over the root element and its descendants."""
        return self.root.iter('*')
if __name__ == '__main__':
    # Merge sample document A into sample document B and show the result.
    a = XmlDoc(xmlA)
    b = XmlDoc(xmlB)
    mergeXml(a, b)
    # print(...) with one argument behaves the same on Python 2 and 3,
    # whereas the statement form "print b" is a syntax error on Python 3.
    print(b)
<book>
<chapter id="113">
<sentence id="3">
<word id="128168">
<concept English="sadness"/>
</word>
</sentence>
<sentence id="1" drums="Neil">
<word id="128160" bass="Geddy">
<concept English="joke"/>
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPV"/>
<Number type="S"/>
</word>
<word id="128161">
<concept English="romance"/>
<POS Tag="V"/>
<grammar type="STEM"/>
<Aspect type="IMPF"/>
</word>
</sentence>
<sentence id="2" guitar="Alex">
<word id="128162">
<concept English="happiness"/>
<POS Tag="P"/>
<grammar type="PREFIX"/>
<Tag Tag="bi+"/>
</word>
</sentence>
</chapter>
</book>