Python 从XML文件中提取信息并为其分配一个向量_Python_Xml_Nlp

Python 从XML文件中提取信息并为其分配一个向量

python xml nlp

Python 从XML文件中提取信息并为其分配一个向量,python,xml,nlp,Python,Xml,Nlp,我想在我的计算机上用python解析一些XML文件，并从中提取一些信息以下是其中一个的xml文件：（如果需要，文本如下所示： ) 作为第一级，我已经完成了第一级： myList = [] #read the whole text from for root, dirs, files in os.walk(path): for file in files: if file.endswith('.xml'): wi

我想在我的计算机上用python解析一些XML文件，并从中提取一些信息

以下是其中一个的xml文件：

（如果需要，文本如下所示： )

作为第一步，我已经完成了以下部分：

# Walk the directory tree under `path` and keep one parsed ElementTree per
# file whose name ends with '.xml'.
myList = []
for root, dirs, files in os.walk(path):
    for file in files:
        if not file.endswith('.xml'):
            continue
        xml_path = os.path.join(root, file)
        with open(xml_path, encoding="UTF-8") as content:
            tree = ET.parse(content)
        myList.append(tree)

在myList中，我有一些XMl文件

现在，对于那些 type 不是“seg”的 edge 元素，
我想给没有出现在 src 中的那个编号做标记，因为对应的句子被称为前提（premise）。例如这里……我想说“a3”就是所谓的“premise”（因为它没有出现在任何 src 属性中）
比如这里
（0,0,1,0,0）应该是我这个处理过程的结果，因为 a3 没有出现。我将第三个元素设置为 1，其余元素设置为 0

总的来说，我想提取这些信息，以便按照 XML 中的标注方式来标注我的文本。下面是数据提取部分：

import xml.etree.ElementTree as ET

# Sample argumentation graph. The XML declaration must stay at the very start
# of the string, so the literal begins right after the opening quotes.
# Whitespace between elements is not significant for the extraction below.
xml = '''<?xml version='1.0' encoding='UTF-8'?>
<arggraph id="micro_b002" topic_id="higher_dog_poo_fines" stance="pro">
<edu id="e1"><![CDATA[One can hardly move in Friedrichshain or Neukölln these days without permanently scanning the ground for dog dirt.]]></edu>
<edu id="e2"><![CDATA[And when bad luck does strike and you step into one of the many 'land mines' you have to painstakingly scrape the remains off your soles.]]></edu>
<edu id="e3"><![CDATA[Higher fines are therefore the right measure against negligent, lazy or simply thoughtless dog owners.]]></edu>
<edu id="e4"><![CDATA[Of course, first they'd actually need to be caught in the act by public order officers,]]></edu>
<edu id="e5"><![CDATA[but once they have to dig into their pockets, their laziness will sure vanish!]]></edu>
<adu id="a1" type="pro"/>
<adu id="a2" type="pro"/>
<adu id="a3" type="pro"/>
<adu id="a4" type="opp"/>
<adu id="a5" type="pro"/>
<edge id="c6" src="e1" trg="a1" type="seg"/>
<edge id="c7" src="e2" trg="a2" type="seg"/>
<edge id="c8" src="e3" trg="a3" type="seg"/>
<edge id="c9" src="e4" trg="a4" type="seg"/>
<edge id="c10" src="e5" trg="a5" type="seg"/>
<edge id="c1" src="a1" trg="a3" type="sup"/>
<edge id="c2" src="a2" trg="a3" type="sup"/>
<edge id="c4" src="a4" trg="a3" type="reb"/>
<edge id="c5" src="a5" trg="c4" type="und"/>
</arggraph>'''

root = ET.fromstring(xml)

# Keep the `src` attribute of every <edge> whose type is not 'seg'.
interesting_edges_src = [
    e.attrib['src']
    for e in root.findall('.//edge')
    if e.attrib['type'] != 'seg'
]
print(interesting_edges_src)

借助 @Baldman 的回答，下面的内容可以看作某种答案，它在一定程度上已经接近最终结果：
# Parse every file ending in '.xml' found under `path`; `myEdgesList` starts
# out empty here.
myList = []
myEdgesList = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.xml'):
            with open(os.path.join(root, file), encoding="UTF-8") as content:
                tree = ET.parse(content)
                myList.append(tree)
这样会得到
['a1', 'a2', 'a4', 'a5']，以及所有其他示例对应的列表：

[['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a4', 'a5'], ['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3', 'a4', 'a5'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a1', 'a2', 'a3'], ['a2', 'a3', 'a4', 'a5'], . . .
只剩下将此列表转换为

(0,0,0,0,1) <----- ['a1', 'a2', 'a3', 'a4']   # as a5 is missing
(0,0,1,0,0) <------ ['a1', 'a2', 'a4', 'a5']   # as a3 is missing
. . .
(0,0,1) <------- ['a2', 'a3']   # as a1 is missing

# Step 1: for every parsed tree in `myList`, collect the `src` attribute of
# each <edge> whose type is not 'seg'; one inner list per tree.
for k in myList:
    Edge = [
        e.attrib['src']
        for e in k.findall('.//edge')
        if e.attrib['type'] != 'seg'
    ]
    myEdgesList.append(Edge)

# Sample of `myEdgesList` as reported for the asker's data:
#   [['a1', 'a2', 'a3', 'a4'], ['a1', 'a2', 'a4', 'a5'], ...,
#    ['a2', 'a3', 'a4'], ..., ['a2', 'a3'], ..., ['a1', 'a2', 'a3'], ...]

# Step 2: turn the ids 'a1'..'a6' into the numbers 1..6. Any other id is
# skipped, which is what the original if-chain did as well.
_ADU_NUMBER = {"a%d" % n: n for n in range(1, 7)}

myEdgtlistmap = []
for lst in myEdgesList:
    tp = [_ADU_NUMBER[el] for el in lst if el in _ADU_NUMBER]
    myEdgtlistmap.append(tp)

# Step 3: build one 0/1 vector per list: 0 at the position of every number
# that occurs, 1 everywhere else, e.g.
#   ['a1', 'a2', 'a3', 'a4'] -> [0, 0, 0, 0, 1]   (a5 is missing)
#   ['a1', 'a2', 'a4', 'a5'] -> [0, 0, 1, 0, 0]   (a3 is missing)
#   ['a2', 'a3']             -> [1, 0, 0]         (a1 is missing)
label = []
for le in myEdgtlistmap:
    # The vector has one slot more than there are sources. It is widened when
    # a larger number occurs (e.g. [2, 5]) so that b[v - 1] stays in range
    # instead of raising IndexError.
    size = max(len(le) + 1, max(le, default=0))
    b = [1] * size
    for v in le:
        b[v - 1] = 0
    label.append(b)

# Flatten all label vectors into one list.
y = [flag for vec in label for flag in vec]