Python 从XML文件中提取信息并为其分配一个向量
我想在我的计算机上用 Python 解析一些 XML 文件,并从中提取一些信息。(标签:python、xml、nlp)以下是其中一个 XML 文件(如果需要,其文本见下文)。作为第一步,我已经完成了如下代码:
import os
import xml.etree.ElementTree as ET

# Collect one parsed ElementTree for every ".xml" file found anywhere under
# `path`.
# NOTE: `path` (the directory to scan) must be defined before this snippet runs.
myList = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.xml'):
            # Open the file explicitly as UTF-8 text and let ElementTree
            # parse it from the open handle.
            with open(os.path.join(root, file), encoding="UTF-8") as content:
                tree = ET.parse(content)
                myList.append(tree)
现在 myList 中保存了若干个已解析的 XML 文件。
现在,我只关注那些 type 不等于 "seg" 的 edge(边)元素。
然后我想标记出没有出现在 src 属性中的那个编号,因为对应的句子就是所谓的 premise。例如在这里,"a3" 就是所谓的 premise(因为它没有出现在这些边的 src 中)。
比如这里
(0,0,1,0,0) 应该是处理后的结果:因为 a3 没有出现,所以我把第三个元素设为 1,其余元素设为 0。
总的来说,我想根据 XML 中的标注方式提取信息,以便给我的文本打标签。下面是数据提取部分:
import xml.etree.ElementTree as ET
# Sample argument graph kept inline: five text segments (edu), five
# argumentative units (adu) and the edges that connect them.
xml = '''<?xml version='1.0' encoding='UTF-8'?>
<arggraph id="micro_b002" topic_id="higher_dog_poo_fines" stance="pro">
<edu id="e1"><![CDATA[One can hardly move in Friedrichshain or Neukölln these days without permanently scanning the ground for dog dirt.]]></edu>
<edu id="e2"><![CDATA[And when bad luck does strike and you step into one of the many 'land mines' you have to painstakingly scrape the remains off your soles.]]></edu>
<edu id="e3"><![CDATA[Higher fines are therefore the right measure against negligent, lazy or simply thoughtless dog owners.]]></edu>
<edu id="e4"><![CDATA[Of course, first they'd actually need to be caught in the act by public order officers,]]></edu>
<edu id="e5"><![CDATA[but once they have to dig into their pockets, their laziness will sure vanish!]]></edu>
<adu id="a1" type="pro"/>
<adu id="a2" type="pro"/>
<adu id="a3" type="pro"/>
<adu id="a4" type="opp"/>
<adu id="a5" type="pro"/>
<edge id="c6" src="e1" trg="a1" type="seg"/>
<edge id="c7" src="e2" trg="a2" type="seg"/>
<edge id="c8" src="e3" trg="a3" type="seg"/>
<edge id="c9" src="e4" trg="a4" type="seg"/>
<edge id="c10" src="e5" trg="a5" type="seg"/>
<edge id="c1" src="a1" trg="a3" type="sup"/>
<edge id="c2" src="a2" trg="a3" type="sup"/>
<edge id="c4" src="a4" trg="a3" type="reb"/>
<edge id="c5" src="a5" trg="c4" type="und"/>
</arggraph>'''

# Build the element tree straight from the in-memory string.
root = ET.fromstring(xml)

# Walk every <edge> element and keep the `src` attribute of those whose
# `type` is anything other than "seg".
interesting_edges_src = []
for edge in root.findall('.//edge'):
    if edge.attrib['type'] == 'seg':
        continue
    interesting_edges_src.append(edge.attrib['src'])

print(interesting_edges_src)
借助 @Baldman 的回答,下面的代码可以算作一种解答,它已经在一定程度上接近最终结果:
import os
import xml.etree.ElementTree as ET

# myList receives one parsed ElementTree per ".xml" file found under `path`;
# myEdgesList is created empty here so that later code can append to it.
# NOTE: `path` (the directory to scan) must be defined before this snippet runs.
myList = []
myEdgesList = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.xml'):
            # Open the file explicitly as UTF-8 text and let ElementTree
            # parse it from the open handle.
            with open(os.path.join(root, file), encoding="UTF-8") as content:
                tree = ET.parse(content)
                myList.append(tree)
输出为:
['a1', 'a2', 'a4', 'a5'],以及所有其他示例对应的列表:
[['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a2', 'a3', 'a4', 'a5'],
.
.
.
只剩下将此列表转换为
(0,0,0,0,1) <----- ['a1', 'a2', 'a3', 'a4']
#as a5 is missing
(0,0,1,0,0) <------ ['a1', 'a2', 'a4', 'a5']
#as a3 is missing
.
.
.
(1,0,0) <------- ['a2', 'a3']
#as a1 is missing
(0,0,0,0,1)下一个问题
# Turn each list of unit ids (e.g. ['a1', 'a2', 'a4']) into the list of their
# numeric suffixes ([1, 2, 4]); one output list per input list.
# NOTE: `myEdgesList` must already hold one list of id strings per document.
myEdgtlistmap = []
for lst in myEdgesList:
    tp = []
    for el in lst:
        # Read the number after the "a" prefix instead of enumerating
        # "a1".."a6" by hand, so ids above a6 are no longer dropped silently.
        # Anything that is not "a<positive integer>" is skipped.
        suffix = el[1:]
        if el.startswith("a") and suffix.isdecimal() and int(suffix) > 0:
            tp.append(int(suffix))
    myEdgtlistmap.append(tp)
# For every parsed document in myList, collect the `src` attribute of each
# <edge> whose `type` is not "seg", and append that list to myEdgesList
# (one list per document).
# NOTE: `myList` and `myEdgesList` must already exist when this runs.
for tree in myList:
    edge_sources = [
        e.attrib['src']
        for e in tree.findall('.//edge')
        if e.attrib['type'] != 'seg'
    ]
    myEdgesList.append(edge_sources)
[['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a4', 'a5'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a1', 'a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3', 'a4', 'a5'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a1', 'a2', 'a3'],
['a2', 'a3', 'a4', 'a5'],
.
.
.
(0,0,0,0,1) <----- ['a1', 'a2', 'a3', 'a4']
#as a5 is missing
(0,0,1,0,0) <------ ['a1', 'a2', 'a4', 'a5']
#as a3 is missing
.
.
.
(1,0,0) <------- ['a2', 'a3']
#as a1 is missing
# Turn each list of unit ids (e.g. ['a1', 'a2', 'a4']) into the list of their
# numeric suffixes ([1, 2, 4]); one output list per input list.
# NOTE: `myEdgesList` must already hold one list of id strings per document.
myEdgtlistmap = []
for lst in myEdgesList:
    tp = []
    for el in lst:
        # Read the number after the "a" prefix instead of enumerating
        # "a1".."a6" by hand, so ids above a6 are no longer dropped silently.
        # Anything that is not "a<positive integer>" is skipped.
        suffix = el[1:]
        if el.startswith("a") and suffix.isdecimal() and int(suffix) > 0:
            tp.append(int(suffix))
    myEdgtlistmap.append(tp)
# Build one 0/1 vector per document: the vector has len(numbers) + 1 slots,
# every slot whose 1-based position occurs in `numbers` is set to 0, and the
# remaining slot(s) stay 1.  E.g. [1, 2, 4, 5] -> [0, 0, 1, 0, 0].
# NOTE: `myEdgtlistmap` must already hold one list of ints per document.
# NOTE(review): assumes the numbers are distinct and lie in
# 1..len(numbers)+1; a larger number raises IndexError here — confirm inputs.
label = []
for numbers in myEdgtlistmap:
    vector = [1] * (len(numbers) + 1)
    for number in numbers:
        vector[number - 1] = 0
    label.append(vector)

# Flatten all per-document vectors into a single list of 0/1 values.
y = [value for vector in label for value in vector]