Python中XML中的父节点插入不像我希望的那样工作
我有一个长XML文件,其结构如下:Python中XML中的父节点插入不像我希望的那样工作,python,xml,tags,lxml,elementtree,Python,Xml,Tags,Lxml,Elementtree,我有一个长XML文件,其结构如下: <?xml version="1.0" encoding="utf-8"?> <pages> <page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0"> <textbox id="0" bbox="179.739,592.028,261.007,604.510"> <textline bbox="17
<?xml version="1.0" encoding="utf-8"?>
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0">
<textbox id="0" bbox="179.739,592.028,261.007,604.510">
<textline bbox="179.739,592.028,261.007,604.510">
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">C</text>
<text font="NUMPTY+ImprintMTnum-it" bbox="192.745,592.218,199.339,603.578" ncolour="0" size="12.333">A</text>
<text font="NUMPTY+ImprintMTnum-it" bbox="193.745,592.218,199.339,603.578" ncolour="0" size="12.333">P</text>
<text font="NUMPTY+ImprintMTnum-it" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.333">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">T</text>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">O</text>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">L</text>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">O</text>
<text></text>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" ncolour="0" size="12.482">I</text>
<text></text>
</textline>
</textbox>
</page>
</pages>
但它并没有像预期的那样工作,我看不到换行符标记,如果缩短距离,它只会包装一个文本元素。我错过了什么
编辑:预期输出
<?xml version="1.0"?>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text><text font="NUMPTY+ImprintMTnum" bbox="199.227,592.218,205.657,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text><text font="NUMPTY+ImprintMTnum" bbox="205.545,592.218,211.975,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text><text font="NUMPTY+ImprintMTnum" bbox="211.023,592.218,218.617,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text><text font="NUMPTY+ImprintMTnum" bbox="218.515,592.218,226.109,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">R</text><text font="NUMPTY+ImprintMTnum" bbox="226.008,592.218,233.602,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text><text font="NUMPTY+ImprintMTnum" bbox="232.812,592.218,240.932,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">T</text>O
<text font="NUMPTY+ImprintMTnum" bbox="44.614,554.008,49.369,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="49.268,554.008,54.022,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>a
<text font="NUMPTY+ImprintMTnum" bbox="43.563,475.008,48.317,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="48.226,475.008,52.980,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>b
<text font="NUMPTY+ImprintMTnum" bbox="44.614,421.608,49.369,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="49.268,421.608,54.022,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>c
<text font="NUMPTY+ImprintMTnum" bbox="43.563,339.508,48.317,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="48.226,339.508,52.980,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>d
<text font="NUMPTY+ImprintMTnum" bbox="44.949,237.108,49.703,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="49.274,237.108,54.028,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">5</text>a
**<newline>**
<text font="PYNIYO+ImprintMTnum-Italic" bbox="68.031,553.639,76.375,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">T</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="76.231,553.639,79.479,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">i</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="79.334,553.639,83.161,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">t</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="83.017,553.639,88.112,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="87.968,553.639,91.216,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">l</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="91.071,553.639,96.167,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<whitespace/><text font="NUMPTY+ImprintMTnum" bbox="99.311,553.628,104.406,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="104.261,553.628,107.510,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">l</text>
<text font="NUMPTY+ImprintMTnum" bbox="107.365,553.628,110.269,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"> </text>
<text font="NUMPTY+ImprintMTnum" bbox="110.658,553.628,119.002,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">C</text>
<text font="NUMPTY+ImprintMTnum" bbox="118.857,553.628,123.953,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="123.808,553.628,130.183,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">u</text>
<text font="NUMPTY+ImprintMTnum" bbox="130.038,553.628,134.555,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">s</text>
<text font="NUMPTY+ImprintMTnum" bbox="134.410,553.628,137.659,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="137.514,553.628,143.889,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="143.744,553.628,146.993,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="146.848,553.628,151.943,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">c</text>
<text font="NUMPTY+ImprintMTnum" bbox="151.799,553.628,157.595,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="157.450,553.628,161.277,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">]</text>
<text font="NUMPTY+ImprintMTnum" bbox="161.132,553.628,164.036,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"> </text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="164.417,553.639,168.244,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="168.099,553.639,173.895,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">p</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="173.751,553.639,177.578,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="176.966,553.639,180.215,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">.</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="180.070,553.639,182.974,566.366" colourspace="DeviceGray" ncolour="0" size="12.727"> </text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="183.363,553.639,189.159,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">a</text>
<whitespace/><text font="NUMPTY+ImprintMTnum" bbox="192.314,553.628,201.937,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">D</text><text font="NUMPTY+ImprintMTnum" bbox="201.793,553.628,207.589,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text><text font="NUMPTY+ImprintMTnum" bbox="207.444,553.628,213.819,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">n</text><text font="NUMPTY+ImprintMTnum" bbox="213.674,553.628,216.578,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"> </text> <text font="NUMPTY+ImprintMTnum" bbox="216.967,553.628,225.311,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">R</text><text font="NUMPTY+ImprintMTnum" bbox="225.166,553.628,230.962,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text><text font="NUMPTY+ImprintMTnum" bbox="230.818,553.628,237.192,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text><text font="NUMPTY+ImprintMTnum" bbox="237.048,553.628,241.565,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">r</text><text font="NUMPTY+ImprintMTnum" bbox="241.420,553.628,244.668,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text><text font="NUMPTY+ImprintMTnum" bbox="244.524,553.628,250.320,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">g</text><text font="NUMPTY+ImprintMTnum" bbox="250.064,553.628,255.860,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text> <text>
</text>
**</newline>**
**<newline>**
<text font="QKWQNQ+ImprintMTnum-Bold" bbox="272.661,554.072,277.415,564.757" colourspace="DeviceGray" ncolour="0" size="10.685">1</text>
... continues
**</newline>**
公寓
24a
24b
24c
24d
25a
****
T
我
T
o
L
o
我
L
C
A.
U
s
我
D
我
C
o
]
s
P
s
.
A.
唐·罗德里戈
****
****
1.
... 继续
****
您可以尝试以下方法:
文本元素
- 对于所有项目,我们保留BBox中的当前值和以前的值。我们保留最后一个有效值。这意味着,如果给定元素缺少
标记,则将使用前一个标记bbox
- 然后我们计算距离(例如,
-|bb_current
)bb_previous
- 如果距离大于10:这意味着需要打开一个新的
标签。但首先,我们需要关闭上一个。因此,使用new\u行
()insert
- 如果
已经打开:我们使用新行
在其中添加当前标记,否则我们将它们作为原始标记append
新行
元素文本
元素或空白
元素将被移动。如果我们想保持它们的原点位置,它们需要在for循环中处理
完整代码:
import lxml.etree as etree
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse('data.xml', parser)
root = tree.getroot()
# Get the first BBox value as float
# Return null if not found
def getBBoxFirstValue(line):
if line is not None:
bb = line.attrib.get('bbox')
if bb is not None:
try:
return float(bb.split(",")[0])
except ValueError:
pass
return None
new_line = None
previous_bb = None
for x in tree.xpath('//text'):
# Get current bb value
bb = getBBoxFirstValue(x)
# Check current and past values aren't empty
if bb is not None and previous_bb is not None:
# If distance with preview bb > 10
if abs(bb - previous_bb) > 10:
# If new_line isn't empty: it's inserted into parent tag at position of current tag
if new_line is not None:
x.getparent().insert(x.getparent().index(x), new_line)
# A new "new_line" element is created
new_line = etree.Element("new_line")
# If the new line isn't not (e.g. one distance > 10 has been already found)
if new_line is not None:
new_line.append(x)
# Keep latest non empty BBox 1st value
if bb is not None:
previous_bb = bb
# Add last new_line element if not null
if new_line is not None:
tree.xpath('//text')[-1].getparent().append(new_line)
newtree = etree.tostring(root, encoding='utf-8', pretty_print=True)
newtree = newtree.decode("utf-8")
input.xml
A.
P
P
A.
R
A.
T
O
2.
4.
A.
2.
4.
B
2.
4.
C
2.
4.
D
2.
5.
A.
T
我
T
o
L
o
我
L
C
A.
U
s
我
D
我
C
o
]
s
P
s
.
A.
D
o
N
R
o
D
R
我
G
o
1.
输出
A.
P
P
A.
R
A.
T
O
2.
4.
A.
2.
4.
B
2.
4.
C
2.
4.
D
2.
5.
A.
T
我
T
o
L
o
我
L
C
A.
U
s
我
D
我
C
o
]
s
P
s
.
A.
D
o
N
R
o
D
R
我
G
o
1.
请问,您能分享预期的输出吗?是的,我对我的问题进行了编辑。我希望现在清楚了。是否需要保留
和空文本
元素?它们需要保留,但如果可以替换,也可以。谢谢!!我得到这个错误:文件“C:/Users/annas/PycharmProjects/TESI/oooooo.py”,第36行,在parent.insert(parent.index(x),new_行)文件“src\lxml\etree.pyx”,第1216行,在lxml.etree中。\u Element.index value错误:Element不是这个节点的子节点;你知道为什么吗?这个问题是发生在我提供的示例还是另一个xml
文件中?如果是,您能在问题中更新它吗?我正在努力更好地理解它,基本上问题似乎是因为我的XML更长(它重复页面和页面中的所有内容),不知何故,代码不起作用,但与短输入一起工作。在我使用的完整XML中添加链接是否有用?…
元素是否都属于同一父元素?是的,当然!
<?xml version="1.0"?>
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text><text font="NUMPTY+ImprintMTnum" bbox="199.227,592.218,205.657,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text><text font="NUMPTY+ImprintMTnum" bbox="205.545,592.218,211.975,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text><text font="NUMPTY+ImprintMTnum" bbox="211.023,592.218,218.617,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text><text font="NUMPTY+ImprintMTnum" bbox="218.515,592.218,226.109,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">R</text><text font="NUMPTY+ImprintMTnum" bbox="226.008,592.218,233.602,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text><text font="NUMPTY+ImprintMTnum" bbox="232.812,592.218,240.932,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">T</text>O
<text font="NUMPTY+ImprintMTnum" bbox="44.614,554.008,49.369,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="49.268,554.008,54.022,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>a
<text font="NUMPTY+ImprintMTnum" bbox="43.563,475.008,48.317,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="48.226,475.008,52.980,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>b
<text font="NUMPTY+ImprintMTnum" bbox="44.614,421.608,49.369,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="49.268,421.608,54.022,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>c
<text font="NUMPTY+ImprintMTnum" bbox="43.563,339.508,48.317,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="48.226,339.508,52.980,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>d
<text font="NUMPTY+ImprintMTnum" bbox="44.949,237.108,49.703,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">2</text><text font="NUMPTY+ImprintMTnum" bbox="49.274,237.108,54.028,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">5</text>a
**<newline>**
<text font="PYNIYO+ImprintMTnum-Italic" bbox="68.031,553.639,76.375,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">T</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="76.231,553.639,79.479,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">i</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="79.334,553.639,83.161,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">t</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="83.017,553.639,88.112,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="87.968,553.639,91.216,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">l</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="91.071,553.639,96.167,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<whitespace/><text font="NUMPTY+ImprintMTnum" bbox="99.311,553.628,104.406,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="104.261,553.628,107.510,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">l</text>
<text font="NUMPTY+ImprintMTnum" bbox="107.365,553.628,110.269,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"> </text>
<text font="NUMPTY+ImprintMTnum" bbox="110.658,553.628,119.002,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">C</text>
<text font="NUMPTY+ImprintMTnum" bbox="118.857,553.628,123.953,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="123.808,553.628,130.183,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">u</text>
<text font="NUMPTY+ImprintMTnum" bbox="130.038,553.628,134.555,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">s</text>
<text font="NUMPTY+ImprintMTnum" bbox="134.410,553.628,137.659,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="137.514,553.628,143.889,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="143.744,553.628,146.993,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="146.848,553.628,151.943,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">c</text>
<text font="NUMPTY+ImprintMTnum" bbox="151.799,553.628,157.595,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="157.450,553.628,161.277,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">]</text>
<text font="NUMPTY+ImprintMTnum" bbox="161.132,553.628,164.036,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"> </text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="164.417,553.639,168.244,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="168.099,553.639,173.895,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">p</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="173.751,553.639,177.578,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="176.966,553.639,180.215,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">.</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="180.070,553.639,182.974,566.366" colourspace="DeviceGray" ncolour="0" size="12.727"> </text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="183.363,553.639,189.159,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">a</text>
<whitespace/><text font="NUMPTY+ImprintMTnum" bbox="192.314,553.628,201.937,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">D</text><text font="NUMPTY+ImprintMTnum" bbox="201.793,553.628,207.589,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text><text font="NUMPTY+ImprintMTnum" bbox="207.444,553.628,213.819,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">n</text><text font="NUMPTY+ImprintMTnum" bbox="213.674,553.628,216.578,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"> </text> <text font="NUMPTY+ImprintMTnum" bbox="216.967,553.628,225.311,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">R</text><text font="NUMPTY+ImprintMTnum" bbox="225.166,553.628,230.962,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text><text font="NUMPTY+ImprintMTnum" bbox="230.818,553.628,237.192,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text><text font="NUMPTY+ImprintMTnum" bbox="237.048,553.628,241.565,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">r</text><text font="NUMPTY+ImprintMTnum" bbox="241.420,553.628,244.668,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text><text font="NUMPTY+ImprintMTnum" bbox="244.524,553.628,250.320,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">g</text><text font="NUMPTY+ImprintMTnum" bbox="250.064,553.628,255.860,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text> <text>
</text>
**</newline>**
**<newline>**
<text font="QKWQNQ+ImprintMTnum-Bold" bbox="272.661,554.072,277.415,564.757" colourspace="DeviceGray" ncolour="0" size="10.685">1</text>
... continues
**</newline>**
import lxml.etree as etree
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse('data.xml', parser)
root = tree.getroot()
# Get the first BBox value as float
# Return null if not found
def getBBoxFirstValue(line):
if line is not None:
bb = line.attrib.get('bbox')
if bb is not None:
try:
return float(bb.split(",")[0])
except ValueError:
pass
return None
new_line = None
previous_bb = None
for x in tree.xpath('//text'):
# Get current bb value
bb = getBBoxFirstValue(x)
# Check current and past values aren't empty
if bb is not None and previous_bb is not None:
# If distance with preview bb > 10
if abs(bb - previous_bb) > 10:
# If new_line isn't empty: it's inserted into parent tag at position of current tag
if new_line is not None:
x.getparent().insert(x.getparent().index(x), new_line)
# A new "new_line" element is created
new_line = etree.Element("new_line")
# If the new line isn't not (e.g. one distance > 10 has been already found)
if new_line is not None:
new_line.append(x)
# Keep latest non empty BBox 1st value
if bb is not None:
previous_bb = bb
# Add last new_line element if not null
if new_line is not None:
tree.xpath('//text')[-1].getparent().append(new_line)
newtree = etree.tostring(root, encoding='utf-8', pretty_print=True)
newtree = newtree.decode("utf-8")
<?xml version="1.0" encoding="utf-8"?>
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0">
<textbox id="0" bbox="179.739,592.028,261.007,604.510">
<textline bbox="179.739,592.028,261.007,604.510">
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="199.227,592.218,205.657,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="205.545,592.218,211.975,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="211.023,592.218,218.617,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="218.515,592.218,226.109,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="226.008,592.218,233.602,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="232.812,592.218,240.932,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">T</text>
O
<text font="NUMPTY+ImprintMTnum" bbox="44.614,554.008,49.369,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,554.008,54.022,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
a
<text font="NUMPTY+ImprintMTnum" bbox="43.563,475.008,48.317,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,475.008,52.980,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
b
<text font="NUMPTY+ImprintMTnum" bbox="44.614,421.608,49.369,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,421.608,54.022,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
c
<text font="NUMPTY+ImprintMTnum" bbox="43.563,339.508,48.317,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,339.508,52.980,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
d
<text font="NUMPTY+ImprintMTnum" bbox="44.949,237.108,49.703,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.274,237.108,54.028,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">5</text>
a
<text font="PYNIYO+ImprintMTnum-Italic" bbox="68.031,553.639,76.375,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">T</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="76.231,553.639,79.479,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">i</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="79.334,553.639,83.161,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">t</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="83.017,553.639,88.112,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="87.968,553.639,91.216,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">l</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="91.071,553.639,96.167,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="99.311,553.628,104.406,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="104.261,553.628,107.510,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">l</text>
<text font="NUMPTY+ImprintMTnum" bbox="107.365,553.628,110.269,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"></text>
<text font="NUMPTY+ImprintMTnum" bbox="110.658,553.628,119.002,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">C</text>
<text font="NUMPTY+ImprintMTnum" bbox="118.857,553.628,123.953,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="123.808,553.628,130.183,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">u</text>
<text font="NUMPTY+ImprintMTnum" bbox="130.038,553.628,134.555,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">s</text>
<text font="NUMPTY+ImprintMTnum" bbox="134.410,553.628,137.659,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="137.514,553.628,143.889,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="143.744,553.628,146.993,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="146.848,553.628,151.943,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">c</text>
<text font="NUMPTY+ImprintMTnum" bbox="151.799,553.628,157.595,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="157.450,553.628,161.277,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">]</text>
<text font="NUMPTY+ImprintMTnum" bbox="161.132,553.628,164.036,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"></text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="164.417,553.639,168.244,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="168.099,553.639,173.895,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">p</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="173.751,553.639,177.578,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="176.966,553.639,180.215,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">.</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="180.070,553.639,182.974,566.366" colourspace="DeviceGray" ncolour="0" size="12.727"></text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="183.363,553.639,189.159,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">a</text>
<whitespace/>
<text font="NUMPTY+ImprintMTnum" bbox="192.314,553.628,201.937,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">D</text>
<text font="NUMPTY+ImprintMTnum" bbox="201.793,553.628,207.589,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="207.444,553.628,213.819,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">n</text>
<text font="NUMPTY+ImprintMTnum" bbox="213.674,553.628,216.578,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"></text>
<text font="NUMPTY+ImprintMTnum" bbox="216.967,553.628,225.311,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="225.166,553.628,230.962,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="230.818,553.628,237.192,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="237.048,553.628,241.565,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">r</text>
<text font="NUMPTY+ImprintMTnum" bbox="241.420,553.628,244.668,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="244.524,553.628,250.320,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">g</text>
<text font="NUMPTY+ImprintMTnum" bbox="250.064,553.628,255.860,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text></text>
<text font="QKWQNQ+ImprintMTnum-Bold" bbox="272.661,554.072,277.415,564.757" colourspace="DeviceGray" ncolour="0" size="10.685">1</text>
</textline>
</textbox>
</page>
</pages>
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0">
<textbox id="0" bbox="179.739,592.028,261.007,604.510">
<textline bbox="179.739,592.028,261.007,604.510">
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="199.227,592.218,205.657,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="205.545,592.218,211.975,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="211.023,592.218,218.617,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="218.515,592.218,226.109,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="226.008,592.218,233.602,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="232.812,592.218,240.932,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">T</text>
O
<new_line>
<text font="NUMPTY+ImprintMTnum" bbox="44.614,554.008,49.369,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,554.008,54.022,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
a
<text font="NUMPTY+ImprintMTnum" bbox="43.563,475.008,48.317,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,475.008,52.980,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
b
<text font="NUMPTY+ImprintMTnum" bbox="44.614,421.608,49.369,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,421.608,54.022,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
c
<text font="NUMPTY+ImprintMTnum" bbox="43.563,339.508,48.317,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,339.508,52.980,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
d
<text font="NUMPTY+ImprintMTnum" bbox="44.949,237.108,49.703,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.274,237.108,54.028,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">5</text>
a
</new_line>
<whitespace/>
<text/>
<new_line>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="68.031,553.639,76.375,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">T</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="76.231,553.639,79.479,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">i</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="79.334,553.639,83.161,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">t</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="83.017,553.639,88.112,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="87.968,553.639,91.216,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">l</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="91.071,553.639,96.167,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="99.311,553.628,104.406,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="104.261,553.628,107.510,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">l</text>
<text font="NUMPTY+ImprintMTnum" bbox="107.365,553.628,110.269,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"/>
<text font="NUMPTY+ImprintMTnum" bbox="110.658,553.628,119.002,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">C</text>
<text font="NUMPTY+ImprintMTnum" bbox="118.857,553.628,123.953,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="123.808,553.628,130.183,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">u</text>
<text font="NUMPTY+ImprintMTnum" bbox="130.038,553.628,134.555,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">s</text>
<text font="NUMPTY+ImprintMTnum" bbox="134.410,553.628,137.659,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="137.514,553.628,143.889,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="143.744,553.628,146.993,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="146.848,553.628,151.943,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">c</text>
<text font="NUMPTY+ImprintMTnum" bbox="151.799,553.628,157.595,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="157.450,553.628,161.277,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">]</text>
<text font="NUMPTY+ImprintMTnum" bbox="161.132,553.628,164.036,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"/>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="164.417,553.639,168.244,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="168.099,553.639,173.895,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">p</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="173.751,553.639,177.578,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="176.966,553.639,180.215,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">.</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="180.070,553.639,182.974,566.366" colourspace="DeviceGray" ncolour="0" size="12.727"/>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="183.363,553.639,189.159,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="192.314,553.628,201.937,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">D</text>
<text font="NUMPTY+ImprintMTnum" bbox="201.793,553.628,207.589,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="207.444,553.628,213.819,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">n</text>
<text font="NUMPTY+ImprintMTnum" bbox="213.674,553.628,216.578,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"/>
<text font="NUMPTY+ImprintMTnum" bbox="216.967,553.628,225.311,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="225.166,553.628,230.962,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="230.818,553.628,237.192,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="237.048,553.628,241.565,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">r</text>
<text font="NUMPTY+ImprintMTnum" bbox="241.420,553.628,244.668,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="244.524,553.628,250.320,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">g</text>
<text font="NUMPTY+ImprintMTnum" bbox="250.064,553.628,255.860,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
</new_line>
<new_line>
<text font="QKWQNQ+ImprintMTnum-Bold" bbox="272.661,554.072,277.415,564.757" colourspace="DeviceGray" ncolour="0" size="10.685">1</text>
</new_line>
</textline>
</textbox>
</page>
</pages>