Вы можете попробовать следующий подход:
Найти элементы `text` с помощью XPath.
Итерация по всем элементам:
Для каждого элемента мы храним текущее и предыдущее значение из `bbox` (его первое число). Мы сохраняем последнее действительное значение: если у данного элемента отсутствует атрибут `bbox`, будет использовано предыдущее значение.
Затем мы вычисляем расстояние (например, `|bb_current - bb_previous|`).
Если расстояние больше 10, это означает, что необходимо открыть новый тег `new_line`. Но сначала нужно закрыть предыдущий: текущая строка (`new_line`) вставляется в родительский тег с помощью `insert` (см. связанную тему).
Если `new_line` уже открыт, мы добавляем текущие теги внутрь него с помощью `append`; иначе оставляем их на исходном месте.
Наконец, мы добавляем последний элемент `new_line` уже вне цикла.
Примечание: другие теги, такие как пустые элементы `text` или элементы `whitespace`, смещаются. Их нужно обрабатывать внутри цикла, если мы хотим сохранить их исходную позицию.
Полный код:
import lxml.etree as etree
# Build the parser used to load the input document.
# NOTE(review): per the lxml docs, remove_blank_text=True discards
# whitespace-only text between tags; presumably it is set so that the
# pretty_print serialization done later can re-indent the tree — confirm.
parser = etree.XMLParser(remove_blank_text=True)
# Load the document from the file 'data.xml' with that parser.
tree = etree.parse('data.xml', parser)
# Root element of the parsed document; this is what gets serialized at the end.
root = tree.getroot()
def getBBoxFirstValue(line):
    """Return the first field of an element's ``bbox`` attribute as a float.

    The attribute is read as a comma-separated string such as
    ``"191.745,592.218,199.339,603.578"``; only the first field is parsed.

    Args:
        line: An element exposing an ``attrib`` mapping (for example an lxml
            element), or ``None``.

    Returns:
        The first field converted to ``float``, or ``None`` when ``line`` is
        ``None``, the element has no ``bbox`` attribute, or the first field
        is not a valid number.
    """
    if line is None:
        return None
    bbox = line.attrib.get('bbox')
    if bbox is None:
        return None
    try:
        # str.split always yields at least one field, so the [0] lookup
        # cannot fail; only the float conversion can raise.
        return float(bbox.split(",")[0])
    except ValueError:
        # Deliberate best effort: a malformed bbox is treated as missing.
        return None
# Difference between the first bbox values of two consecutive <text>
# elements above which a new <new_line> group is started.
LINE_BREAK_DISTANCE = 10

new_line = None      # group currently being filled; None until the first jump
previous_bb = None   # most recent usable first-bbox value
last_parent = None   # original parent of the element most recently moved

# The node list is built once up front, so moving elements while looping
# does not disturb the iteration.
for x in tree.xpath('//text'):
    # First bbox value of the current element (None if missing/invalid).
    bb = getBBoxFirstValue(x)
    # Elements without a usable bbox are neither compared nor moved; they
    # stay where they are (this matches the sample output, where the bare
    # <text/> remains outside the <new_line> groups).
    if bb is not None and previous_bb is not None:
        # A jump larger than the threshold starts a new group.
        if abs(bb - previous_bb) > LINE_BREAK_DISTANCE:
            # Close the previous group first by inserting it into the
            # current element's parent, at the current element's position.
            if new_line is not None:
                parent = x.getparent()
                parent.insert(parent.index(x), new_line)
            new_line = etree.Element("new_line")
        # Once a first jump has been seen, every element goes into the
        # open group.
        if new_line is not None:
            # Remember where the element came from before moving it: the
            # final group must be attached to this original parent.
            last_parent = x.getparent()
            new_line.append(x)
    # Keep the latest non-empty first bbox value for the next comparison.
    if bb is not None:
        previous_bb = bb

# Attach the last, still-detached group. Looking up the last <text> left in
# the tree (as was done before) can return an element that already sits
# inside a previously inserted <new_line>, which would nest the final group
# inside that one; the remembered original parent avoids this.
if new_line is not None:
    last_parent.append(new_line)

# Serialize the modified tree to a UTF-8 string.
newtree = etree.tostring(root, encoding='utf-8', pretty_print=True)
newtree = newtree.decode("utf-8")
Входной XML (файл data.xml):
<?xml version="1.0" encoding="utf-8"?>
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0">
<textbox id="0" bbox="179.739,592.028,261.007,604.510">
<textline bbox="179.739,592.028,261.007,604.510">
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="199.227,592.218,205.657,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="205.545,592.218,211.975,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="211.023,592.218,218.617,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="218.515,592.218,226.109,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="226.008,592.218,233.602,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="232.812,592.218,240.932,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">T</text>
O
<text font="NUMPTY+ImprintMTnum" bbox="44.614,554.008,49.369,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,554.008,54.022,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
a
<text font="NUMPTY+ImprintMTnum" bbox="43.563,475.008,48.317,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,475.008,52.980,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
b
<text font="NUMPTY+ImprintMTnum" bbox="44.614,421.608,49.369,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,421.608,54.022,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
c
<text font="NUMPTY+ImprintMTnum" bbox="43.563,339.508,48.317,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,339.508,52.980,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
d
<text font="NUMPTY+ImprintMTnum" bbox="44.949,237.108,49.703,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.274,237.108,54.028,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">5</text>
a
<text font="PYNIYO+ImprintMTnum-Italic" bbox="68.031,553.639,76.375,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">T</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="76.231,553.639,79.479,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">i</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="79.334,553.639,83.161,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">t</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="83.017,553.639,88.112,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="87.968,553.639,91.216,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">l</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="91.071,553.639,96.167,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="99.311,553.628,104.406,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="104.261,553.628,107.510,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">l</text>
<text font="NUMPTY+ImprintMTnum" bbox="107.365,553.628,110.269,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"></text>
<text font="NUMPTY+ImprintMTnum" bbox="110.658,553.628,119.002,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">C</text>
<text font="NUMPTY+ImprintMTnum" bbox="118.857,553.628,123.953,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="123.808,553.628,130.183,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">u</text>
<text font="NUMPTY+ImprintMTnum" bbox="130.038,553.628,134.555,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">s</text>
<text font="NUMPTY+ImprintMTnum" bbox="134.410,553.628,137.659,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="137.514,553.628,143.889,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="143.744,553.628,146.993,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="146.848,553.628,151.943,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">c</text>
<text font="NUMPTY+ImprintMTnum" bbox="151.799,553.628,157.595,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="157.450,553.628,161.277,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">]</text>
<text font="NUMPTY+ImprintMTnum" bbox="161.132,553.628,164.036,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"></text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="164.417,553.639,168.244,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="168.099,553.639,173.895,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">p</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="173.751,553.639,177.578,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="176.966,553.639,180.215,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">.</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="180.070,553.639,182.974,566.366" colourspace="DeviceGray" ncolour="0" size="12.727"></text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="183.363,553.639,189.159,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">a</text>
<whitespace/>
<text font="NUMPTY+ImprintMTnum" bbox="192.314,553.628,201.937,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">D</text>
<text font="NUMPTY+ImprintMTnum" bbox="201.793,553.628,207.589,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="207.444,553.628,213.819,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">n</text>
<text font="NUMPTY+ImprintMTnum" bbox="213.674,553.628,216.578,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"></text>
<text font="NUMPTY+ImprintMTnum" bbox="216.967,553.628,225.311,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="225.166,553.628,230.962,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="230.818,553.628,237.192,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="237.048,553.628,241.565,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">r</text>
<text font="NUMPTY+ImprintMTnum" bbox="241.420,553.628,244.668,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="244.524,553.628,250.320,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">g</text>
<text font="NUMPTY+ImprintMTnum" bbox="250.064,553.628,255.860,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text></text>
<text font="QKWQNQ+ImprintMTnum-Bold" bbox="272.661,554.072,277.415,564.757" colourspace="DeviceGray" ncolour="0" size="10.685">1</text>
</textline>
</textbox>
</page>
</pages>
Выходной XML:
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0">
<textbox id="0" bbox="179.739,592.028,261.007,604.510">
<textline bbox="179.739,592.028,261.007,604.510">
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="199.227,592.218,205.657,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="205.545,592.218,211.975,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="211.023,592.218,218.617,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="218.515,592.218,226.109,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="226.008,592.218,233.602,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="232.812,592.218,240.932,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">T</text>
O
<new_line>
<text font="NUMPTY+ImprintMTnum" bbox="44.614,554.008,49.369,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,554.008,54.022,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
a
<text font="NUMPTY+ImprintMTnum" bbox="43.563,475.008,48.317,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,475.008,52.980,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
b
<text font="NUMPTY+ImprintMTnum" bbox="44.614,421.608,49.369,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,421.608,54.022,431.846" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
c
<text font="NUMPTY+ImprintMTnum" bbox="43.563,339.508,48.317,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,339.508,52.980,349.746" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
d
<text font="NUMPTY+ImprintMTnum" bbox="44.949,237.108,49.703,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.274,237.108,54.028,247.347" colourspace="DeviceGray" ncolour="0" size="10.238">5</text>
a
</new_line>
<whitespace/>
<text/>
<new_line>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="68.031,553.639,76.375,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">T</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="76.231,553.639,79.479,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">i</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="79.334,553.639,83.161,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">t</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="83.017,553.639,88.112,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="87.968,553.639,91.216,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">l</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="91.071,553.639,96.167,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="99.311,553.628,104.406,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">I</text>
<text font="NUMPTY+ImprintMTnum" bbox="104.261,553.628,107.510,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">l</text>
<text font="NUMPTY+ImprintMTnum" bbox="107.365,553.628,110.269,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"/>
<text font="NUMPTY+ImprintMTnum" bbox="110.658,553.628,119.002,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">C</text>
<text font="NUMPTY+ImprintMTnum" bbox="118.857,553.628,123.953,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="123.808,553.628,130.183,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">u</text>
<text font="NUMPTY+ImprintMTnum" bbox="130.038,553.628,134.555,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">s</text>
<text font="NUMPTY+ImprintMTnum" bbox="134.410,553.628,137.659,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="137.514,553.628,143.889,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="143.744,553.628,146.993,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="146.848,553.628,151.943,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">c</text>
<text font="NUMPTY+ImprintMTnum" bbox="151.799,553.628,157.595,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="157.450,553.628,161.277,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">]</text>
<text font="NUMPTY+ImprintMTnum" bbox="161.132,553.628,164.036,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"/>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="164.417,553.639,168.244,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="168.099,553.639,173.895,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">p</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="173.751,553.639,177.578,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">s</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="176.966,553.639,180.215,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">.</text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="180.070,553.639,182.974,566.366" colourspace="DeviceGray" ncolour="0" size="12.727"/>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="183.363,553.639,189.159,566.366" colourspace="DeviceGray" ncolour="0" size="12.727">a</text>
<text font="NUMPTY+ImprintMTnum" bbox="192.314,553.628,201.937,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">D</text>
<text font="NUMPTY+ImprintMTnum" bbox="201.793,553.628,207.589,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="207.444,553.628,213.819,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">n</text>
<text font="NUMPTY+ImprintMTnum" bbox="213.674,553.628,216.578,566.110" colourspace="DeviceGray" ncolour="0" size="12.482"/>
<text font="NUMPTY+ImprintMTnum" bbox="216.967,553.628,225.311,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="225.166,553.628,230.962,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
<text font="NUMPTY+ImprintMTnum" bbox="230.818,553.628,237.192,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">d</text>
<text font="NUMPTY+ImprintMTnum" bbox="237.048,553.628,241.565,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">r</text>
<text font="NUMPTY+ImprintMTnum" bbox="241.420,553.628,244.668,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">i</text>
<text font="NUMPTY+ImprintMTnum" bbox="244.524,553.628,250.320,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">g</text>
<text font="NUMPTY+ImprintMTnum" bbox="250.064,553.628,255.860,566.110" colourspace="DeviceGray" ncolour="0" size="12.482">o</text>
</new_line>
<new_line>
<text font="QKWQNQ+ImprintMTnum-Bold" bbox="272.661,554.072,277.415,564.757" colourspace="DeviceGray" ncolour="0" size="10.685">1</text>
</new_line>
</textline>
</textbox>
</page>
</pages>