Как обернуть элементы в указанный родительский тег XML в Python? - PullRequest
0 голосов
/ 16 апреля 2020

У меня есть XML:

<?xml version="1.0" encoding="utf-8" ?>
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0">
<textbox id="0" bbox="191.745,592.218,249.042,603.578">
<textline bbox="191.745,592.218,249.042,603.578">
<text font="NUMPTY+ImprintMTnum" bbox="191.745,592.218,199.339,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="199.227,592.218,205.657,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="205.545,592.218,211.975,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">P</text>
<text font="NUMPTY+ImprintMTnum" bbox="211.023,592.218,218.617,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="218.515,592.218,226.109,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">R</text>
<text font="NUMPTY+ImprintMTnum" bbox="226.008,592.218,233.602,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">A</text>
<text font="NUMPTY+ImprintMTnum" bbox="232.812,592.218,240.932,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">T</text>
<text font="NUMPTY+ImprintMTnum" bbox="240.922,592.218,249.042,603.578" colourspace="DeviceGray" ncolour="0" size="11.360">O</text>
</textline>
</textbox>
<textbox id="1" bbox="44.614,554.008,58.101,564.246">
<textline bbox="44.614,554.008,58.101,564.246">
<text font="NUMPTY+ImprintMTnum" bbox="44.614,554.008,49.369,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="49.268,554.008,54.022,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
<text font="NUMPTY+ImprintMTnum" bbox="53.922,554.008,58.101,564.246" colourspace="DeviceGray" ncolour="0" size="10.238">a</text>
</textline>
</textbox>
<textbox id="2" bbox="43.563,475.008,58.117,485.246">
<textline bbox="43.563,475.008,58.117,485.246">
<text font="NUMPTY+ImprintMTnum" bbox="43.563,475.008,48.317,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="48.226,475.008,52.980,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">4</text>
<text font="NUMPTY+ImprintMTnum" bbox="52.889,475.008,58.117,485.246" colourspace="DeviceGray" ncolour="0" size="10.238">b</text>
</textline>
</textbox>
</page>
</pages>

Настоящий файл длиннее, но имеет ту же структуру.

Я хочу вставлять родительский тег <newline> каждый раз, когда расстояние между первым числом атрибута bbox одного элемента и первым числом атрибута bbox следующего элемента превышает заданное значение. Тег должен закрываться только тогда, когда нужно открыть следующий. Вот так:

<newline>
   <text tags>[...]</text tags>
</newline>
<newline>
   <text tags>[...]</text tags>
</newline>

То есть тег <newline> оборачивает теги <text>. Я попробовал сделать это своим кодом, но результат получается неправильным. Мой код ниже:

import lxml.etree as etree
from lxml.builder import E

# Parse the input document. remove_blank_text=True asks the lxml parser to
# discard whitespace-only text between tags (see the lxml parser docs), which
# lets pretty_print re-indent the tree when it is serialized at the end.
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse('fe3.xml', parser)
root = tree.getroot()  # root element of the parsed tree

def removeByIdx(parent, idx):
    """Detach the child at position ``idx`` from ``parent`` and return it.

    ``parent`` must support indexing and ``remove()`` (an element or any
    list-like container). Only the removed child is returned; errors raised
    by ``parent[idx]`` for an invalid position propagate to the caller.
    """
    detached = parent[idx]
    parent.remove(detached)
    return detached

def wrap(line, idxList):
    """Move a run of children of ``line`` into a new ``<newline>`` child.

    ``idxList`` holds the child positions to wrap. Only its first entry is
    used as a position; every further entry just causes the child that
    currently follows the new ``<newline>`` to be pulled into it, so the
    result is only meaningful when the indices are consecutive. The list is
    consumed: it is empty when the function returns. An empty list is a
    no-op.
    """
    if not idxList:
        return  # nothing to wrap

    # Swap the first child of the run for a <newline> that contains it.
    start = idxList.pop(0)
    head = removeByIdx(line, start)
    wrapper = E.newline(head)
    line.insert(start, wrapper)

    # Each remaining entry stands for one more sibling; after every removal
    # the next sibling of the run sits directly behind the wrapper.
    following = start + 1
    while idxList:
        del idxList[0]  # value itself is irrelevant, only the count matters
        wrapper.append(removeByIdx(line, following))


# Wrap the children of every <textline> in <newline> parents.
#
# A <newline> is opened for the first child of a line and again whenever a
# child's bbox starts more than MAX_GAP units to the right of the previous
# child's bbox (difference of the first bbox numbers). An open <newline> is
# closed only when the next one starts, so every child of a line ends up in
# exactly one <newline>. Distances are never compared across lines.
#
# NOTE(review): children of <textline> are assumed to be <text> elements, as
# in the sample document -- confirm for other inputs.
MAX_GAP = 20  # largest allowed difference between consecutive bbox x0 values


def _bbox_x0(elem):
    """Return the first number of ``elem``'s bbox attribute, or None if absent."""
    bbox = elem.get('bbox')
    if bbox is None:
        return None
    return float(bbox.split(',')[0])


for line in tree.xpath('//textline'):
    runs = []       # one list of consecutive child indices per <newline>
    prev_x0 = None  # x0 of the last child of this line that had a bbox
    for idx, child in enumerate(line):
        x0 = _bbox_x0(child)
        gap_exceeded = (
            x0 is not None and prev_x0 is not None and x0 - prev_x0 > MAX_GAP
        )
        if not runs or gap_exceeded:
            runs.append([])
        # Children without a bbox simply stay in the currently open run.
        runs[-1].append(idx)
        if x0 is not None:
            prev_x0 = x0

    # Wrap from the last run to the first: wrap() collapses a run into a
    # single element, which would shift the indices of any run after it.
    for run in reversed(runs):
        wrap(line, run)

print(etree.tostring(root, encoding='unicode', pretty_print=True))
...