У меня есть этот документ PDF: https://drive.google.com/file/d/18Kmq4kD2Xai7w9KAROIJZ-6baCKjHA5H/view?usp=sharing
Я преобразовал его в XML с помощью PDFminer. Проблема в том, что я хочу, чтобы каждый раз, когда в строке встречается такой длинный пробел, на его месте вставлялся перевод строки. Например,
в первой строке перед «1» должен быть перевод строки, потому что там более длинный пробел:
Titolo Il Causidico] sps. a Don Rodri go [BIG SPACE]
1 sventura] sps. a mi-
Я пытался настроить параметры LAParams, но не знаю, как этого добиться. Вот мой текущий код:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
def convert(case, pdfpath, targetfilepath, pages=100):
    """Convert a PDF to text, HTML or XML with pdfminer and save the result.

    Args:
        case: Output format selector: ``'text'``, ``'HTML'`` or ``'XML'``.
        pdfpath: Path of the PDF file to read.
        targetfilepath: Path of the file the converted output is written to
            (opened in binary mode, so the output is always bytes).
        pages: Pages to process. A falsy value (``None``, ``0``, an empty
            collection) selects every page. An iterable is used as the set
            of zero-based page numbers passed to ``PDFPage.get_pages``.
            A bare integer ``n`` is interpreted as "the first ``n`` pages";
            previously an integer (including the default) raised
            ``TypeError`` from ``set(n)``.

    Raises:
        ValueError: If ``case`` is not one of the supported formats.
    """
    # Validate the format first so a typo fails with a clear message instead
    # of an unbound ``converter`` further down.
    if case not in ('text', 'HTML', 'XML'):
        raise ValueError(
            "unsupported case %r: expected 'text', 'HTML' or 'XML'" % (case,)
        )

    if not pages:
        pagenums = set()
    elif isinstance(pages, int):
        pagenums = set(range(pages))
    else:
        pagenums = set(pages)

    manager = PDFResourceManager()
    codec = 'utf-8'
    # Every branch writes into a bytes buffer: the target file is opened in
    # binary mode, so a StringIO result could not be written to it.
    output = io.BytesIO()

    if case == 'text':
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    elif case == 'HTML':
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())
    else:
        # NOTE: per the pdfminer layout documentation, characters closer
        # together than ``char_margin`` are grouped into the same line. With
        # a value this large, even wide horizontal gaps never start a new
        # text line; lower it if wide gaps should split lines.
        laparams2 = LAParams(all_texts=True, detect_vertical=True,
                             line_overlap=0.5, char_margin=1000.0,
                             line_margin=0.5, word_margin=0.5,
                             boxes_flow=0.5)
        converter = XMLConverter(manager, output, codec=codec, laparams=laparams2)

    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(pdfpath, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        # Close the converter BEFORE reading the buffer: the HTML/XML
        # converters emit their closing markup (e.g. ``</pages>``) on close.
        converter.close()

    convertedPDF = output.getvalue()
    output.close()

    with open(targetfilepath, 'wb') as convertedFile:
        convertedFile.write(convertedPDF)
if __name__ == "__main__":
    # convert() writes its result to the target file and returns nothing,
    # so printing its return value would only ever print "None".
    convert('XML', 'fel_split2.pdf', 'fe2.xml', pages=None)
Фрагмент вывода, который я получаю, см. здесь (целиком он был бы слишком длинным).
Результат XML для первой строки (несущественная информация удалена) выглядит так:
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315">
<textbox id="1" bbox="44.614,554.008,58.101,564.246">
<textline bbox="44.614,554.008,58.101,564.246">
<text bbox="44.614,554.008,49.369,564.246">2</text>
<text bbox="49.268,554.008,54.022,564.246">4</text>
<text bbox="53.922,554.008,58.101,564.246">a</text>
<text />
</textline>
</textbox>
<textbox id="6" bbox="68.031,502.428,372.824,566.366">
<textline bbox="68.031,553.628,372.759,566.366">
<text bbox="68.031,553.639,76.375,566.366">T</text>
<text bbox="76.231,553.639,79.479,566.366">i</text>
<text bbox="79.334,553.639,83.161,566.366">t</text>
<text bbox="83.017,553.639,88.112,566.366">o</text>
<text bbox="87.968,553.639,91.216,566.366">l</text>
<text bbox="91.071,553.639,96.167,566.366">o</text>
<text bbox="99.311,553.628,104.406,566.110">I</text>
<text bbox="104.261,553.628,107.510,566.110">l</text>
<text bbox="107.365,553.628,110.269,566.110" />
<text bbox="110.658,553.628,119.002,566.110">C</text>
<text bbox="118.857,553.628,123.953,566.110">a</text>
<text bbox="123.808,553.628,130.183,566.110">u</text>
<text bbox="130.038,553.628,134.555,566.110">s</text>
<text bbox="134.410,553.628,137.659,566.110">i</text>
<text bbox="137.514,553.628,143.889,566.110">d</text>
<text bbox="143.744,553.628,146.993,566.110">i</text>
<text bbox="146.848,553.628,151.943,566.110">c</text>
<text bbox="151.799,553.628,157.595,566.110">o</text>
<text bbox="157.450,553.628,161.277,566.110">]</text>
<text bbox="161.132,553.628,164.036,566.110" />
<text bbox="164.417,553.639,168.244,566.366">s</text>
<text bbox="168.099,553.639,173.895,566.366">p</text>
<text bbox="173.751,553.639,177.578,566.366">s</text>
<text bbox="176.966,553.639,180.215,566.366">.</text>
<text bbox="180.070,553.639,182.974,566.366" />
<text bbox="183.363,553.639,189.159,566.366">a</text>
<text bbox="192.314,553.628,201.937,566.110">D</text>
<text bbox="201.793,553.628,207.589,566.110">o</text>
<text bbox="207.444,553.628,213.819,566.110">n</text>
<text bbox="213.674,553.628,216.578,566.110" />
<text bbox="216.967,553.628,225.311,566.110">R</text>
<text bbox="225.166,553.628,230.962,566.110">o</text>
<text bbox="230.818,553.628,237.192,566.110">d</text>
<text bbox="237.048,553.628,241.565,566.110">r</text>
<text bbox="241.420,553.628,244.668,566.110">i</text>
<text bbox="244.524,553.628,250.320,566.110">g</text>
<text bbox="250.064,553.628,255.860,566.110">o</text>
<text />
<text bbox="272.661,554.072,277.415,564.757">1</text>
<text bbox="280.592,553.628,285.109,566.110">s</text>
<text bbox="284.964,553.628,290.760,566.110">v</text>
<text bbox="290.382,553.628,295.477,566.110">e</text>
<text bbox="295.333,553.628,301.707,566.110">n</text>
<text bbox="301.563,553.628,305.390,566.110">t</text>
<text bbox="305.245,553.628,311.620,566.110">u</text>
<text bbox="311.475,553.628,315.992,566.110">r</text>
<text bbox="315.847,553.628,320.942,566.110">a</text>
<text bbox="320.798,553.628,324.625,566.110">]</text>
<text bbox="324.480,553.628,327.384,566.110" />
<text bbox="327.763,553.639,331.590,566.366">s</text>
<text bbox="331.445,553.639,337.241,566.366">p</text>
<text bbox="337.097,553.639,340.924,566.366">s</text>
<text bbox="340.312,553.639,343.560,566.366">.</text>
<text bbox="343.416,553.639,346.319,566.366" />
<text bbox="346.709,553.639,352.505,566.366">a</text>
<text bbox="355.660,553.628,365.283,566.110">m</text>
<text bbox="365.139,553.628,368.387,566.110">i</text>
<text bbox="368.242,553.628,372.759,566.110">-</text>
<text />
</textline>
</textbox>
</page>
</pages>