Я хочу конвертировать PDF-файлы в текстовые файлы для дальнейшей обработки документов. Я использую Python 3.x и пытаюсь преобразовать PDF в текст с помощью PDFMiner. Некоторые PDF-файлы не удаётся разобрать, и возникает ошибка. Ниже приведён код, который преобразует PDF в текст; onlyfiles[2] — это имя PDF-файла. Я не могу понять сообщение об ошибке.
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF, one string per page.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Yields:
        The text that ``TextConverter`` wrote for a single page.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.

    Any exception raised by PDFMiner while a page is being processed is
    propagated unchanged, but the per-page converter and text buffer are
    closed first.
    """
    with open(pdf_path, 'rb') as fh:
        # Created once per document instead of once per page: it does not
        # depend on the page being rendered.
        resource_manager = PDFResourceManager()
        pages = PDFPage.get_pages(fh, caching=True, check_extractable=True)
        for page in pages:
            buffer = io.StringIO()
            converter = TextConverter(resource_manager, buffer)
            try:
                interpreter = PDFPageInterpreter(resource_manager, converter)
                interpreter.process_page(page)
                text = buffer.getvalue()
            finally:
                # Release the handles before yielding, so they are closed
                # even if process_page() raises or the caller stops
                # iterating early and never resumes the generator.
                converter.close()
                buffer.close()
            yield text
def extract_text(pdf_path):
    """Print the text of every page of the PDF at ``pdf_path``.

    Each page's text is written to standard output followed by one empty
    line. The function returns nothing.
    """
    for page_text in extract_text_by_page(pdf_path):
        # A single call emits the page text, its newline, and the blank
        # separator line.
        print(page_text, end='\n\n')
if __name__ == '__main__':
    # extract_text() prints the pages itself and returns None, so wrapping
    # the call in print() would only add a stray "None" after the text.
    # NOTE(review): onlyfiles is not defined in this snippet; it is expected
    # to be a sequence of PDF paths created elsewhere before this runs.
    extract_text(onlyfiles[2])
Сообщение об ошибке:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-19-7fed22ec1779> in <module>
25 print()
26 if __name__ == '__main__':
---> 27 print(extract_text(onlyfiles[2]))
<ipython-input-19-7fed22ec1779> in extract_text(pdf_path)
21 fake_file_handle.close()
22 def extract_text(pdf_path):
---> 23 for page in extract_text_by_page(pdf_path):
24 print(page)
25 print()
<ipython-input-19-7fed22ec1779> in extract_text_by_page(pdf_path)
14 converter = TextConverter(resource_manager, fake_file_handle)
15 page_interpreter = PDFPageInterpreter(resource_manager, converter)
---> 16 page_interpreter.process_page(page)
17 text = fake_file_handle.getvalue()
18 yield text
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
850 ctm = (1, 0, 0, 1, -x0, -y0)
851 self.device.begin_page(page, ctm)
--> 852 self.render_contents(page.resources, page.contents, ctm=ctm)
853 self.device.end_page(page)
854 return
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
862 self.init_resources(resources)
863 self.init_state(ctm)
--> 864 self.execute(list_value(streams))
865 return
866
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in execute(self, streams)
886 log.debug('exec: %s %r', name, args)
887 if len(args) == nargs:
--> 888 func(*args)
889 else:
890 log.debug('exec: %s', name)
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in do_TJ(self, seq)
770 raise PDFInterpreterError('No font specified!')
771 return
--> 772 self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy())
773 return
774
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string(self, textstate, seq, ncs, graphicstate)
85 textstate.linematrix = self.render_string_horizontal(
86 seq, matrix, textstate.linematrix, font, fontsize,
---> 87 scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
88 return
89
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string_horizontal(self, seq, matrix, pos, font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
98 needcharspace = True
99 else:
--> 100 for cid in font.decode(obj):
101 if needcharspace:
102 x += charspace
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdffont.py in decode(self, bytes)
717
718 def decode(self, bytes):
--> 719 return self.cmap.decode(bytes)
720
721 def char_disp(self, cid):
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\cmapdb.py in decode(self, code)
125
126 def decode(self, code):
--> 127 n = len(code)//2
128 if n:
129 return struct.unpack('>%dH' % n, code)
TypeError: object of type 'PSKeyword' has no len()