PdfMiner: ошибка при обработке литерала страницы: / b'begin ' - PullRequest
1 голос
/ 11 марта 2019

Я пытаюсь прочитать PDF-файл с помощью python3 и пакета pdfminer. В целом это работает, но для некоторых страниц файла при вызове interpreter.process_page в методе getAllPages() из приведённого ниже кода я получаю следующие ошибки:

  1. ошибка при обработке страницы: требуется литерал: /b'begin'.
  2. ошибка при обработке страницы: неизвестный оператор: 'Qq'.

Это происходит только с некоторыми документами, но я не могу выяснить, в чём проблема и в каких случаях она может возникать.

Код: -

class PDFDoc():
    """Extract positioned text from a PDF and rebuild it as page-wise lines.

    Typical call order: ``readDoc`` -> ``getAllPages`` -> ``getPageDict`` ->
    ``sortText`` -> ``pageText``. ``getTotalPages`` can be called any time
    after ``getPageDict``.
    """

    def __init__(self):
        """Set up the pdfminer pipeline and the empty result containers."""
        self.rsrcmgr = PDFResourceManager()
        self.laparams = LAParams()
        self.device = PDFPageDetailedAggregator(self.rsrcmgr, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        self.doc_values = []
        self.total_no_of_pages = 0
        self.doc_page_dict = collections.OrderedDict()
        # File handle backing the current document; kept so it can be closed.
        self._fp = None

    def readDoc(self, doc_name):
        """Open the PDF at *doc_name* and prepare it for page extraction.

        The file must stay open after this call because ``getAllPages``
        reads from the document later, so the handle is stored on the
        instance instead of being closed here. A handle left over from a
        previous ``readDoc`` call is closed once the new document has been
        opened successfully.
        """
        fp = open(doc_name, 'rb')
        try:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
        except Exception:
            # Do not leak the new handle when the file cannot be parsed.
            fp.close()
            raise
        self.close()
        self._fp = fp
        self.parser = parser
        self.doc = doc

    def close(self):
        """Close the file opened by ``readDoc``, if any. Safe to call twice."""
        # getattr keeps this safe on instances created without __init__.
        fp = getattr(self, '_fp', None)
        if fp is not None:
            fp.close()
            self._fp = None

    def getAllPages(self):
        """Run every page through the interpreter and collect the rows.

        After the loop ``self.doc_values`` refers to ``self.device.rows``.
        ``getPageDict`` reads each row as: index 0 = page number,
        index 1 = left, index 2 = bottom, last index = text content.
        """
        for page in PDFPage.create_pages(self.doc):
            self.interpreter.process_page(page)
            # Finalises the layout for this page; the returned object itself
            # is not needed because the device accumulates ``rows``.
            self.device.get_result()
        self.doc_values = self.device.rows

    def getTotalPages(self):
        """Store the page count in ``self.total_no_of_pages``.

        The count is the highest page key in ``self.doc_page_dict`` plus one
        (page keys are treated as zero-based). With no pages collected the
        count is 0 instead of raising ``ValueError`` from ``max()``.
        """
        if self.doc_page_dict:
            self.total_no_of_pages = max(self.doc_page_dict) + 1
        else:
            self.total_no_of_pages = 0

    def getPageDict(self):
        """Group ``self.doc_values`` by page into ``self.doc_page_dict``.

        Resulting shape: ``{page_no: [{'left', 'bottom', 'content'}, ...]}``.
        """
        for values in self.doc_values:
            entry = {'left': values[1], 'bottom': values[2], 'content': values[-1]}
            self.doc_page_dict.setdefault(values[0], []).append(entry)

    def create_page_table_modified(self, pagedict_list, tolerance=6):
        """Group text fragments of one page into visual rows.

        Fragments whose integer-truncated ``bottom`` lies within *tolerance*
        of the first fragment of a row are put into that row. Fragments with
        empty content are dropped, and a fragment equal to one already
        placed is not placed again.

        Args:
            pagedict_list: list of ``{'left', 'bottom', 'content'}`` dicts.
            tolerance: maximum vertical distance for two fragments to share
                a row (default 6, the value previously hard-coded).

        Returns:
            List of rows (each a list of fragment dicts), ordered by
            descending ``bottom`` of each row's first fragment.
        """
        rows_by_bottom = collections.OrderedDict()
        # Kept as a list on purpose: membership is by dict equality and the
        # fragment dicts are not hashable.
        placed = []
        for anchor in pagedict_list:
            if anchor in placed or not anchor["content"]:
                continue
            row = [anchor]
            placed.append(anchor)
            anchor_bottom = int(anchor["bottom"])
            for candidate in pagedict_list:
                if candidate in placed or not candidate["content"]:
                    continue
                # Distance is always measured against the row's first
                # fragment, not against the previously added one.
                if abs(anchor_bottom - int(candidate["bottom"])) <= tolerance:
                    row.append(candidate)
                    placed.append(candidate)
            rows_by_bottom[float(anchor['bottom'])] = row
        return [rows_by_bottom[key] for key in sorted(rows_by_bottom, reverse=True)]

    def sortRowElements(self, row_list):
        """Return the fragments of one row ordered by their ``left`` value."""
        return sorted(row_list, key=lambda k: k['left'])

    def combineText(self, row):
        """Join the ``content`` of every fragment in *row* with single spaces."""
        return ' '.join(k['content'] for k in row)

    def sortText(self):
        """Replace each page's flat fragment list with sorted rows.

        Every page entry in ``self.doc_page_dict`` becomes a list of rows,
        each row ordered by ``left``. Intended to be called once after
        ``getPageDict``.
        """
        for page in self.doc_page_dict:
            rows = self.create_page_table_modified(self.doc_page_dict[page])
            self.doc_page_dict[page] = [self.sortRowElements(line) for line in rows]

    def pageText(self, page_no):
        """Return the text of page *page_no* as a list of line strings.

        Expects ``sortText`` to have been called so that the page entry is a
        list of rows.
        """
        return [self.combineText(line) for line in self.doc_page_dict[page_no]]

# Create the PDFDoc helper instance used by the rest of the script.
read_document = PDFDoc()
# NOTE(review): the remainder of the script appears to be elided in the original post.
...