Чтение изображений из папки, ocr и вывод json в новое место в python - PullRequest
0 голосов
/ 28 апреля 2020

Я читаю изображения из местоположения, затем использую pytesseract для OCR на нем, использую некоторые функции регулярного выражения для желаемого вывода и затем сохраняю вывод в формате json в новой папке. Я сталкиваюсь с проблемой с l oop, так как вывод немного странный, если честно. Пожалуйста, смотрите код ниже:

from PIL import Image
import json, re, subprocess
import re
import pytesseract as pt 
import os 
open('D:\Demo\imagejson\jsonfiles.txt', 'w').close() 
def main(): 

        path ="D:\Demo\images"

        fullTempPath ="D:\Demo\imagejson\jsonfiles.txt"

        for imageName in os.listdir(path): 
            inputPath = os.path.join(path, imageName) 
            img = Image.open(inputPath) 

            text = pt.image_to_string(img, lang ="eng") 
            invoice_ocr = {}
            splits = text.splitlines()
            while(' ' in splits) : 
                splits.remove(' ')
            while('' in splits) : 
                splits.remove('')   

            Product = []
            Total = []
            subtotal=[]
            try:

                date = re.search(r'(0[1-9]|[12][0-9]|3[01])[-](0[1-9]|1[012])[-](19|20)\d\d',text).group()
                for line in splits:
                    if re.search(r'PO',line):
                        PO = line
                        PO = re.sub(r'.*PO', 'PO', PO).split("PO# ",1)[0]
                    if re.search(r'INVOICE',line):
                        Invoice = line
                        Invoice = re.sub(r'.*INV', 'INV', Invoice)        
                    if re.search(r'TOTAL',line):
                        Total.append(line)
                    if re.search(r'Product',line):
                        Product.append(line) 
                    if re.search(r'SUBTOTAL',line):
                        subtotal.append(line)    
            except:
                pass                                   
            invoice_ocr['Name'] = splits[0].split("INVOICE",1)[0]
            invoice_ocr['date'] = date
            invoice_ocr['invoice_number'] = Invoice
            invoice_ocr['PO'] = PO
            invoice_ocr['all_items'] = Product
            invoice_ocr['total']= Total 
            invoice_ocr['subtotal']= subtotal

            invoice_json = json.dumps(invoice_ocr)
            file1 = open(fullTempPath, "a+") 


            file1.write(imageName+"\n") 

            file1.write(invoice_json+"\n") 
            file1.close()  

            file2 = open(fullTempPath, 'r') 
            print(file2.read()) 
            file2.close()         


if __name__ == '__main__': 
    main() 

Вывод, который я получаю,

    V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}

V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}

V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}

V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}
V13_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "23-01-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 18000.00"], "subtotal": []}

V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}
V13_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "23-01-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 18000.00"], "subtotal": []}
V14_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "12-02-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}

V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}
V13_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "23-01-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 18000.00"], "subtotal": []}
V14_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "12-02-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V15_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "22-03-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}

........ и т. Д.

Почему это цикл с самого начала и снова, и какие изменения в коде могут помочь мне решить эту проблему? Я знаю, что это более функциональная ошибка программирования от меня, кто-нибудь может помочь? Кроме того, я не получаю большую точность с Pytesseract (хотя я не препроцессировать, хотя). Есть ли другие доступные решения с открытым исходным кодом?

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...