Я читаю изображения из местоположения, затем использую pytesseract для OCR на нем, использую некоторые функции регулярного выражения для желаемого вывода и затем сохраняю вывод в формате json в новой папке. Я сталкиваюсь с проблемой с l oop, так как вывод немного странный, если честно. Пожалуйста, смотрите код ниже:
from PIL import Image
import json, re, subprocess
import re
import pytesseract as pt
import os
open('D:\Demo\imagejson\jsonfiles.txt', 'w').close()
def main():
path ="D:\Demo\images"
fullTempPath ="D:\Demo\imagejson\jsonfiles.txt"
for imageName in os.listdir(path):
inputPath = os.path.join(path, imageName)
img = Image.open(inputPath)
text = pt.image_to_string(img, lang ="eng")
invoice_ocr = {}
splits = text.splitlines()
while(' ' in splits) :
splits.remove(' ')
while('' in splits) :
splits.remove('')
Product = []
Total = []
subtotal=[]
try:
date = re.search(r'(0[1-9]|[12][0-9]|3[01])[-](0[1-9]|1[012])[-](19|20)\d\d',text).group()
for line in splits:
if re.search(r'PO',line):
PO = line
PO = re.sub(r'.*PO', 'PO', PO).split("PO# ",1)[0]
if re.search(r'INVOICE',line):
Invoice = line
Invoice = re.sub(r'.*INV', 'INV', Invoice)
if re.search(r'TOTAL',line):
Total.append(line)
if re.search(r'Product',line):
Product.append(line)
if re.search(r'SUBTOTAL',line):
subtotal.append(line)
except:
pass
invoice_ocr['Name'] = splits[0].split("INVOICE",1)[0]
invoice_ocr['date'] = date
invoice_ocr['invoice_number'] = Invoice
invoice_ocr['PO'] = PO
invoice_ocr['all_items'] = Product
invoice_ocr['total']= Total
invoice_ocr['subtotal']= subtotal
invoice_json = json.dumps(invoice_ocr)
file1 = open(fullTempPath, "a+")
file1.write(imageName+"\n")
file1.write(invoice_json+"\n")
file1.close()
file2 = open(fullTempPath, 'r')
print(file2.read())
file2.close()
if __name__ == '__main__':
main()
Вывод, который я получаю,
V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}
V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}
V13_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "23-01-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 18000.00"], "subtotal": []}
V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}
V13_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "23-01-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 18000.00"], "subtotal": []}
V14_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "12-02-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V10_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-08-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V11_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "17-09-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 7000.00"], "subtotal": []}
V12_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "13-11-2019", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 5000.00"], "subtotal": []}
V13_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "23-01-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 18000.00"], "subtotal": []}
V14_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "12-02-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
V15_template.jpg
{"Name": "Urbano Animations Ltd. ", "date": "22-03-2020", "invoice_number": "INVOICE", "PO": "PO # VR68445", "all_items": ["[23423423] Product XYZ"], "total": ["TOTAL $ 10000.00"], "subtotal": []}
........ и т. Д.
Почему это цикл с самого начала и снова, и какие изменения в коде могут помочь мне решить эту проблему? Я знаю, что это более функциональная ошибка программирования от меня, кто-нибудь может помочь? Кроме того, я не получаю большую точность с Pytesseract (хотя я не препроцессировать, хотя). Есть ли другие доступные решения с открытым исходным кодом?