Я пытаюсь запустить amaract textract, используя корзину S3, сохраняя файлы PDF в корзине. Вот пример кода, который я использую, но во время работы я получаю следующую ошибку:
C: \ Users \ Lenovo \ Desktop> python s3demo.py
Traceback ( последний вызов был последним):
Файл "s3demo.py", строка 130, в
main()
Файл "s3demo.py", строка 126, в основном
block_count=process_text_analysis(bucket,document)
Файл "s3demo.py", строка 68, в process_text_analysis
image=Image.open(stream)
Файл "C: \ Users \ Lenovo \ Anaconda3 \ lib \ site-packages \ PIL \ Image.py", строка 2818, в открытом
raise IOError("cannot identify image file %r" % (filename if filename else fp))
OSError: невозможно идентифицировать файл изображения <_io.BytesIO объект в 0x00000146C3A786A8>
Ниже приведен фрагмент кода:
import boto3
import io
from io import BytesIO
import sys
import math
from PIL import Image, ImageDraw, ImageFont
def ShowBoundingBox(draw,box,width,height,boxColor):
left = width * box['Left']
top = height * box['Top']
draw.rectangle([left,top, left + (width * box['Width']), top +(height * box['Height'])],outline=boxColor)
def ShowSelectedElement(draw,box,width,height,boxColor):
left = width * box['Left']
top = height * box['Top']
draw.rectangle([left,top, left + (width * box['Width']), top +(height * box['Height'])],fill=boxColor)
# Displays information about a block returned by text detection and text analysis
def DisplayBlockInformation(block):
print('Id: {}'.format(block['Id']))
if 'Text' in block:
print(' Detected: ' + block['Text'])
print(' Type: ' + block['BlockType'])
if 'Confidence' in block:
print(' Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")
if block['BlockType'] == 'CELL':
print(" Cell information")
print(" Column:" + str(block['ColumnIndex']))
print(" Row:" + str(block['RowIndex']))
print(" Column Span:" + str(block['ColumnSpan']))
print(" RowSpan:" + str(block['ColumnSpan']))
if 'Relationships' in block:
print(' Relationships: {}'.format(block['Relationships']))
print(' Geometry: ')
print(' Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
print(' Polygon: {}'.format(block['Geometry']['Polygon']))
if block['BlockType'] == "KEY_VALUE_SET":
print (' Entity Type: ' + block['EntityTypes'][0])
if block['BlockType'] == 'SELECTION_ELEMENT':
print(' Selection element detected: ', end='')
if block['SelectionStatus'] =='SELECTED':
print('Selected')
else:
print('Not selected')
if 'Page' in block:
print('Page: ' + block['Page'])
print()
def process_text_analysis(bucket, document):
#Get the document from S3
s3_connection = boto3.resource('s3')
s3_object = s3_connection.Object(bucket,document)
s3_response = s3_object.get()
stream = io.BytesIO(s3_response['Body'].read())
image=Image.open(stream)
# Analyze the document
client = boto3.client('textract')
image_binary = stream.getvalue()
response = client.analyze_document(Document={'Bytes': image_binary},
FeatureTypes=["TABLES", "FORMS"])
# Alternatively, process using S3 object
#response = client.analyze_document(
# Document={'S3Object': {'Bucket': bucket, 'Name': document}},
# FeatureTypes=["TABLES", "FORMS"])
#Get the text blocks
blocks=response['Blocks']
width, height =image.size
draw = ImageDraw.Draw(image)
print('Detected Document Text')
# Create image showing bounding box/polygon the detected lines/text
for block in blocks:
DisplayBlockInformation(block)
draw=ImageDraw.Draw(image)
if block['BlockType'] == "KEY_VALUE_SET":
if block['EntityTypes'][0] == "KEY":
ShowBoundingBox(draw, block['Geometry']['BoundingBox'],width,height,'red')
else:
ShowBoundingBox(draw, block['Geometry']['BoundingBox'],width,height,'green')
if block['BlockType'] == 'TABLE':
ShowBoundingBox(draw, block['Geometry']['BoundingBox'],width,height, 'blue')
if block['BlockType'] == 'CELL':
ShowBoundingBox(draw, block['Geometry']['BoundingBox'],width,height, 'yellow')
if block['BlockType'] == 'SELECTION_ELEMENT':
if block['SelectionStatus'] =='SELECTED':
ShowSelectedElement(draw, block['Geometry']['BoundingBox'],width,height, 'blue')
#uncomment to draw polygon for all Blocks
#points=[]
#for polygon in block['Geometry']['Polygon']:
# points.append((width * polygon['X'], height * polygon['Y']))
#draw.polygon((points), outline='blue')
# Display the image
image.show()
return len(blocks)
def main():
bucket = 'textract-console-ap-southeast-1-e404965c-91e4-4f9e-89fb-242b7c5'
document = 'Invoice_Sample2.pdf'
block_count=process_text_analysis(bucket,document)
print("Blocks detected: " + str(block_count))
if __name__ == "__main__":
main()
Любой Помощь будет высоко ценится.