У меня возникла проблема с unoconv: он периодически зависал на неопределённое время при попытке конвертировать случайные документы в PDF, поэтому я написал небольшой скрипт на Python, который загружает документы в Google Drive и скачивает их обратно в виде PDF-файлов, чтобы обойти эту проблему.
Проблема, с которой я столкнулся, заключается в том, что Google Drive автоматически пытается распознавать текст (OCR) на загруженных изображениях, и я не хочу, чтобы это происходило, но пока не могу найти документацию о том, как отключить OCR.
Я заметил одну вещь: я использую функцию create из API v3, а в API v2 есть функция insert, которая принимает флаг OCR. Возможно ли это в API v3?
Вот мой код:
from __future__ import print_function
import httplib2
import magic
import io
import sys
import argparse
import subprocess as sp
from apiclient import discovery
from oauth2client.service_account import ServiceAccountCredentials
from httplib2 import Http
from googleapiclient.http import MediaFileUpload
from googleapiclient.http import MediaIoBaseDownload
from settings import *
"""
This script exists to mask unoconv for JUST pdf conversion. If it gets flags for anything else, it will fall back on unoconv.
Otherwise, it uploads the document to google drive, downloads it as a pdf, and then deletes the file from the drive.
"""
# Maps the MIME type detected for a local file to the Google Drive MIME type
# the upload should be imported as. Types missing from this table are handled
# by the caller's own default.
MIMETYPE_MAPPING = {
    # Word-processing formats -> Google Docs
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "application/vnd.google-apps.document",
    "application/rtf": "application/vnd.google-apps.document",
    "text/richtext": "application/vnd.google-apps.document",
    "text/plain": "application/vnd.google-apps.document",
    "text/html": "application/vnd.google-apps.document",
    "application/vnd.oasis.opendocument.text": "application/vnd.google-apps.document",
    "application/x-iwork-pages-sffpages": "application/vnd.google-apps.document",
    "application/msword": "application/vnd.google-apps.document",
    # Spreadsheet formats -> Google Sheets (the Drive type is singular: "spreadsheet")
    "application/vnd.ms-excel": "application/vnd.google-apps.spreadsheet",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "application/vnd.google-apps.spreadsheet",
    "text/csv": "application/vnd.google-apps.spreadsheet",
    "text/tab-separated-values": "application/vnd.google-apps.spreadsheet",
    "application/vnd.oasis.opendocument.spreadsheets": "application/vnd.google-apps.spreadsheet",
    "application/vnd.oasis.opendocument.spreadsheet": "application/vnd.google-apps.spreadsheet",
    # Presentation formats -> Google Slides
    "application/vnd.ms-powerpoint": "application/vnd.google-apps.presentation",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "application/vnd.google-apps.presentation",
    "application/vnd.oasis.opendocument.presentation": "application/vnd.google-apps.presentation",
    # Images and PDFs are imported as Google Docs
    "image/png": "application/vnd.google-apps.document",
    "image/x-citrix-png": "application/vnd.google-apps.document",
    "image/x-png": "application/vnd.google-apps.document",
    "image/jpeg": "application/vnd.google-apps.document",
    "image/x-citrix-jpeg": "application/vnd.google-apps.document",
    "image/gif": "application/vnd.google-apps.document",
    "image/bmp": "application/vnd.google-apps.document",
    "application/pdf": "application/vnd.google-apps.document",
}
# Lazily-created Drive client shared by every call in this process.
SERVICE = None


def get_service():
    """
    Return the Google Drive v3 client, building it on the first call.

    The client is stored in the module-level SERVICE variable, so later
    calls return the same object without re-authorizing.
    """
    global SERVICE
    if SERVICE is not None:
        return SERVICE
    credentials = ServiceAccountCredentials.from_json(JSON_KEY)
    authorized_http = credentials.authorize(Http())
    SERVICE = discovery.build('drive', 'v3', http=authorized_http)
    return SERVICE
def drive_upload(fp, fn):
    """
    Uploads the file found at fp to the google drive account as a google doc with name fn.

    The local MIME type is detected with magic.from_file and looked up in
    MIMETYPE_MAPPING to pick the target type; unknown types are imported as
    a Google document.

    Returns the id of the new file.
    """
    mimetype = magic.from_file(fp, mime=True)
    drive_service = get_service()
    file_metadata = {
        'name': fn,
        'mimeType': MIMETYPE_MAPPING.get(mimetype, 'application/vnd.google-apps.document'),
    }
    media = MediaFileUpload(fp,
                            mimetype=mimetype,
                            resumable=True)
    # Only the id is requested back; nothing else from the response is used.
    created = drive_service.files().create(body=file_metadata,
                                           media_body=media,
                                           fields='id').execute()
    return created.get('id')
def download_pdf(file_id, dlp):
    """
    Downloads the file specified by file_id from google drive to the filepath in dlp.

    The file is exported as a pdf and the response is written to dlp as-is.
    """
    drive_service = get_service()
    request = drive_service.files().export_media(fileId=file_id,
                                                 mimeType='application/pdf')
    resp = request.execute()
    # The export is binary data: write it in binary mode, and let the context
    # manager close the handle even if the write fails.
    with open(dlp, 'wb') as f:
        f.write(resp)
def convert_to_pdf(inputf, outputf):
    """
    Converts the input file to a pdf located at outputf and cleans up the file from google drive.

    The uploaded copy is deleted from drive whether or not the download
    succeeds; any exception from the download still propagates to the caller.
    """
    fid = drive_upload(inputf, inputf.split('/')[-1])
    try:
        download_pdf(fid, outputf)
    finally:
        # Always remove the temporary upload so failed conversions do not
        # accumulate in the drive account.
        get_service().files().delete(fileId=fid).execute()
def pass_through():
    """
    Calls unoconv with the same args that were passed to this script.

    The child's captured stdout and stderr are replayed on this process's
    stdout and stderr, and the process then exits with the child's return code.
    """
    print("PASSING THROUGH", file=sys.stderr)
    # Forward argv items untouched so arguments containing whitespace (e.g.
    # file names with spaces) stay intact; only the configured command itself
    # is split, which keeps a multi-word PATH_TO_UNOCONV working.
    cmd = PATH_TO_UNOCONV.split() + sys.argv[1:]
    child = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = child.communicate()
    for stream, data in ((sys.stdout, stdout), (sys.stderr, stderr)):
        # Flush pending text first so ordering is preserved, then write the
        # raw output. On Python 3 the captured data is bytes and goes to the
        # underlying binary buffer; streams without one are written directly.
        stream.flush()
        getattr(stream, 'buffer', stream).write(data)
    sys.exit(child.returncode)
class ArgParse(argparse.ArgumentParser):
    """
    ArgumentParser variant that hands failed parses over to unoconv.

    A non-zero exit status (usually caused by flags this wrapper does not
    support) triggers pass_through instead of terminating; a zero status
    keeps the stock ArgumentParser behaviour.
    """

    def exit(self, status=0, message=None):
        if status == 0:
            return super(ArgParse, self).exit(status=status, message=message)
        pass_through()
if __name__ == '__main__':
    # Only -f, -o and a single input file are understood here; anything else
    # makes the parser exit with an error, which ArgParse turns into a
    # pass-through to unoconv.
    parser = ArgParse(description="Wrapper for unoconv that farms pdf conversions to google drive, using any args other than the supplied will cause it to fallback on unoconv")
    parser.add_argument('-f', metavar='format', help='Desired ouput format')
    parser.add_argument('-o', metavar='output_file', help='Path to output file')
    parser.add_argument('fname', metavar='inputf', type=str, nargs=1, help='Path to file to convert')
    args = parser.parse_args()
    fmt = args.f
    output_file = args.o
    input_file = args.fname[0]
    # Both options default to None when omitted. The drive route needs an
    # explicit PDF format and an explicit output path, so without them the
    # call goes straight to unoconv instead of crashing or uploading first.
    if fmt is not None and output_file is not None and fmt.upper() == "PDF":
        try:
            convert_to_pdf(input_file, output_file)
        except Exception:
            # Any conversion failure falls back on unoconv. SystemExit and
            # KeyboardInterrupt are deliberately not intercepted.
            pass_through()
    else:
        # if we aren't converting the file to a PDF, let unoconv handle it
        pass_through()