Python Tika не может прочитать PDF - не удается загрузить Tika Server - PullRequest
1 голос
/ 22 июня 2019

Я использую Tika для чтения PDF-файлов, и мой код работал до вчерашнего дня. Теперь при запуске того же самого кода я получаю ошибки: судя по всему, Tika не может найти jar-файл сервера Tika. Для чтения PDF я использую следующий код:

import tika
from tika import parser
tika.initVM()

parsed = parser.from_file('my_pdf_file.pdf')

Трассировка ошибки приведена ниже:

    2019-06-22 05:54:08,735 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to /tmp/tika-server.jar.
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tika/tika.py in getRemoteJar(urlOrPath, destPath)
    715         try:
--> 716             urlretrieve(urlOrPath, destPath)
    717         except IOError:

19 frames
/usr/lib/python3.6/urllib/request.py in urlretrieve(url, filename, reporthook, data)
    247 
--> 248     with contextlib.closing(urlopen(url, data)) as fp:
    249         headers = fp.info()

/usr/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 

/usr/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 

/usr/lib/python3.6/urllib/request.py in http_response(self, request, response)
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 

/usr/lib/python3.6/urllib/request.py in error(self, proto, *args)
    569             args = (dict, 'default', 'http_error_default') + orig_args
--> 570             return self._call_chain(*args)
    571 

/usr/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:

/usr/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    649     def http_error_default(self, req, fp, code, msg, hdrs):
--> 650         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 

HTTPError: HTTP Error 504: Gateway Time-out

During handling of the above exception, another exception occurred:

HTTPError                                 Traceback (most recent call last)
<ipython-input-5-f304ccbde1d0> in <module>()
     13 
     14 # Read the ;PDF file
---> 15 parsed = parser.from_file('/content/gdrive/My Drive/Colab Notebooks/data/sample_pdf_for_excel.pdf')
     16 # content = parsed["content"].strip()
     17 

/usr/local/lib/python3.6/dist-packages/tika/parser.py in from_file(filename, serverEndpoint, xmlContent, headers, config_path)
     34     '''
     35     if not xmlContent:
---> 36         jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path)
     37     else:
     38         jsonOutput = parse1('all', filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},

/usr/local/lib/python3.6/dist-packages/tika/tika.py in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path)
    326     if service == '/tika': responseMimeType = 'text/plain'
    327     status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
--> 328                                   headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse)
    329 
    330     if file_type == 'remote': os.unlink(path)

/usr/local/lib/python3.6/dist-packages/tika/tika.py in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path)
    520     global TikaClientOnly
    521     if not TikaClientOnly:
--> 522         serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
    523 
    524     serviceUrl  = serverEndpoint + service

/usr/local/lib/python3.6/dist-packages/tika/tika.py in checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
    569         if not alreadyRunning:
    570             if not os.path.isfile(jarPath) and urlp.scheme != '':
--> 571                 getRemoteJar(tikaServerJar, jarPath)
    572 
    573             if not checkJarSig(tikaServerJar, jarPath):

/usr/local/lib/python3.6/dist-packages/tika/tika.py in getRemoteJar(urlOrPath, destPath)
    724             if os.path.exists(destPath) and os.path.isfile(destPath):
    725                 os.remove(destPath)
--> 726             urlretrieve(urlOrPath, destPath)
    727 
    728         return (destPath, 'remote')

/usr/lib/python3.6/urllib/request.py in urlretrieve(url, filename, reporthook, data)
    246     url_type, path = splittype(url)
    247 
--> 248     with contextlib.closing(urlopen(url, data)) as fp:
    249         headers = fp.info()
    250 

/usr/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

/usr/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    530         for processor in self.process_response.get(protocol, []):
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 
    534         return response

/usr/lib/python3.6/urllib/request.py in http_response(self, request, response)
    640         if not (200 <= code < 300):
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 
    644         return response

/usr/lib/python3.6/urllib/request.py in error(self, proto, *args)
    568         if http_err:
    569             args = (dict, 'default', 'http_error_default') + orig_args
--> 570             return self._call_chain(*args)
    571 
    572 # XXX probably also want an abstract factory that knows when it makes

/usr/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

/usr/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    648 class HTTPDefaultErrorHandler(BaseHandler):
    649     def http_error_default(self, req, fp, code, msg, hdrs):
--> 650         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 
    652 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 504: Gateway Time-out

Версия Java:

!java -version
openjdk version "11.0.3" 2019-04-16
OpenJDK Runtime Environment (build 11.0.3+7-Ubuntu-1ubuntu218.04.1)
OpenJDK 64-Bit Server VM (build 11.0.3+7-Ubuntu-1ubuntu218.04.1, mixed mode, sharing)

На другой машине, где я пробовал запустить тот же код:

> java -version
java version "1.8.0_172"
Java(TM) SE Runtime Environment (build 1.8.0_172-b11)
Java HotSpot(TM) Client VM (build 25.172-b11, mixed mode, sharing)

Когда я открываю ссылку на jar-файл (поиск Maven) http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar, сервер возвращает ошибку 504 Gateway Time-out (истекло время ожидания шлюза).

Пожалуйста, подскажите, как это исправить.

1 Ответ

1 голос
/ 23 июня 2019

Измените URL для загрузки через соответствующую переменную среды:

import os
os.environ['TIKA_SERVER_JAR'] = 'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
import tika
from tika import parser

Ссылка: https://github.com/chrismattmann/tika-python/issues/230#issuecomment-504704922

...