чтение файлов из GCP-корзины - максимальное количество попыток превышено - PullRequest
0 голосов
/ 06 марта 2020

Я читаю XML-файлы, которые хранятся в моей корзине (bucket) GCP, с помощью библиотеки multiprocessing, как показано ниже:

def extract_entity(entity_number):
    # Download the XML object for one entity from the bucket, parse it with
    # BeautifulSoup and return the result of the (elided) analysis.
    #
    # `bucket` and `time` are not defined in this snippet; they are taken
    # from the enclosing (notebook/module) scope.
    # NOTE(review): if `bucket` and its client are created in the parent
    # process before the Pool forks, every worker presumably reuses the same
    # HTTP/SSL connection; that would be consistent with the truncated JSON
    # and SSL WRONG_VERSION_NUMBER errors reported below - confirm, and
    # consider creating the client inside each worker instead.
    # extract the entity ID from the database
    # os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= "/Users/mygbucketaccess.json"
    # gauth = GoogleAuth()
    # gauth.LoadCredentialsFile("mycreds.txt")
    # print(entity_number)
    # Fixed pause before every request (the post describes varying it).
    time.sleep(1.5)

    # The object name is built by string concatenation, so entity_number
    # must be a str here.
    # NOTE(review): get_blob presumably returns None when the object does
    # not exist, in which case the next line would fail - confirm.
    blob = bucket.get_blob('xlmfull/' + entity_number + '_full.xml')
    xml_file = blob.download_as_string()

    # Decode the downloaded payload as UTF-8 text.
    xml_data = str(xml_file, 'utf-8')
    # No parser is passed explicitly, so BeautifulSoup picks its default.
    y=BeautifulSoup(xml_data)
    # The analysis that produces `res` was elided by the author of the post.
    ...
    ...
    DO SOME ANALYSIS
    return(res)

А вот часть кода с пулом процессов (Pool):

from multiprocessing import Pool
# Wall-clock start time; `time` must already be imported in the session.
start_time = time.time()
# Pool of eight worker processes.
pool = Pool(processes=8)
# Run extract_entity over the first 100 ids; map() blocks until all results
# are ready, and an exception raised inside a worker is re-raised here
# (this is the call that fails in the tracebacks in this post).
results = pool.map(extract_entity, all_ids[0:100])
# No more tasks will be submitted to the pool after this call.
pool.close()

Совершенно случайным образом — независимо от того, 1) увеличиваю ли я time.sleep или 2) переношу ли аутентификацию внутрь функции — я получаю следующую ошибку.

---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-65-b023de58d494>", line 10, in extract_entity
    blob = bucket.get_blob('xlmfull/' + entity_number + '_full.xml')
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/bucket.py", line 899, in get_blob
    blob.reload(client=client, timeout=timeout)
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/_helpers.py", line 150, in reload
    timeout=timeout,
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 426, in api_request
    return response.json()
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/models.py", line 898, in json
    return complexjson.loads(self.text, **kwargs)
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/json/__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/Users//opt/anaconda3/envs/husx/lib/python3.7/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Unterminated string starting at: line 17 column 14 (char 720)
"""

The above exception was the direct cause of the following exception:

JSONDecodeError                           Traceback (most recent call last)
<ipython-input-66-d14f24547c6b> in <module>
      2 start_time = time.time()
      3 pool = Pool(processes=7)
----> 4 results = pool.map(extract_entity, all_ids[0:100])
      5 du = time.time() - start_time
      6 du

~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in map(self, func, iterable, chunksize)
    266         in a list that is returned.
    267         '''
--> 268         return self._map_async(func, iterable, mapstar, chunksize).get()
    269 
    270     def starmap(self, func, iterable, chunksize=None):

~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
    655             return self._value
    656         else:
--> 657             raise self._value
    658 
    659     def _set(self, i, obj):

JSONDecodeError: Unterminated string starting at: line 17 column 14 (char 720)

Похоже, что-то разрывает соединение при чтении файла. Поэтому я убрал настройку учётных данных из функции и запустил её с time.sleep(2) — теперь я получаю сообщение о превышении максимального числа повторных попыток:

RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
    chunked=chunked,
  File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 416, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/http/client.py", line 1344, in getresponse
    response.begin()
  File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/http/client.py", line 306, in begin
    version, status, reason = self._read_status()
  File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/http/client.py", line 267, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/ssl.py", line 1071, in recv_into
    return self.read(nbytes, buffer)
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/ssl.py", line 929, in read
    return self._sslobj.read(len, buffer)
ssl.SSLError: [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2555)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
    timeout=timeout
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 720, in urlopen
    method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/util/retry.py", line 436, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /storage/v1/b//o/xlmfull%2F310000052_full.xml?projection=noAcl (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2555)')))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-80-8c760f72ecd4>", line 11, in extract_entity
    blob = bucket.get_blob('xlmfull/' + entity_number + '_full.xml')
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/bucket.py", line 899, in get_blob
    blob.reload(client=client, timeout=timeout)
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/_helpers.py", line 150, in reload
    timeout=timeout,
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 419, in api_request
    timeout=timeout,
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 277, in _make_request
    method, url, headers, data, target_object, timeout=timeout
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 315, in _do_request
    url=url, method=method, headers=headers, data=data, timeout=timeout
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/auth/transport/requests.py", line 317, in request
    **kwargs
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/sessions.py", line 530, in request
    resp = self.send(prep, **send_kwargs)
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/sessions.py", line 643, in send
    r = adapter.send(request, **kwargs)
  File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/adapters.py", line 514, in send
    raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /storage/v1/b//o/xlmfull%2F310000052_full.xml?projection=noAcl (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2555)')))
"""

The above exception was the direct cause of the following exception:

SSLError                                  Traceback (most recent call last)
<ipython-input-81-5c48af30bc80> in <module>
      2 start_time = time.time()
      3 pool = Pool(processes=8)
----> 4 results = pool.map(extract_entity, range(0, 100))
      5 du = time.time() - start_time
      6 du

~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in map(self, func, iterable, chunksize)
    266         in a list that is returned.
    267         '''
--> 268         return self._map_async(func, iterable, mapstar, chunksize).get()
    269 
    270     def starmap(self, func, iterable, chunksize=None):

~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
    655             return self._value
    656         else:
--> 657             raise self._value
    658 
    659     def _set(self, i, obj):

SSLError: None: Max retries exceeded with url: /storage/v1/b//o/xlmfull%2F310000052_full.xml?projection=noAcl (Caused by None)

Похоже, что сочетание time.sleep и числа процессов влияет на результат. Есть ли подобное ограничение для корзин (buckets) GCP?

...