Я читаю xml файлы, которые хранятся в моем хранилище GCP через многопроцессорную библиотеку, как указано ниже
def extract_entity(entity_number):
# extract the entity ID from the database
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= "/Users/mygbucketaccess.json"
# gauth = GoogleAuth()
# gauth.LoadCredentialsFile("mycreds.txt")
# print(entity_number)
time.sleep(1.5)
blob = bucket.get_blob('xlmfull/' + entity_number + '_full.xml')
xml_file = blob.download_as_string()
xml_data = str(xml_file, 'utf-8')
y=BeautifulSoup(xml_data)
...
...
DO SOME ANALYSIS
return(res)
, и вот моя часть объединения
from multiprocessing import Pool
start_time = time.time()
pool = Pool(processes=8)
results = pool.map(extract_entity, all_ids[0:100])
pool.close()
Это очень случайным образом, независимо от 1) увеличения time.sleep
2) помещения аутентификации внутрь - я получаю следующую ошибку.
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "<ipython-input-65-b023de58d494>", line 10, in extract_entity
blob = bucket.get_blob('xlmfull/' + entity_number + '_full.xml')
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/bucket.py", line 899, in get_blob
blob.reload(client=client, timeout=timeout)
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/_helpers.py", line 150, in reload
timeout=timeout,
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 426, in api_request
return response.json()
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/models.py", line 898, in json
return complexjson.loads(self.text, **kwargs)
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/json/__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Users//opt/anaconda3/envs/husx/lib/python3.7/json/decoder.py", line 353, in raw_decode
obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Unterminated string starting at: line 17 column 14 (char 720)
"""
The above exception was the direct cause of the following exception:
JSONDecodeError Traceback (most recent call last)
<ipython-input-66-d14f24547c6b> in <module>
2 start_time = time.time()
3 pool = Pool(processes=7)
----> 4 results = pool.map(extract_entity, all_ids[0:100])
5 du = time.time() - start_time
6 du
~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in map(self, func, iterable, chunksize)
266 in a list that is returned.
267 '''
--> 268 return self._map_async(func, iterable, mapstar, chunksize).get()
269
270 def starmap(self, func, iterable, chunksize=None):
~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
JSONDecodeError: Unterminated string starting at: line 17 column 14 (char 720)
Кажется, что-то разрывает соединение для чтения файла. Таким образом, я удалил учетные данные в функции и позволил ей работать с time.sleep (2), я получаю сообщение о превышении максимального числа повторных попыток
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/http/client.py", line 1344, in getresponse
response.begin()
File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/http/client.py", line 306, in begin
version, status, reason = self._read_status()
File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/http/client.py", line 267, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/Users/alireza/opt/anaconda3/envs/husx/lib/python3.7/socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/ssl.py", line 1071, in recv_into
return self.read(nbytes, buffer)
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/ssl.py", line 929, in read
return self._sslobj.read(len, buffer)
ssl.SSLError: [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2555)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/urllib3/util/retry.py", line 436, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /storage/v1/b//o/xlmfull%2F310000052_full.xml?projection=noAcl (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2555)')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "<ipython-input-80-8c760f72ecd4>", line 11, in extract_entity
blob = bucket.get_blob('xlmfull/' + entity_number + '_full.xml')
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/bucket.py", line 899, in get_blob
blob.reload(client=client, timeout=timeout)
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/storage/_helpers.py", line 150, in reload
timeout=timeout,
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 419, in api_request
timeout=timeout,
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 277, in _make_request
method, url, headers, data, target_object, timeout=timeout
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/cloud/_http.py", line 315, in _do_request
url=url, method=method, headers=headers, data=data, timeout=timeout
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/google/auth/transport/requests.py", line 317, in request
**kwargs
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/Users/a/opt/anaconda3/envs/husx/lib/python3.7/site-packages/requests/adapters.py", line 514, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /storage/v1/b//o/xlmfull%2F310000052_full.xml?projection=noAcl (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2555)')))
"""
The above exception was the direct cause of the following exception:
SSLError Traceback (most recent call last)
<ipython-input-81-5c48af30bc80> in <module>
2 start_time = time.time()
3 pool = Pool(processes=8)
----> 4 results = pool.map(extract_entity, range(0, 100))
5 du = time.time() - start_time
6 du
~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in map(self, func, iterable, chunksize)
266 in a list that is returned.
267 '''
--> 268 return self._map_async(func, iterable, mapstar, chunksize).get()
269
270 def starmap(self, func, iterable, chunksize=None):
~/opt/anaconda3/envs/husx/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
SSLError: None: Max retries exceeded with url: /storage/v1/b//o/xlmfull%2F310000052_full.xml?projection=noAcl (Caused by None)
** Кажется, комбинация time.sleep и числа процессора меняет результат. есть ли такое ограничение на ведра gcp?