Я запускаю сценарий, который подгоняет одну и ту же модель PyMC3
к разным наборам данных на кластере с несколькими узлами. Для распараллеливания внутри узла я использую joblib,
а чтобы задействовать несколько узлов, я просто запускаю этот же сценарий отдельно на каждом узле. Через некоторое время я получаю следующую ошибку:
Traceback (most recent call last):
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "fit_multiple_events.py", line 167, in fit_reparametrized_model
# Plot model
File "../utils.py", line 46, in run_sampling
step=xo.get_dense_nuts_step(),
File "/home/fb90/anaconda3/lib/python3.7/site-packages/exoplanet/quadpotential.py", line 203, in get_dense_nuts_step
return pm.NUTS(potential=potential, model=model, **kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/hmc/nuts.py", line 152, in __init__
super().__init__(vars, **kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/hmc/base_hmc.py", line 72, in __init__
super().__init__(vars, blocked=blocked, model=model, dtype=dtype, **theano_kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/arraystep.py", line 228, in __init__
vars, dtype=dtype, **theano_kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/pymc3/model.py", line 723, in logp_dlogp_function
return ValueGradFunction(self.logpt, grad_vars, extra_vars, **kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/pymc3/model.py", line 462, in __init__
inputs, [self._cost_joined, grad], givens=givens, **kwargs)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/compile/function.py", line 317, in function
output_keys=output_keys)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/compile/pfunc.py", line 486, in pfunc
output_keys=output_keys)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/compile/function_module.py", line 1841, in orig_function
fn = m.create(defaults)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/compile/function_module.py", line 1715, in create
input_storage=input_storage_lists, storage_map=storage_map)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/link.py", line 699, in make_thunk
storage_map=storage_map)[:3]
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/vm.py", line 1091, in make_all
impl=impl))
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/op.py", line 955, in make_thunk
no_recycling)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/op.py", line 858, in make_c_thunk
output_storage=node_output_storage)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/cc.py", line 1217, in make_thunk
keep_lock=keep_lock)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/cc.py", line 1157, in __compile__
keep_lock=keep_lock)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/cc.py", line 1624, in cthunk_factory
key=key, lnk=self, keep_lock=keep_lock)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/cmodule.py", line 1155, in module_from_key
module = self._get_from_hash(module_hash, key, keep_lock=keep_lock)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/cmodule.py", line 1055, in _get_from_hash
key_data.add_key(key, save_pkl=bool(key[0]))
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/cmodule.py", line 519, in add_key
self.save_pkl()
File "/home/fb90/anaconda3/lib/python3.7/site-packages/theano/gof/cmodule.py", line 540, in save_pkl
with open(self.key_pkl, 'wb') as f:
FileNotFoundError: [Errno 2] No such file or directory: '/gpfs1/home/fb90/.theano/compiledir_Linux-2.6-el6.Bull.122.x86_64-x86_64-with-redhat-6.4-Santiago-x86_64-3.7.3-64/tmpotc6hmx4/key.pkl'
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "fit_multiple_events.py", line 232, in <module>
trajectory = ca.trajectory.Trajectory(event, model.t0, model.u0, model.tE)
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 934, in __call__
self.retrieve()
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 833, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "/home/fb90/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 521, in wrap_future_result
return future.result(timeout=timeout)
File "/home/fb90/anaconda3/lib/python3.7/concurrent/futures/_base.py", line 425, in result
return self.__get_result()
File "/home/fb90/anaconda3/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
FileNotFoundError: [Errno The following error happened while compiling the node] Elemwise{Composite{(i0 - ((i1 * i2) + i3))}}(TensorConstant{[52.651766...48194605]}, Elemwise{exp,no_inplace}.0, InplaceDimShuffle{1}.0, Elemwise{exp,no_inplace}.0): '\n' -> 'No such file or directory'
slurmstepd: task_p_post_term: rmdir(/dev/cpuset/slurm140858/slurm140858.4294967294_0) failed Device or resource busy
Это происходит только тогда, когда я одновременно запускаю сценарий на нескольких узлах кластера. Если я использую только один узел, ошибка не возникает.
Я нашел тему 2015 года, в которой упоминается та же проблема, но без решения: https://groups.google.com/forum/#!topic/theano-users/Pi4zQpfn5Ts
Я использую последнюю версию Theano, 1.0.4.