I cannot generate statistics for a DataFrame that contains strings when n_jobs=-1 (or anything other than 1). The error disappears if the strings are replaced with integers/floats.
tensorflow-data-validation 0.21.1, apache-beam 2.17, tensorflow 2.1.0. I have tried many different versions of these libraries; the error has been present since tensorflow-data-validation v0.14.1. The error can be reproduced with:
import pandas as pd
import tensorflow_data_validation as tfdv
df = pd.DataFrame({'a': ['s', 'd'], 'b': [3, 4]})
tfdv.generate_statistics_from_dataframe(df, n_jobs=-1)
which generates:
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
'''
Traceback (most recent call last):
File "/misc/DLshare/home/rpaes866/.local/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 624, in _queue_management_worker
result_item = result_reader.recv()
File "/misc/DLshare/home/rpaes866/.conda/envs/hocosting3.6/lib/python3.6/multiprocessing/connection.py", line 251, in recv
return _ForkingPickler.loads(buf.getbuffer())
TypeError: __init__() takes 1 positional argument but 2 were given
'''
The above exception was the direct cause of the following exception:
BrokenProcessPool Traceback (most recent call last)
<ipython-input-13-015d2d4640da> in <module>
----> 1 tfdv.generate_statistics_from_dataframe(df, n_jobs=-1)
~/.conda/envs/hocosting3.6/lib/python3.6/site-packages/tensorflow_data_validation/utils/stats_gen_lib.py in generate_statistics_from_dataframe(dataframe, stats_options, n_jobs)
240 partial_stats = Parallel(n_jobs=n_jobs)(
241 delayed(_generate_partial_statistics_from_df)(
--> 242 splits[i], stats_options, stats_generators) for i in range(n_jobs))
243 merged_partial_stats = [
244 gen.merge_accumulators(stats)
~/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~/.local/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~/.conda/envs/hocosting3.6/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~/.conda/envs/hocosting3.6/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
BrokenProcessPool: A result has failed to un-serialize. Please ensure that the objects returned by the function are always picklable.
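For reference, a minimal sketch of the two variants that do not fail on my setup (same toy DataFrame as above, either run single-process or with the string column replaced by numbers):

import pandas as pd
import tensorflow_data_validation as tfdv

# Same DataFrame with a string column: works when statistics are computed in a single process.
df = pd.DataFrame({'a': ['s', 'd'], 'b': [3, 4]})
stats = tfdv.generate_statistics_from_dataframe(df, n_jobs=1)

# Purely numeric DataFrame: works even with n_jobs=-1 (multiple worker processes).
df_numeric = pd.DataFrame({'a': [1.0, 2.0], 'b': [3, 4]})
stats_numeric = tfdv.generate_statistics_from_dataframe(df_numeric, n_jobs=-1)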
Has anyone else run into this error? Thanks.