Я сталкиваюсь с ошибкой при использовании функций кодирования меток (label encoding). Чтобы воспроизвести мой случай (изначально я импортировал CSV-файл в dask dataframe и после очистки оставил 28 столбцов), я создал dask dataframe, как показано ниже:
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask_ml.preprocessing import LabelEncoder

# Minimal reproduction: a small two-column frame built in memory.
# Both columns must have the same length, so the row count is named once.
n_rows = 1700000
# Column A holds random country codes (the categorical column to be encoded);
# column B is a running integer.
country = np.random.choice(['US', 'UK', 'IN'], n_rows)
df = pd.DataFrame({'A': country, 'B': range(n_rows)})
# Wrap the pandas frame in a dask dataframe split into two partitions.
ddf = dd.from_pandas(df, npartitions=2, sort=False)
Затем я попытался выполнить кодирование меток (label encoding) для категориальных столбцов, как показано ниже:
# Fit a dask-ml LabelEncoder on column A and try to write the encoded values
# back into the dataframe as column A.
le = LabelEncoder()
# NOTE(review): this is the statement that raises
# "ValueError: Not all divisions are known, can't align partitions"
# (see the traceback that follows). Presumably the Series built by
# dd.from_dask_array has unknown divisions, so assign() cannot align it with
# ddf -- TODO confirm against the dask documentation.
ddf = ddf.assign(A=dd.from_dask_array(le.fit_transform(ddf['A'])))
Этот код выдал следующую ошибку:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-106-480a5e12886a> in <module>()
10 type(le.fit_transform(ddf['A']))
11 #ddf['A'] = dd.from_array(le.fit_transform(ddf['A']))
---> 12 ddf = ddf.assign(A=dd.from_dask_array(le.fit_transform(ddf['A'])))
/opt/conda/lib/python3.6/site-packages/dask/dataframe/core.py in assign(self, **kwargs)
2698 # Figure out columns of the output
2699 df2 = self._meta.assign(**_extract_meta(kwargs))
-> 2700 return elemwise(methods.assign, self, *pairs, meta=df2)
2701
2702 @derived_from(pd.DataFrame, ua_args=['index'])
/opt/conda/lib/python3.6/site-packages/dask/dataframe/core.py in elemwise(op, *args, **kwargs)
3277
3278 from .multi import _maybe_align_partitions
-> 3279 args = _maybe_align_partitions(args)
3280 dasks = [arg for arg in args if isinstance(arg, (_Frame, Scalar, Array))]
3281 dfs = [df for df in dasks if isinstance(df, _Frame)]
/opt/conda/lib/python3.6/site-packages/dask/dataframe/multi.py in _maybe_align_partitions(args)
145 divisions = dfs[0].divisions
146 if not all(df.divisions == divisions for df in dfs):
--> 147 dfs2 = iter(align_partitions(*dfs)[0])
148 return [a if not isinstance(a, _Frame) else next(dfs2) for a in args]
149 return args
/opt/conda/lib/python3.6/site-packages/dask/dataframe/multi.py in align_partitions(*dfs)
101 raise ValueError("dfs contains no DataFrame and Series")
102 if not all(df.known_divisions for df in dfs1):
--> 103 raise ValueError("Not all divisions are known, can't align "
104 "partitions. Please use `set_index` "
105 "to set the index.")
ValueError: Not all divisions are known, can't align partitions. Please use `set_index` to set the index.