`dask.distributed.Client` создаёт `LocalCluster`, для которого можно явно задать лимит памяти и количество ядер.
import numpy as np
import pandas as pd
from dask.distributed import Client
from dask import dataframe as dd
def names_unique(x):
    """Return the distinct values of the ``'Names'`` column of *x*.

    Args:
        x: Any object that can be indexed with the key ``'Names'`` and whose
            result exposes a ``unique()`` method (for example a pandas or
            Dask DataFrame).

    Returns:
        Whatever ``x['Names'].unique()`` yields for the given container.
    """
    return x['Names'].unique()
# Create a client with an explicit memory limit and explicit worker/thread
# counts. The context manager closes the client even when data generation,
# the parquet round-trip, or the computation raises.
with Client(memory_limit='2GB', processes=False,
            n_workers=1, threads_per_worker=2) as client:
    # Data generation: one million rows with a random label and a running
    # counter, written to disk so that Dask can read them back as parquet.
    df = pd.DataFrame({
        'Names': np.random.choice(['A', 'B', 'C', 'D'], size=1000000),
        'sales': np.arange(1000000),
    })
    df.to_parquet('parq_df')
    # NOTE(review): `npartitions` does not appear to be a documented
    # `read_parquet` parameter and may be ignored or rejected depending on
    # the dask version -- confirm against the installed release.
    ddf = dd.read_parquet('parq_df', npartitions=10)

    # Custom computation: run the helper through the client, then
    # materialise the lazy result it returns.
    sent = client.submit(names_unique, ddf)
    # NOTE: this rebinds `names_unique` from the helper function to its
    # computed result, so the function is no longer callable by that name.
    names_unique = sent.result().compute()
Вывод:
names_unique
Out[89]:
0 D
1 B
2 C
3 A
Name: Names, dtype: object