You can use dask's delayed interface to set up a custom workflow:
import pandas as pd
import dask
import distributed
# start a local cluster; by default one worker per core
client = distributed.Client()
@dask.delayed
def my_sum(x, a):
    return x + a
df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0]})
# Here, we mimic the apply command. However, we do not
# actually run any computation yet. Instead, this line of
# code produces a list of delayed objects, each of which
# records what computation should eventually be performed
delayeds = [my_sum(df[column], 2) for column in df.columns]
# send the list of delayed objects to the cluster, which
# starts computing the results in parallel. This returns
# future objects that point to the computations while they
# are still running
futures = client.compute(delayeds)
# gather all the results as soon as they are ready. This
# returns a list of pandas Series objects, each of which is
# one column of the output dataframe
computed_columns = client.gather(futures)
# create dataframe out of individual columns
computed_df = pd.concat(computed_columns, axis=1)
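For reference, since my_sum simply adds 2 to every element, computed_df should end up looking like this:

   num_legs  num_wings
0         4          4
1         6          2
2        10          2
3         2          2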
Alternatively, you can also use dask's multiprocessing backend:
import pandas as pd
import dask
@dask.delayed
def my_sum(x, a):
    return x + a
df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0]})
# same as above
delayeds = [my_sum(df[column], 2) for column in df.columns]
# run the computation using dask's multiprocessing backend.
# Unpacking the list makes dask.compute return a tuple of
# Series instead of a one-element tuple containing the list
computed_columns = dask.compute(*delayeds, scheduler='processes')
# create dataframe out of individual columns
computed_df = pd.concat(computed_columns, axis=1)
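A note on scheduler choice: the processes scheduler has to pickle each column and ship it to a worker process, so for a cheap element-wise operation like this the threaded scheduler may actually be faster, since pandas/numpy release the GIL for most of the work. A minimal variant, assuming the same delayeds list as above:

# same computation on dask's threaded scheduler, which avoids
# the pickling overhead of the processes scheduler
computed_columns = dask.compute(*delayeds, scheduler='threads')
computed_df = pd.concat(computed_columns, axis=1)

Also note that if you run either example as a script rather than interactively, the processes scheduler and distributed.Client spawn worker processes, so the code may need to live under the usual if __name__ == '__main__': guard.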