Я сериализовал (pickle) Pipeline с пользовательскими классами с помощью joblib и tempfile, но при загрузке сериализованного конвейера получаю только массив с именами признаков. Я впервые разрабатываю пользовательские классы для конвейеров машинного обучения. Я не знаю, есть ли в scikit-learn готовый класс для выбора столбцов DataFrame и как правильно сделать что-то подобное.
# Custom classes
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the configured columns of its input.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.attribute_names``."""
        selected = X[self.attribute_names]
        return selected
class MeanEncoder(BaseEstimator, TransformerMixin):
    """Encode object-dtype columns as integer ranks ordered by target mean.

    For every object-dtype column seen in :meth:`fit`, the categories are
    sorted by the mean of the target within each category (highest mean
    first) and replaced by their position in that ordering (0, 1, 2, ...).

    Attributes set by :meth:`fit`:

    labels_ : dict mapping column name -> {category: rank}.
    train_cols_ : the columns of the frame passed to :meth:`fit`.
    """

    def __init__(self):
        # No hyper-parameters.
        pass

    @staticmethod
    def _as_frame(X):
        """Return ``X`` as a DataFrame; accept a Series, reject anything else."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, pd.Series):
            return pd.DataFrame(X)
        raise ValueError('Not a pandas DataFrame')

    def fit(self, X, y):
        """Learn the category -> rank mapping for each object-dtype column.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Training features.
        y : array-like exposing ``shape`` and ``copy``
            Target values, one per row of ``X``.
            NOTE(review): a pandas ``y`` is attached to ``X`` by index
            label, not by position -- confirm both share the same index.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not a pandas object or its length differs from ``y``.
        """
        X = self._as_frame(X)
        if X.shape[0] != y.shape[0]:
            raise ValueError('The length of X is different than the length of y')

        df = X.copy()
        df['TARGET'] = y.copy()

        vars_cat = {}
        for col in X.select_dtypes('object').columns:
            # Categories ordered from highest to lowest mean target.
            ordered_labels = (
                df.groupby(col)
                .agg({'TARGET': 'mean'})
                .sort_values('TARGET', ascending=False)
                .index
            )
            vars_cat[col] = {label: rank for rank, label in enumerate(ordered_labels)}

        self.labels_ = vars_cat
        self.train_cols_ = X.columns
        return self

    def transform(self, X):
        """Replace each learned categorical column by its integer rank.

        Categories that were not present during :meth:`fit` have no entry
        in the mapping and therefore come out of ``Series.map`` as NaN.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Data containing at least the columns seen during :meth:`fit`.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` restricted to ``train_cols_`` with the
            categorical columns encoded.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas object or lacks a training column.
        """
        X = self._as_frame(X)

        # Validate before selecting: indexing with an absent label would
        # otherwise fail with a less informative KeyError.
        missing = [col for col in self.train_cols_ if col not in X.columns]
        if missing:
            raise ValueError('Missing the following columns: {}'.format(missing))

        X_transform = X[self.train_cols_].copy()
        for col, mapping in self.labels_.items():
            X_transform[col] = X_transform[col].map(mapping)
        return X_transform
# --------------------------------------------------------------------
# Full modelling pipeline: column selection -> categorical encoding -> model.
pipeline_grid = Pipeline(steps=[
    ('select_vars', DataFrameSelector(vars)),  # closing parenthesis was missing
    ('encoder', MeanEncoder()),
    ('xgboost', xgb.XGBClassifier(random_state=SEED, n_jobs=5, verbosity=2)),
])

# Randomized hyper-parameter search over the whole pipeline.
search = RandomizedSearchCV(
    estimator=pipeline_grid,
    param_distributions=params_dist_grid,
    n_iter=5,
    cv=cv,
    n_jobs=5,
    scoring='roc_auc',
    random_state=SEED,
    verbose=3,
)

# NOTE(review): `best_estimator_` only exists after `search.fit(...)` has been
# called; no fit call is visible in this snippet -- confirm it happens before
# this line.
pipeline_model = search.best_estimator_

s3 = boto3.resource('s3')

# Write
with tempfile.TemporaryFile() as fp:
    # Dump the fitted pipeline itself. The original dumped `pipeline_modelo`,
    # a different name from the `pipeline_model` assigned above, so either a
    # NameError was raised or an unrelated object was serialized.
    joblib.dump(pipeline_model, fp)
    fp.seek(0)  # rewind so the upload reads from the start of the file
    s3.Bucket(NM_BUCKET).put_object(Key=path + name_pipe, Body=fp.read())

# Load
# NOTE: unpickling requires DataFrameSelector and MeanEncoder to be importable
# under the same module path they had when the pipeline was dumped.
s3 = boto3.client('s3')
with tempfile.TemporaryFile() as fp:
    s3.download_fileobj(Fileobj=fp, Bucket=NM_BUCKET, Key=path + name_pipe)
    fp.seek(0)  # rewind before deserializing
    pipe = joblib.load(fp)
Результат ниже:
array(['VAR01', 'VAR02', 'VAR03', 'VAR04', 'VAR05', 'VAR06', 'VAR07', 'VAR08', 'VAR09', 'VAR10'])