У меня проблема мультиклассовой классификации.В поиске сетки перекрестной проверки, чтобы найти лучшие (гипер) настройки параметров, я обнаружил, что случайный лес крайне неэффективен (точность = 0,412, в то время как другие алгоритмы ML достигли 0,70 или выше).Я понимаю, что нет необходимости в красном флаге, поскольку разные алгоритмы ML могут работать лучше / хуже в разных пространствах решения проблем.Но мне интересно, правильно ли я устанавливаю диапазон возможных гиперпараметров.
ml_algo_param_dict = \
{
'LR_V1': { 'clf': LogisticRegression(),
'param': {
'logisticregression__solver': ['liblinear'],
'logisticregression__penalty': ['l1', 'l2'],
'logisticregression__C': np.logspace(-4, 4, 20),
'logisticregression__tol': np.logspace(-5, 5, 20),
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__multi_class': ['ovr', 'auto'],
'logisticregression__max_iter': [4000, 20000],
}},
'LR_V2': { 'clf': LogisticRegression(),
'param': {
'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
'logisticregression__penalty': ['none', 'l2'],
'logisticregression__C': np.logspace(-4, 4, 20),
'logisticregression__tol': np.logspace(-5, 5, 20),
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__multi_class': ['ovr', 'multinomial', 'auto'],
'logisticregression__max_iter': [4000, 20000],
}},
'SVC': { 'clf': OneVsRestClassifier(LinearSVC()),
'param': {
'onevsrestclassifier__estimator__penalty': ['l2'],
'onevsrestclassifier__estimator__loss': ['hinge', 'squared_hinge'],
'onevsrestclassifier__estimator__C': np.logspace(-4, 4, 20),
'onevsrestclassifier__estimator__tol': np.logspace(-5, 5, 20),
'onevsrestclassifier__estimator__class_weight': [None, 'balanced'],
'onevsrestclassifier__estimator__multi_class': ['ovr', 'crammer_singer'],
'onevsrestclassifier__estimator__max_iter': [50, 1000, 4000, 20000],
}},
'RF': {'clf': RandomForestClassifier(),
'param': {
'randomforestclassifier__n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200, 500, ],
'randomforestclassifier__criterion': ['gini', 'entropy'],
'randomforestclassifier__class_weight': [None, 'balanced', 'balanced_subsample'],
'randomforestclassifier__max_depth': np.linspace(1, 10, 32, endpoint=True),
'randomforestclassifier__min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
'randomforestclassifier__min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
'randomforestclassifier__max_leaf_nodes': [None, 50, 100, 200, 400],
'randomforestclassifier__max_features': [None, 'auto', 'sqrt', 'log2'],
}},
'NB': {'clf': BernoulliNB(),
'param': {
'bernoullinb__alpha': np.logspace(-4, 4, 20),
'bernoullinb__binarize': [None, 0, .2, .4, .6, .8, 1],
'bernoullinb__fit_prior': [True, False],
}},
}
Результат
>> Best score: 0.712
>> Best parameter:
Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('num',
Pipeline(memory=None,
steps=[('imputer',
SimpleImputer(add_indicator=False,
copy=True,
fill_value=None,
missing_values=nan,
strategy='median',
verbose=0)),
('scaler',
StandardScaler(copy=True,
with...
verbose=False),
['LOC_ENTITY_LIST'])],
verbose=False)),
('logisticregression',
LogisticRegression(C=0.03359818286283781,
class_weight='balanced', dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=4000,
multi_class='ovr', n_jobs=None,
penalty='l2', random_state=None,
solver='liblinear', tol=1e-05, verbose=0,
warm_start=False))],
verbose=False)
>> Best selected parameter:
{'logisticregression__tol': 1e-05, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'logisticregression__multi_class': 'ovr', 'logisticregression__max_iter': 4000, 'logisticregression__class_weight': 'balanced', 'logisticregression__C': 0.03359818286283781}
>> Best score: 0.738
>> Best parameter:
Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('num',
Pipeline(memory=None,
steps=[('imputer',
SimpleImputer(add_indicator=False,
copy=True,
fill_value=None,
missing_values=nan,
strategy='median',
verbose=0)),
('scaler',
StandardScaler(copy=True,
with...
verbose=False),
['LOC_ENTITY_LIST'])],
verbose=False)),
('logisticregression',
LogisticRegression(C=0.23357214690901212, class_weight=None,
dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None,
max_iter=20000, multi_class='ovr',
n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs',
tol=0.01438449888287663, verbose=0,
warm_start=False))],
verbose=False)
>> Best selected parameter:
{'logisticregression__tol': 0.01438449888287663, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'logisticregression__multi_class': 'ovr', 'logisticregression__max_iter': 20000, 'logisticregression__class_weight': None, 'logisticregression__C': 0.23357214690901212}
>> Best score: 0.708
>> Best parameter:
Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('num',
Pipeline(memory=None,
steps=[('imputer',
SimpleImputer(add_indicator=False,
copy=True,
fill_value=None,
missing_values=nan,
strategy='median',
verbose=0)),
('scaler',
StandardScaler(copy=True,
with...
verbose=False),
['LOC_ENTITY_LIST'])],
verbose=False)),
('onevsrestclassifier',
OneVsRestClassifier(estimator=LinearSVC(C=78.47599703514607,
class_weight='balanced',
dual=True,
fit_intercept=True,
intercept_scaling=1,
loss='hinge',
max_iter=4000,
multi_class='ovr',
penalty='l2',
random_state=None,
tol=3.359818286283781e-05,
verbose=0),
n_jobs=None))],
verbose=False)
>> Best selected parameter:
{'onevsrestclassifier__estimator__tol': 3.359818286283781e-05, 'onevsrestclassifier__estimator__penalty': 'l2', 'onevsrestclassifier__estimator__multi_class': 'ovr', 'onevsrestclassifier__estimator__max_iter': 4000, 'onevsrestclassifier__estimator__loss': 'hinge', 'onevsrestclassifier__estimator__class_weight': 'balanced', 'onevsrestclassifier__estimator__C': 78.47599703514607}
>> Best score: 0.412
>> Best parameter:
Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('num',
Pipeline(memory=None,
steps=[('imputer',
SimpleImputer(add_indicator=False,
copy=True,
fill_value=None,
missing_values=nan,
strategy='median',
verbose=0)),
('scaler',
StandardScaler(copy=True,
with...
RandomForestClassifier(bootstrap=True, class_weight=None,
criterion='gini',
max_depth=6.806451612903226,
max_features=None, max_leaf_nodes=50,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=0.2,
min_samples_split=0.2,
min_weight_fraction_leaf=0.0,
n_estimators=200, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False))],
verbose=False)
>> Best selected parameter:
{'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__min_samples_split': 0.2, 'randomforestclassifier__min_samples_leaf': 0.2, 'randomforestclassifier__max_leaf_nodes': 50, 'randomforestclassifier__max_features': None, 'randomforestclassifier__max_depth': 6.80645161290322 6, 'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__class_weight': None}
>> Best score: 0.697
>> Best parameter:
Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('num',
Pipeline(memory=None,
steps=[('imputer',
SimpleImputer(add_indicator=False,
copy=True,
fill_value=None,
missing_values=nan,
strategy='median',
verbose=0)),
('scaler',
StandardScaler(copy=True,
with...
lowercase=True,
max_df=1.0,
max_features=5000,
min_df=1,
ngram_range=(1,
1),
preprocessor=None,
stop_words=None,
strip_accents=None,
token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None,
vocabulary=None))],
verbose=False),
['LOC_ENTITY_LIST'])],
verbose=False)),
('bernoullinb',
BernoulliNB(alpha=0.00026366508987303583, binarize=0.6,
class_prior=None, fit_prior=True))],
verbose=False)
>> Best selected parameter:
{'bernoullinb__fit_prior': True, 'bernoullinb__binarize': 0.6, 'bernoullinb__alpha': 0.00026366508987303583}
Любое предложение о том, что делать / тестировать дальше, и объяснение, почему это так, будут наиболее цениться.