CatBoost version: 0.21
Operating system: Windows
CPU: Intel Core i9
I am running the CatBoost Python classification tutorial with the Amazon dataset (https://github.com/catboost/tutorials/blob/master/classification/classification_tutorial.ipynb).
To make life easier for you guys, I have extracted the relevant code (from the tutorial) below.
- I run the example with 30,872 train samples.
- Then I remove a single train sample (1/30,872), so 30,871 train samples remain, and run again.
- Then I compare the prediction scores for all 58,921 test samples between the first and the second run.
The result shows that individual predictions change by more than 10%, 20%, 30%, 40%, even 50% from the first run to the second...!? More than 1% of the prediction scores change by more than 10%, and every single prediction score is affected.
Is this expected from CatBoost? Personally I find it hard to believe that such a tiny change should affect the result this much. Please correct me if I am wrong.
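One way to judge whether this is expected is to keep the data identical and vary only the seed, then measure the same churn statistics as a baseline: if seed-only retraining produces comparable churn, the sensitivity comes from training stochasticity rather than from the removed sample. A minimal sketch (not part of the tutorial; it assumes `X`, `y`, `X_test` and `cat_features` as defined in the script below):

```python
# Hedged sketch: measure baseline prediction churn caused by the seed alone.
import numpy as np
from catboost import CatBoostClassifier, Pool

def churn_vs_seed(X, y, X_test, cat_features, seeds=(0, 1)):
    """Train the same model twice with different seeds; report score churn."""
    probas = []
    for seed in seeds:
        model = CatBoostClassifier(iterations=1000, random_seed=seed, verbose=False)
        model.fit(X, y, cat_features=cat_features)
        probas.append(model.predict_proba(Pool(X_test, cat_features=cat_features))[:, 1])
    diff = np.abs(probas[0] - probas[1])
    print('mean |diff|: {:.4f} | max |diff|: {:.4f} | count > 0.1: {}'.format(
        diff.mean(), diff.max(), int((diff > 0.1).sum())))
```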
### DESCRIPTION:
# Runs the classification tutorial twice, removing one matrix (one training sample) from the training dataset on the second run. Afterwards the prediction scores of the two runs are compared and printed
### USAGE:
# python.exe /example.py -1 ---- runs the tutorial twice without removing any matrices
# python.exe /example.py ---- runs the tutorial removing training matrix at index 0
# python.exe /example.py 321 ---- runs the tutorial removing training matrix at index 321
# # Solving classification problems with CatBoost
# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/classification/classification_tutorial.ipynb)
#
# In this tutorial we will use dataset Amazon Employee Access Challenge from [Kaggle](https://www.kaggle.com) competition for our experiments. Data can be downloaded [here](https://www.kaggle.com/c/amazon-employee-access-challenge/data).
# ## Libraries installation
import sys
import catboost
print(catboost.__version__)
# ## Reading the data
from copy import deepcopy
import math
import pandas as pd
import os
import numpy as np
np.set_printoptions(precision=4)
from catboost import *  # brings Pool, CatBoost, CatBoostClassifier, etc. into scope
from catboost import datasets
from catboost.utils import create_cd
from catboost import CatBoostClassifier
from catboost.eval.catboost_evaluation import *
from catboost.eval.evaluation_result import *
from sklearn.model_selection import train_test_split
def run_tutorial(dropped_id):
(train_df, test_df) = catboost.datasets.amazon()
    train_df.head()  # notebook residue; a no-op when run as a script
y = train_df.ACTION
X = train_df.drop('ACTION', axis=1)
############################################# (NOT PART OF TUTORIAL) #############################################
if dropped_id is not None:
if dropped_id >= len(y):
            print('\nIndex must not exceed {}\n'.format(len(y) - 1))
sys.exit()
elif dropped_id > -1:
X = X.drop(X.index[dropped_id])
y = y.drop(y.index[dropped_id])
############################################# (NOT PART OF TUTORIAL) #############################################
cat_features = list(range(0, X.shape[1]))
print(cat_features)
print('\nALL MATRICES COUNT:\nLabels: {}'.format(set(y)))
print('Zero count = {}, One count = {}\n'.format(len(y) - sum(y), sum(y)))
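    # Note: the labels are heavily imbalanced (ACTION=1 for ~94% of the rows), so
    # most test predictions end up close to probability 1 for class 1.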
dataset_dir = './amazon'
if not os.path.exists(dataset_dir):
os.makedirs(dataset_dir)
train_df.to_csv(
os.path.join(dataset_dir, 'train.tsv'),
index=False, sep='\t', header=False
)
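    # Note: train.tsv is written from the full, unmodified train_df, so the feature
    # evaluation below never sees the dropped sample; only the models fit on X/y do.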
feature_names = dict()
for column, name in enumerate(train_df):
if column == 0:
continue
feature_names[column - 1] = name
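    # create_cd writes a column-description file marking column 0 as the label and
    # all remaining columns as categorical features.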
create_cd(
label=0,
cat_features=list(range(1, train_df.columns.shape[0])),
feature_names=feature_names,
output_path=os.path.join(dataset_dir, 'train.cd')
)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.8, random_state=1234)
learn_params = {'iterations': 20, # 2000
'learning_rate': 0.5, # we set big learning_rate,
# because we have small
# #iterations
'random_seed': 0,
'verbose': False,
'loss_function' : 'Logloss',
'boosting_type': 'Plain'}
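    # 'Plain' is the classic boosting scheme; CatBoost's 'Ordered' scheme is designed
    # to reduce prediction shift, which may matter for stability comparisons like this.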
evaluator = CatboostEvaluation('amazon/train.tsv',
fold_size=10000, # <= 50% of dataset
fold_count=20,
column_description='amazon/train.cd',
partition_random_seed=0,
#working_dir=...
)
result = evaluator.eval_features(learn_config=learn_params,
eval_metrics=['Logloss', 'Accuracy'],
features_to_eval=[6, 7, 8])
logloss_result = result.get_metric_results('Logloss')
logloss_result.get_baseline_comparison(
ScoreConfig(ScoreType.Rel, overfit_iterations_info=False)
)
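    # eval_features trains fold models with and without features 6-8 and compares the
    # requested metrics against the baseline; the table returned by
    # get_baseline_comparison is discarded here (it only displays in a notebook).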
    tuned_model = CatBoostClassifier(
random_seed=63,
iterations=1000,
learning_rate=0.03,
l2_leaf_reg=3,
bagging_temperature=1,
random_strength=1,
one_hot_max_size=2,
leaf_estimation_method='Newton'
)
    tuned_model.fit(
X_train, y_train,
cat_features=cat_features,
verbose=False,
eval_set=(X_validation, y_validation),
        plot=True  # notebook widget; prints a placeholder object when run as a script
)
    # ## Training the model after parameter tuning
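    # The tutorial retrains on all of X/y (train + validation) with ~20% more trees
    # than the tuned model kept, presumably to compensate for the larger dataset.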
best_model = CatBoostClassifier(
random_seed=63,
        iterations=int(tuned_model.tree_count_ * 1.2),
)
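    # Note: tree_count_ depends on the train/validation split above, and dropping one
    # row changes the permutation train_test_split draws, so the two runs also end up
    # with different final iteration counts (see the logs below), not just one
    # differing training sample.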
best_model.fit(
X, y,
cat_features=cat_features,
verbose=100
)
# ## Calculate predictions for the contest
X_test = test_df.drop('id', axis=1)
test_pool = Pool(data=X_test, cat_features=cat_features)
contest_predictions = best_model.predict_proba(test_pool)
    print('\nPredictions:')
print(contest_predictions)
y_pred = [np.argmax(p) for p in contest_predictions]
return len(X_train), y_pred, contest_predictions
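# Note: run_tutorial returns len(X_train), i.e. the size of the 80% split, which is
# what the diff printout below labels "matrices"; best_model itself was fit on all of X.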
if __name__ == '__main__':
print('\n{0:} ALL MATRICES {0:}\n'.format('='*53))
train_len_all, y_pred_all, pred_scores_all = run_tutorial(None)
    dropped_id = int(sys.argv[1]) if len(sys.argv) > 1 else 0  # default: drop index 0
print('\n{0:} ALL EXCEPT 1 MATRIX {0:}\n'.format('='*49))
train_len_dropped, y_pred_dropped, pred_scores_dropped = run_tutorial(dropped_id)
### CALCULATE DIFFERENCES
print('\n\nDIFFERENCES (FIRST 50):\n')
changed_pred_indices = []
differing_indices = []
above_10_pct_change = 0
    for i, proba in enumerate(pred_scores_all):
        pred_score_all, pred_score_dropped = np.max(proba), np.max(pred_scores_dropped[i])
        lbl_pred_all, lbl_pred_dropped = y_pred_all[i], y_pred_dropped[i]
        lbl_changed = lbl_pred_all != lbl_pred_dropped
        if pred_score_all != pred_score_dropped or lbl_changed:
            # If the predicted label flipped, compare both runs on the same class:
            # (1 - score) converts the winning-class score to the other class's score.
            if lbl_changed:
                pct_change = 100 - (((1 - pred_score_all) / pred_score_dropped) * 100)
            else:
                pct_change = 100 - ((pred_score_all / pred_score_dropped) * 100)
            changed_pred_indices.append({'index': i, 'pct_change': pct_change})
            if abs(pct_change) > 10:
                above_10_pct_change += 1
            if i < 10 or (above_10_pct_change < 40 and abs(pct_change) > 10):
                print('Index: {:>4} | ({} matrices): {:.4f} | ({} matrices): {:.4f} | pct_change: {:6.2f}%{}'.format(
                    i, train_len_all, pred_score_all, train_len_dropped, pred_score_dropped,
                    pct_change, ' <== DIFFERENT LABEL' if lbl_changed else ''))
            if lbl_changed:
                differing_indices.append(i)
avg = np.average([abs(c['pct_change']) for c in changed_pred_indices]) if len(changed_pred_indices) > 0 else 0
print('\nTest size: {} | different labels: {} | changed prediction scores: {} | more than 10 pct change: {} | avg pct_change: {:.2f}%\n'.format(len(y_pred_all), len(differing_indices), len(changed_pred_indices), above_10_pct_change, avg))
#############################################################################################################
################################################## RESULT ###################################################
#############################################################################################################
# D:\example_class_tutorial> python.exe .\example_class_tutorial.py 32768
# 0.21
# ===================================================== ALL MATRICES =====================================================
# [0, 1, 2, 3, 4, 5, 6, 7, 8]
# ALL MATRICES COUNT:
# Labels: {0, 1}
# Zero count = 1897, One count = 30872
# C:\Program Files\Python36\lib\site-packages\sklearn\model_selection\_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
# FutureWarning)
# <IPython.core.display.HTML object>
# MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
# Learning rate set to 0.041438
# 0: learn: 0.6437243 total: 21ms remaining: 23.4s
# 100: learn: 0.1529288 total: 3.53s remaining: 35.4s
# 200: learn: 0.1465218 total: 8.59s remaining: 39s
# 300: learn: 0.1427914 total: 13.3s remaining: 36s
# 400: learn: 0.1395288 total: 18.2s remaining: 32.4s
# 500: learn: 0.1367097 total: 23.2s remaining: 28.3s
# 600: learn: 0.1334455 total: 28.5s remaining: 24.3s
# 700: learn: 0.1306259 total: 33.4s remaining: 19.6s
# 800: learn: 0.1279926 total: 38.5s remaining: 15s
# 900: learn: 0.1252181 total: 43.6s remaining: 10.3s
# 1000: learn: 0.1229690 total: 48.3s remaining: 5.41s
# 1100: learn: 0.1216446 total: 52s remaining: 567ms
# 1112: learn: 0.1213643 total: 52.6s remaining: 0us
# Predictions:
# [[0.4535 0.5465]
# [0.0155 0.9845]
# [0.012 0.988 ]
# ...
# [0.0051 0.9949]
# [0.0517 0.9483]
# [0.0127 0.9873]]
# ================================================= ALL EXCEPT 1 MATRIX =================================================
# [0, 1, 2, 3, 4, 5, 6, 7, 8]
# ALL MATRICES COUNT:
# Labels: {0, 1}
# Zero count = 1897, One count = 30871
# <IPython.core.display.HTML object>
# MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
# Learning rate set to 0.039153
# 0: learn: 0.6463627 total: 12.2ms remaining: 14.4s
# 100: learn: 0.1548777 total: 3.27s remaining: 35s
# 200: learn: 0.1465521 total: 8.01s remaining: 39.1s
# 300: learn: 0.1425796 total: 12.7s remaining: 37.2s
# 400: learn: 0.1399863 total: 17.3s remaining: 33.9s
# 500: learn: 0.1375170 total: 21.8s remaining: 29.8s
# 600: learn: 0.1347887 total: 26.4s remaining: 25.7s
# 700: learn: 0.1322360 total: 31.2s remaining: 21.5s
# 800: learn: 0.1296949 total: 35.9s remaining: 17.2s
# 900: learn: 0.1271796 total: 40.6s remaining: 12.8s
# 1000: learn: 0.1247210 total: 45.3s remaining: 8.28s
# 1100: learn: 0.1226686 total: 50.1s remaining: 3.77s
# 1183: learn: 0.1206776 total: 54s remaining: 0us
# Predictions:
# [[0.3598 0.6402]
# [0.0182 0.9818]
# [0.0089 0.9911]
# ...
# [0.0053 0.9947]
# [0.0433 0.9567]
# [0.0095 0.9905]]
# DIFFERENCES (FIRST 50):
# Index: 0 | (26215 matrices): 0.5465 | (26214 matrices): 0.6402 | pct_change: 14.62%
# Index: 1 | (26215 matrices): 0.9845 | (26214 matrices): 0.9818 | pct_change: -0.28%
# Index: 2 | (26215 matrices): 0.9880 | (26214 matrices): 0.9911 | pct_change: 0.31%
# Index: 3 | (26215 matrices): 0.9928 | (26214 matrices): 0.9910 | pct_change: -0.19%
# Index: 4 | (26215 matrices): 0.9945 | (26214 matrices): 0.9955 | pct_change: 0.10%
# Index: 5 | (26215 matrices): 0.9912 | (26214 matrices): 0.9906 | pct_change: -0.06%
# Index: 6 | (26215 matrices): 0.9926 | (26214 matrices): 0.9891 | pct_change: -0.36%
# Index: 7 | (26215 matrices): 0.9951 | (26214 matrices): 0.9964 | pct_change: 0.12%
# Index: 8 | (26215 matrices): 0.8384 | (26214 matrices): 0.8653 | pct_change: 3.11%
# Index: 9 | (26215 matrices): 0.9890 | (26214 matrices): 0.9920 | pct_change: 0.30%
# Index: 16 | (26215 matrices): 0.8765 | (26214 matrices): 0.7050 | pct_change: -24.33%
# Index: 69 | (26215 matrices): 0.7027 | (26214 matrices): 0.5122 | pct_change: -37.19%
# Index: 101 | (26215 matrices): 0.6686 | (26214 matrices): 0.8172 | pct_change: 18.19%
# Index: 180 | (26215 matrices): 0.6942 | (26214 matrices): 0.5798 | pct_change: -19.73%
# Index: 209 | (26215 matrices): 0.7661 | (26214 matrices): 0.8756 | pct_change: 12.50%
# Index: 275 | (26215 matrices): 0.7243 | (26214 matrices): 0.5553 | pct_change: -30.44%
# Index: 296 | (26215 matrices): 0.5992 | (26214 matrices): 0.5145 | pct_change: -16.47%
# Index: 302 | (26215 matrices): 0.7977 | (26214 matrices): 0.7167 | pct_change: -11.31%
# Index: 353 | (26215 matrices): 0.7661 | (26214 matrices): 0.5768 | pct_change: -32.82%
# Index: 359 | (26215 matrices): 0.7295 | (26214 matrices): 0.6604 | pct_change: -10.46%
# Index: 363 | (26215 matrices): 0.9067 | (26214 matrices): 0.7345 | pct_change: -23.44%
# Index: 376 | (26215 matrices): 0.5907 | (26214 matrices): 0.5620 | pct_change: 27.17% <== DIFFERENT LABEL
# Index: 380 | (26215 matrices): 0.9165 | (26214 matrices): 0.8028 | pct_change: -14.17%
# Index: 386 | (26215 matrices): 0.8241 | (26214 matrices): 0.7430 | pct_change: -10.92%
# Index: 437 | (26215 matrices): 0.8148 | (26214 matrices): 0.5204 | pct_change: 64.40% <== DIFFERENT LABEL
# Index: 513 | (26215 matrices): 0.9262 | (26214 matrices): 0.8395 | pct_change: -10.33%
# Index: 523 | (26215 matrices): 0.9396 | (26214 matrices): 0.7359 | pct_change: -27.67%
# Index: 546 | (26215 matrices): 0.5608 | (26214 matrices): 0.5093 | pct_change: 13.77% <== DIFFERENT LABEL
# Index: 570 | (26215 matrices): 0.7127 | (26214 matrices): 0.5820 | pct_change: -22.45%
# Index: 573 | (26215 matrices): 0.6991 | (26214 matrices): 0.8089 | pct_change: 13.57%
# Index: 615 | (26215 matrices): 0.5329 | (26214 matrices): 0.6129 | pct_change: 23.79% <== DIFFERENT LABEL
# Index: 647 | (26215 matrices): 0.7050 | (26214 matrices): 0.8416 | pct_change: 16.23%
# Index: 665 | (26215 matrices): 0.5233 | (26214 matrices): 0.5452 | pct_change: 12.56% <== DIFFERENT LABEL
# Index: 672 | (26215 matrices): 0.5144 | (26214 matrices): 0.6450 | pct_change: 20.25%
# Index: 680 | (26215 matrices): 0.5642 | (26214 matrices): 0.6718 | pct_change: 35.12% <== DIFFERENT LABEL
# Index: 695 | (26215 matrices): 0.6807 | (26214 matrices): 0.7753 | pct_change: 12.20%
# Index: 784 | (26215 matrices): 0.5270 | (26214 matrices): 0.7096 | pct_change: 25.73%
# Index: 814 | (26215 matrices): 0.6502 | (26214 matrices): 0.7253 | pct_change: 10.35%
# Index: 849 | (26215 matrices): 0.6538 | (26214 matrices): 0.5683 | pct_change: -15.04%
# Index: 892 | (26215 matrices): 0.6595 | (26214 matrices): 0.7540 | pct_change: 12.54%
# Index: 951 | (26215 matrices): 0.5893 | (26214 matrices): 0.5217 | pct_change: -12.97%
# Index: 955 | (26215 matrices): 0.7544 | (26214 matrices): 0.5025 | pct_change: 51.13% <== DIFFERENT LABEL
# Index: 979 | (26215 matrices): 0.7353 | (26214 matrices): 0.6160 | pct_change: -19.38%
# Index: 983 | (26215 matrices): 0.6603 | (26214 matrices): 0.7593 | pct_change: 13.04%
# Index: 1016 | (26215 matrices): 0.6067 | (26214 matrices): 0.5146 | pct_change: 23.58% <== DIFFERENT LABEL
# Index: 1134 | (26215 matrices): 0.5097 | (26214 matrices): 0.6968 | pct_change: 26.84%
# Index: 1208 | (26215 matrices): 0.6113 | (26214 matrices): 0.7034 | pct_change: 13.08%
# Index: 1286 | (26215 matrices): 0.5651 | (26214 matrices): 0.5651 | pct_change: 23.05% <== DIFFERENT LABEL
# Test size: 58921 | different labels: 529 | changed prediction scores: 58921 | more than 10 pct change: 2193 | avg pct_change: 1.45%