CatBoost and prediction deviations
asked 08 March 2020

CatBoost version: 0.21. Operating system: Windows. CPU: Intel i9.

I am running the CatBoost Python classification tutorial with the Amazon dataset (https://github.com/catboost/tutorials/blob/master/classification/classification_tutorial.ipynb).

To make it easier for you guys, I have extracted the relevant code (from the tutorial) below.

  1. I run the example with 30,872 training samples.
  2. Then I remove a single training sample (1/30,872), leaving 30,871 training samples, and run again.
  3. Then I compare the prediction scores for all 58,921 test samples between the first and second runs.

The result shows that individual predictions change by more than 10%, 20%, 30%, 40%, even 50% from the first run to the second...!? More than 1% of the prediction scores change by over 10%, and every single prediction score is affected to some degree.

Is this expected behaviour for CatBoost? Personally, I find it hard to believe that such a tiny change should affect the result this much. Please correct me if I am wrong.
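To separate the effect of the dropped sample from CatBoost's inherent training noise, one sanity check would be to measure how much the predictions move when only the random seed changes and the training data stays identical. This is my own sketch, not part of the tutorial; the helper name seed_churn and the iteration count are arbitrary. If seed-only churn is of a similar magnitude, the swings above are baseline training variance rather than a genuine effect of the dropped sample:

import numpy as np
from catboost import CatBoostClassifier

def seed_churn(X, y, X_test, cat_features, seeds=(0, 1)):
    # Fit the same model twice on IDENTICAL data, varying only random_seed,
    # and report how far apart the positive-class probabilities end up.
    probas = []
    for seed in seeds:
        model = CatBoostClassifier(iterations=200, random_seed=seed, verbose=False)
        model.fit(X, y, cat_features=cat_features)
        probas.append(model.predict_proba(X_test)[:, 1])
    diff = np.abs(probas[0] - probas[1])
    return diff.max(), diff.mean()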

### DESCRIPTION:
#    Runs the classification tutorial twice, removing one matrix (training sample) from the training dataset on the second run. Afterwards the prediction scores of the two runs are compared and printed.

### USAGE:
# python.exe /example.py -1     ---- runs the tutorial twice without removing any matrices
# python.exe /example.py        ---- runs the tutorial removing training matrix at index 0
# python.exe /example.py 321    ---- runs the tutorial removing training matrix at index 321





# # Solving classification problems with CatBoost

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/classification/classification_tutorial.ipynb)
# 
# In this tutorial we will use the Amazon Employee Access Challenge dataset from a [Kaggle](https://www.kaggle.com) competition for our experiments. Data can be downloaded [here](https://www.kaggle.com/c/amazon-employee-access-challenge/data).
# ## Libraries installation

import sys
import catboost
print(catboost.__version__)

# ## Reading the data

import os

import numpy as np
import pandas as pd
np.set_printoptions(precision=4)

from catboost import Pool, CatBoostClassifier, datasets
from catboost.utils import create_cd
from catboost.eval.catboost_evaluation import CatboostEvaluation
from catboost.eval.evaluation_result import ScoreConfig, ScoreType

from sklearn.model_selection import train_test_split

def run_tutorial(dropped_id):
    (train_df, test_df) = catboost.datasets.amazon()

    train_df.head()  # no-op outside a notebook; kept for parity with the tutorial

    y = train_df.ACTION
    X = train_df.drop('ACTION', axis=1)

    ############################################# (NOT PART OF TUTORIAL) #############################################

    # dropped_id == -1 keeps the full training set; 0 <= dropped_id < len(y) drops one row
    if dropped_id is not None:
        if dropped_id >= len(y):
            print('\nIndex must not exceed {}\n'.format(len(y) - 1))
            sys.exit()
        elif dropped_id >= 0:
            X = X.drop(X.index[dropped_id])
            y = y.drop(y.index[dropped_id])

    ############################################# (NOT PART OF TUTORIAL) #############################################

    cat_features = list(range(0, X.shape[1]))
    print(cat_features)

    print('\nALL MATRICES COUNT:\nLabels: {}'.format(set(y)))
    print('Zero count = {}, One count = {}\n'.format(len(y) - sum(y), sum(y)))

    dataset_dir = './amazon'
    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir)

    # NOTE: train.tsv is written from the unmodified train_df, so the
    # eval_features step below sees identical data in both runs; only the
    # models fitted on X/y further down are affected by the dropped row.
    train_df.to_csv(
        os.path.join(dataset_dir, 'train.tsv'),
        index=False, sep='\t', header=False
    )

    # Map feature index -> column name, skipping column 0 (the ACTION label)
    feature_names = dict()
    for column, name in enumerate(train_df):
        if column == 0:
            continue
        feature_names[column - 1] = name

    create_cd(
        label=0, 
        cat_features=list(range(1, train_df.columns.shape[0])),
        feature_names=feature_names,
        output_path=os.path.join(dataset_dir, 'train.cd')
    )
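    # For reference, my understanding (an assumption, not verified output from
    # this script) is that create_cd writes tab-separated lines of the form
    #   <column index>\t<type>[\t<feature name>]
    # e.g. "0\tLabel", "1\tCateg\tRESOURCE", "2\tCateg\tMGR_ID", ...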


    X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.8, random_state=1234)


    learn_params = {'iterations': 20,  # 2000
                    # big learning_rate because of the small iteration count
                    'learning_rate': 0.5,
                    'random_seed': 0,
                    'verbose': False,
                    'loss_function': 'Logloss',
                    # 'Plain' is the classic boosting scheme; CatBoost's
                    # 'Ordered' scheme is designed to reduce prediction shift
                    'boosting_type': 'Plain'}

    evaluator = CatboostEvaluation(
        'amazon/train.tsv',
        fold_size=10000,  # <= 50% of dataset
        fold_count=20,
        column_description='amazon/train.cd',
        partition_random_seed=0,
        # working_dir=...
    )
    result = evaluator.eval_features(
        learn_config=learn_params,
        eval_metrics=['Logloss', 'Accuracy'],
        features_to_eval=[6, 7, 8]
    )


    logloss_result = result.get_metric_results('Logloss')
    logloss_result.get_baseline_comparison(
        ScoreConfig(ScoreType.Rel, overfit_iterations_info=False)
    )

    tuned_model = CatBoostClassifier(
        random_seed=63,
        iterations=1000,
        learning_rate=0.03,
        l2_leaf_reg=3,
        bagging_temperature=1,
        random_strength=1,
        one_hot_max_size=2,
        leaf_estimation_method='Newton'
    )
    tuned_model.fit(
        X_train, y_train,
        cat_features=cat_features,
        verbose=False,
        eval_set=(X_validation, y_validation),
        plot=True  # no-op outside Jupyter; prints an HTML placeholder in a console run
    )

    # ## Training the model after parameter tuning

    best_model = CatBoostClassifier(
        random_seed=63,
        iterations=int(tuned_model.tree_count_ * 1.2),
    )
    best_model.fit(
        X, y,
        cat_features=cat_features,
        verbose=100
    )


    # ## Calculate predictions for the contest

    X_test = test_df.drop('id', axis=1)
    test_pool = Pool(data=X_test, cat_features=cat_features)
    contest_predictions = best_model.predict_proba(test_pool)
    print('\nPredictions:')
    print(contest_predictions)


    # predicted class = argmax over the two class probabilities
    y_pred = [np.argmax(p) for p in contest_predictions]

    return len(X_train), y_pred, contest_predictions




if __name__ == '__main__':

    # Parse the CLI argument up front so that a bad argument fails before
    # the first (long) training run, not between the two runs.
    dropped_id = int(sys.argv[1]) if len(sys.argv) > 1 else 0

    print('\n{0:} ALL MATRICES {0:}\n'.format('='*53))
    train_len_all, y_pred_all, pred_scores_all = run_tutorial(None)

    print('\n{0:} ALL EXCEPT 1 MATRIX  {0:}\n'.format('='*49))
    train_len_dropped, y_pred_dropped, pred_scores_dropped = run_tutorial(dropped_id)

    ### CALCULATE DIFFERENCES
    print('\n\nDIFFERENCES (FIRST 50):\n')
    changed_pred_indices = []
    differing_indices = []
    above_10_pct_change = 0
    for i, proba in enumerate(pred_scores_all):
        pred_score_all, pred_score_dropped = np.max(proba), np.max(pred_scores_dropped[i])
        lbl_pred_all, lbl_pred_dropped = y_pred_all[i], y_pred_dropped[i]
        differing_pred = lbl_pred_all != lbl_pred_dropped

        if pred_score_all != pred_score_dropped or differing_pred:
            # Relative change of the winning-class score. If the predicted label
            # flipped, first map run 1's score onto the class predicted in run 2
            # (1 - pred_score_all) so that both scores refer to the same class.
            if differing_pred:
                pct_change = 100 - ((1 - pred_score_all) / pred_score_dropped) * 100
            else:
                pct_change = 100 - (pred_score_all / pred_score_dropped) * 100

            changed_pred_indices.append({'index': i, 'pct_change': pct_change})

            if abs(pct_change) > 10:
                above_10_pct_change += 1

            if i < 10 or (above_10_pct_change < 40 and abs(pct_change) > 10):
                print('Index: {:>4} | ({} matrices): {:.4f} | ({} matrices): {:.4f} | pct_change: {:6.2f}%{}'.format(
                    i, train_len_all, pred_score_all, train_len_dropped, pred_score_dropped,
                    pct_change, '   <== DIFFERENT LABEL' if differing_pred else ''))

        if differing_pred:
            differing_indices.append(i)

    avg = np.average([abs(c['pct_change']) for c in changed_pred_indices]) if len(changed_pred_indices) > 0 else 0
    print('\nTest size: {} | different labels: {} | changed prediction scores: {} | more than 10 pct change: {} | avg pct_change: {:.2f}%\n'.format(len(y_pred_all), len(differing_indices), len(changed_pred_indices), above_10_pct_change, avg))
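    ### OPTIONAL (my addition, not part of the original comparison): a simpler,
    ### scale-free metric is the absolute difference of the positive-class
    ### probabilities, which avoids the label-remapping in pct_change entirely.
    abs_diff = np.abs(pred_scores_all[:, 1] - pred_scores_dropped[:, 1])
    print('Max abs proba diff: {:.4f} | mean abs diff: {:.4f} | count > 0.1: {}'.format(
        abs_diff.max(), abs_diff.mean(), int((abs_diff > 0.1).sum())))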




#############################################################################################################
################################################## RESULT ###################################################
#############################################################################################################


# D:\example_class_tutorial> python.exe .\example_class_tutorial.py 32768                                                                                                                                          
# 0.21

# ===================================================== ALL MATRICES =====================================================

# [0, 1, 2, 3, 4, 5, 6, 7, 8]

# ALL MATRICES COUNT:
# Labels: {0, 1}
# Zero count = 1897, One count = 30872

# C:\Program Files\Python36\lib\site-packages\sklearn\model_selection\_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
#   FutureWarning)
# <IPython.core.display.HTML object>
# MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
# Learning rate set to 0.041438
# 0:      learn: 0.6437243        total: 21ms     remaining: 23.4s
# 100:    learn: 0.1529288        total: 3.53s    remaining: 35.4s
# 200:    learn: 0.1465218        total: 8.59s    remaining: 39s
# 300:    learn: 0.1427914        total: 13.3s    remaining: 36s
# 400:    learn: 0.1395288        total: 18.2s    remaining: 32.4s
# 500:    learn: 0.1367097        total: 23.2s    remaining: 28.3s
# 600:    learn: 0.1334455        total: 28.5s    remaining: 24.3s
# 700:    learn: 0.1306259        total: 33.4s    remaining: 19.6s
# 800:    learn: 0.1279926        total: 38.5s    remaining: 15s
# 900:    learn: 0.1252181        total: 43.6s    remaining: 10.3s
# 1000:   learn: 0.1229690        total: 48.3s    remaining: 5.41s
# 1100:   learn: 0.1216446        total: 52s      remaining: 567ms
# 1112:   learn: 0.1213643        total: 52.6s    remaining: 0us

# Predictions:
# [[0.4535 0.5465]
#  [0.0155 0.9845]
#  [0.012  0.988 ]
#  ...
#  [0.0051 0.9949]
#  [0.0517 0.9483]
#  [0.0127 0.9873]]

# ================================================= ALL EXCEPT 1 MATRIX  =================================================

# [0, 1, 2, 3, 4, 5, 6, 7, 8]

# ALL MATRICES COUNT:
# Labels: {0, 1}
# Zero count = 1897, One count = 30871

# <IPython.core.display.HTML object>
# MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
# Learning rate set to 0.039153
# 0:      learn: 0.6463627        total: 12.2ms   remaining: 14.4s
# 100:    learn: 0.1548777        total: 3.27s    remaining: 35s
# 200:    learn: 0.1465521        total: 8.01s    remaining: 39.1s
# 300:    learn: 0.1425796        total: 12.7s    remaining: 37.2s
# 400:    learn: 0.1399863        total: 17.3s    remaining: 33.9s
# 500:    learn: 0.1375170        total: 21.8s    remaining: 29.8s
# 600:    learn: 0.1347887        total: 26.4s    remaining: 25.7s
# 700:    learn: 0.1322360        total: 31.2s    remaining: 21.5s
# 800:    learn: 0.1296949        total: 35.9s    remaining: 17.2s
# 900:    learn: 0.1271796        total: 40.6s    remaining: 12.8s
# 1000:   learn: 0.1247210        total: 45.3s    remaining: 8.28s
# 1100:   learn: 0.1226686        total: 50.1s    remaining: 3.77s
# 1183:   learn: 0.1206776        total: 54s      remaining: 0us

# Predictions:
# [[0.3598 0.6402]
#  [0.0182 0.9818]
#  [0.0089 0.9911]
#  ...
#  [0.0053 0.9947]
#  [0.0433 0.9567]
#  [0.0095 0.9905]]


# DIFFERENCES (FIRST 50):

# Index:    0 | (26215 matrices): 0.5465 | (26214 matrices): 0.6402 | pct_change:  14.62%
# Index:    1 | (26215 matrices): 0.9845 | (26214 matrices): 0.9818 | pct_change:  -0.28%
# Index:    2 | (26215 matrices): 0.9880 | (26214 matrices): 0.9911 | pct_change:   0.31%
# Index:    3 | (26215 matrices): 0.9928 | (26214 matrices): 0.9910 | pct_change:  -0.19%
# Index:    4 | (26215 matrices): 0.9945 | (26214 matrices): 0.9955 | pct_change:   0.10%
# Index:    5 | (26215 matrices): 0.9912 | (26214 matrices): 0.9906 | pct_change:  -0.06%
# Index:    6 | (26215 matrices): 0.9926 | (26214 matrices): 0.9891 | pct_change:  -0.36%
# Index:    7 | (26215 matrices): 0.9951 | (26214 matrices): 0.9964 | pct_change:   0.12%
# Index:    8 | (26215 matrices): 0.8384 | (26214 matrices): 0.8653 | pct_change:   3.11%
# Index:    9 | (26215 matrices): 0.9890 | (26214 matrices): 0.9920 | pct_change:   0.30%
# Index:   16 | (26215 matrices): 0.8765 | (26214 matrices): 0.7050 | pct_change: -24.33%
# Index:   69 | (26215 matrices): 0.7027 | (26214 matrices): 0.5122 | pct_change: -37.19%
# Index:  101 | (26215 matrices): 0.6686 | (26214 matrices): 0.8172 | pct_change:  18.19%
# Index:  180 | (26215 matrices): 0.6942 | (26214 matrices): 0.5798 | pct_change: -19.73%
# Index:  209 | (26215 matrices): 0.7661 | (26214 matrices): 0.8756 | pct_change:  12.50%
# Index:  275 | (26215 matrices): 0.7243 | (26214 matrices): 0.5553 | pct_change: -30.44%
# Index:  296 | (26215 matrices): 0.5992 | (26214 matrices): 0.5145 | pct_change: -16.47%
# Index:  302 | (26215 matrices): 0.7977 | (26214 matrices): 0.7167 | pct_change: -11.31%
# Index:  353 | (26215 matrices): 0.7661 | (26214 matrices): 0.5768 | pct_change: -32.82%
# Index:  359 | (26215 matrices): 0.7295 | (26214 matrices): 0.6604 | pct_change: -10.46%
# Index:  363 | (26215 matrices): 0.9067 | (26214 matrices): 0.7345 | pct_change: -23.44%
# Index:  376 | (26215 matrices): 0.5907 | (26214 matrices): 0.5620 | pct_change:  27.17%   <== DIFFERENT LABEL
# Index:  380 | (26215 matrices): 0.9165 | (26214 matrices): 0.8028 | pct_change: -14.17%
# Index:  386 | (26215 matrices): 0.8241 | (26214 matrices): 0.7430 | pct_change: -10.92%
# Index:  437 | (26215 matrices): 0.8148 | (26214 matrices): 0.5204 | pct_change:  64.40%   <== DIFFERENT LABEL
# Index:  513 | (26215 matrices): 0.9262 | (26214 matrices): 0.8395 | pct_change: -10.33%
# Index:  523 | (26215 matrices): 0.9396 | (26214 matrices): 0.7359 | pct_change: -27.67%
# Index:  546 | (26215 matrices): 0.5608 | (26214 matrices): 0.5093 | pct_change:  13.77%   <== DIFFERENT LABEL
# Index:  570 | (26215 matrices): 0.7127 | (26214 matrices): 0.5820 | pct_change: -22.45%
# Index:  573 | (26215 matrices): 0.6991 | (26214 matrices): 0.8089 | pct_change:  13.57%
# Index:  615 | (26215 matrices): 0.5329 | (26214 matrices): 0.6129 | pct_change:  23.79%   <== DIFFERENT LABEL
# Index:  647 | (26215 matrices): 0.7050 | (26214 matrices): 0.8416 | pct_change:  16.23%
# Index:  665 | (26215 matrices): 0.5233 | (26214 matrices): 0.5452 | pct_change:  12.56%   <== DIFFERENT LABEL
# Index:  672 | (26215 matrices): 0.5144 | (26214 matrices): 0.6450 | pct_change:  20.25%
# Index:  680 | (26215 matrices): 0.5642 | (26214 matrices): 0.6718 | pct_change:  35.12%   <== DIFFERENT LABEL
# Index:  695 | (26215 matrices): 0.6807 | (26214 matrices): 0.7753 | pct_change:  12.20%
# Index:  784 | (26215 matrices): 0.5270 | (26214 matrices): 0.7096 | pct_change:  25.73%
# Index:  814 | (26215 matrices): 0.6502 | (26214 matrices): 0.7253 | pct_change:  10.35%
# Index:  849 | (26215 matrices): 0.6538 | (26214 matrices): 0.5683 | pct_change: -15.04%
# Index:  892 | (26215 matrices): 0.6595 | (26214 matrices): 0.7540 | pct_change:  12.54%
# Index:  951 | (26215 matrices): 0.5893 | (26214 matrices): 0.5217 | pct_change: -12.97%
# Index:  955 | (26215 matrices): 0.7544 | (26214 matrices): 0.5025 | pct_change:  51.13%   <== DIFFERENT LABEL
# Index:  979 | (26215 matrices): 0.7353 | (26214 matrices): 0.6160 | pct_change: -19.38%
# Index:  983 | (26215 matrices): 0.6603 | (26214 matrices): 0.7593 | pct_change:  13.04%
# Index: 1016 | (26215 matrices): 0.6067 | (26214 matrices): 0.5146 | pct_change:  23.58%   <== DIFFERENT LABEL
# Index: 1134 | (26215 matrices): 0.5097 | (26214 matrices): 0.6968 | pct_change:  26.84%
# Index: 1208 | (26215 matrices): 0.6113 | (26214 matrices): 0.7034 | pct_change:  13.08%
# Index: 1286 | (26215 matrices): 0.5651 | (26214 matrices): 0.5651 | pct_change:  23.05%   <== DIFFERENT LABEL

# Test size: 58921 | different labels: 529 | changed prediction scores: 58921 | more than 10 pct change: 2193 | avg pct_change: 1.45%
...