Я попытался настроить гиперпараметры GBM из H2O в R, следуя этому руководству:
https://github.com/h2oai/h2o-3/blob/3.10.0.7/h2o-docs/src/product/tutorials/gbm/gbmTuning.Rmd
Затем я попытался применить настроенные гиперпараметры в RapidMiner для того же набора данных. В R я получил точность 97%, тогда как в RapidMiner с теми же параметрами я получаю только 91%. И R, и RapidMiner используют одну и ту же версию пакета H2O. Откуда тогда берётся эта разница в точности?
Мой процесс RapidMiner приведен ниже:
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.002">
  <!--
    RapidMiner process: retrieve the data set, mark "Mode of Delivery" as the
    label, split 80/20, train an H2O Gradient Boosted Trees model on the first
    partition, apply it to the second partition and report accuracy.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Load the example set from the repository. -->
      <operator activated="true" class="retrieve" compatibility="9.0.002" expanded="true" height="68" name="Retrieve Mode_of_Labor_Data" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../Data/Mode_of_Labor_Data"/>
      </operator>
      <!-- Declare the target attribute as the label. -->
      <operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
        <parameter key="attribute_name" value="Mode of Delivery"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!--
        80/20 split with a fixed local seed. NOTE(review): the rows selected here
        are not necessarily the rows an R-side split with the same seed would
        select; confirm both tools evaluate on the same hold-out rows before
        comparing accuracy figures.
      -->
      <operator activated="true" class="split_data" compatibility="9.0.002" expanded="true" height="103" name="Split Data" width="90" x="447" y="238">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.8"/>
          <parameter key="ratio" value="0.2"/>
        </enumeration>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1234"/>
      </operator>
      <!-- H2O GBM learner trained on partition 1 of the split. -->
      <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="581" y="136">
        <parameter key="number_of_trees" value="10000"/>
        <!--
          Changed from "false". Per the operator documentation, model building
          is only repeatable, and maximum_number_of_threads only honoured, when
          this flag is set; otherwise the seed and thread settings below have no
          reliable effect and accuracy can vary from run to run.
        -->
        <parameter key="reproducible" value="true"/>
        <parameter key="maximum_number_of_threads" value="4"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="maximal_depth" value="15"/>
        <parameter key="min_rows" value="16.0"/>
        <parameter key="min_split_improvement" value="1.0E-4"/>
        <parameter key="number_of_bins" value="1024"/>
        <parameter key="learning_rate" value="0.05"/>
        <parameter key="sample_rate" value="1.0"/>
        <parameter key="distribution" value="bernoulli"/>
        <!--
          Early stopping on AUC: 5 rounds, tolerance 1.0E-4.
          NOTE(review): verify which data the stopping metric is computed on
          here versus in the R run; a different stopping point yields a
          different number of trees and therefore a different model.
        -->
        <parameter key="early_stopping" value="true"/>
        <parameter key="stopping_rounds" value="5"/>
        <parameter key="stopping_metric" value="AUC"/>
        <parameter key="stopping_tolerance" value="1.0E-4"/>
        <parameter key="max_runtime_seconds" value="0"/>
        <!-- Additional H2O parameters passed through by name. -->
        <list key="expert_parameters">
          <parameter key="nbins_cats" value="2048"/>
          <parameter key="learn_rate_annealing" value="0.99"/>
          <parameter key="col_sample_rate" value="0.76"/>
          <parameter key="col_sample_rate_per_tree" value="0.91"/>
          <parameter key="col_sample_rate_change_per_level" value="0.97"/>
        </list>
      </operator>
      <!-- Score partition 2 of the split with the trained model. -->
      <operator activated="true" class="apply_model" compatibility="9.0.002" expanded="true" height="82" name="Apply Model" width="90" x="715" y="340">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <!-- Classification performance; only accuracy is enabled. -->
      <operator activated="true" class="performance_classification" compatibility="9.0.002" expanded="true" height="82" name="Performance" width="90" x="782" y="34">
        <parameter key="main_criterion" value="accuracy"/>
        <parameter key="accuracy" value="true"/>
        <parameter key="classification_error" value="false"/>
        <parameter key="kappa" value="false"/>
        <parameter key="weighted_mean_recall" value="false"/>
        <parameter key="weighted_mean_precision" value="false"/>
        <parameter key="spearman_rho" value="false"/>
        <parameter key="kendall_tau" value="false"/>
        <parameter key="absolute_error" value="false"/>
        <parameter key="relative_error" value="false"/>
        <parameter key="relative_error_lenient" value="false"/>
        <parameter key="relative_error_strict" value="false"/>
        <parameter key="normalized_absolute_error" value="false"/>
        <parameter key="root_mean_squared_error" value="false"/>
        <parameter key="root_relative_squared_error" value="false"/>
        <parameter key="squared_error" value="false"/>
        <parameter key="correlation" value="false"/>
        <parameter key="squared_correlation" value="false"/>
        <parameter key="cross-entropy" value="false"/>
        <parameter key="margin" value="false"/>
        <parameter key="soft_margin_loss" value="false"/>
        <parameter key="logistic_loss" value="false"/>
        <parameter key="skip_undefined_labels" value="true"/>
        <parameter key="use_example_weights" value="true"/>
        <list key="class_weights"/>
      </operator>
      <!-- Wiring: data flow between the operators above. -->
      <connect from_op="Retrieve Mode_of_Labor_Data" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Gradient Boosted Trees" to_port="training set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Gradient Boosted Trees" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
      <connect from_op="Performance" from_port="performance" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>