Алгоритм GBT с использованием H2O 3.8.2.6 в R и Rapidminer - PullRequest
0 голосов
/ 28 сентября 2018

Я попытался настроить параметры GBM из H2O в R, используя это руководство:

https://github.com/h2oai/h2o-3/blob/3.10.0.7/h2o-docs/src/product/tutorials/gbm/gbmTuning.Rmd

Затем я попытался применить настроенные гиперпараметры в RapidMiner для того же набора данных. В R я получил точность 97%, тогда как в RapidMiner с теми же параметрами я получаю только 91% точности. И R, и RapidMiner используют одну и ту же версию пакета H2O. Но откуда тогда эта разница в точности?

Мой процесс RapidMiner приведен ниже:

<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
  <!-- RapidMiner process: retrieves a data set from the repository, marks
       "Mode of Delivery" as the label, splits the examples 0.8 / 0.2, trains an
       H2O gradient boosted trees model on the first partition, applies it to
       the second partition and reports classification accuracy. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Loads the example set stored at ../Data/Mode_of_Labor_Data. -->
      <operator activated="true" class="retrieve" compatibility="9.0.002" expanded="true" height="68" name="Retrieve Mode_of_Labor_Data" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../Data/Mode_of_Labor_Data"/>
      </operator>
      <!-- Assigns the "label" role to the attribute "Mode of Delivery"; no
           additional roles are set. -->
      <operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
        <parameter key="attribute_name" value="Mode of Delivery"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Splits the examples into two partitions with ratios 0.8 and 0.2,
           using automatic sampling and a fixed local random seed (1234).
           NOTE(review): when comparing against results from another tool,
           confirm that the same rows end up in the train and test partitions. -->
      <operator activated="true" class="split_data" compatibility="9.0.002" expanded="true" height="103" name="Split Data" width="90" x="447" y="238">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.8"/>
          <parameter key="ratio" value="0.2"/>
        </enumeration>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1234"/>
      </operator>
      <!-- H2O gradient boosted trees learner (bernoulli distribution, up to
           10000 trees, depth 15, learning rate 0.05). Early stopping is
           enabled on AUC with 5 stopping rounds and tolerance 1.0E-4.
           NOTE(review): "reproducible" is false although a local random seed
           (1992) is set; confirm whether repeated runs give the same model in
           this mode.
           NOTE(review): only the "training set" port is connected in this
           process, so no separate validation data is wired in; confirm which
           data the early stopping metric is scored on, since that can change
           the number of trees actually built. -->
      <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="581" y="136">
        <parameter key="number_of_trees" value="10000"/>
        <parameter key="reproducible" value="false"/>
        <parameter key="maximum_number_of_threads" value="4"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="maximal_depth" value="15"/>
        <parameter key="min_rows" value="16.0"/>
        <parameter key="min_split_improvement" value="1.0E-4"/>
        <parameter key="number_of_bins" value="1024"/>
        <parameter key="learning_rate" value="0.05"/>
        <parameter key="sample_rate" value="1.0"/>
        <parameter key="distribution" value="bernoulli"/>
        <parameter key="early_stopping" value="true"/>
        <parameter key="stopping_rounds" value="5"/>
        <parameter key="stopping_metric" value="AUC"/>
        <parameter key="stopping_tolerance" value="1.0E-4"/>
        <parameter key="max_runtime_seconds" value="0"/>
        <list key="expert_parameters">
          <parameter key="nbins_cats" value="2048"/>
          <parameter key="learn_rate_annealing" value="0.99"/>
          <parameter key="col_sample_rate" value="0.76"/>
          <parameter key="col_sample_rate_per_tree" value="0.91"/>
          <parameter key="col_sample_rate_change_per_level" value="0.97"/>
        </list>
      </operator>
      <!-- Applies the trained model to the second partition produced by
           Split Data. -->
      <operator activated="true" class="apply_model" compatibility="9.0.002" expanded="true" height="82" name="Apply Model" width="90" x="715" y="340">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <!-- Evaluates the scored examples; accuracy is the main criterion and
           the only criterion switched on. -->
      <operator activated="true" class="performance_classification" compatibility="9.0.002" expanded="true" height="82" name="Performance" width="90" x="782" y="34">
        <parameter key="main_criterion" value="accuracy"/>
        <parameter key="accuracy" value="true"/>
        <parameter key="classification_error" value="false"/>
        <parameter key="kappa" value="false"/>
        <parameter key="weighted_mean_recall" value="false"/>
        <parameter key="weighted_mean_precision" value="false"/>
        <parameter key="spearman_rho" value="false"/>
        <parameter key="kendall_tau" value="false"/>
        <parameter key="absolute_error" value="false"/>
        <parameter key="relative_error" value="false"/>
        <parameter key="relative_error_lenient" value="false"/>
        <parameter key="relative_error_strict" value="false"/>
        <parameter key="normalized_absolute_error" value="false"/>
        <parameter key="root_mean_squared_error" value="false"/>
        <parameter key="root_relative_squared_error" value="false"/>
        <parameter key="squared_error" value="false"/>
        <parameter key="correlation" value="false"/>
        <parameter key="squared_correlation" value="false"/>
        <parameter key="cross-entropy" value="false"/>
        <parameter key="margin" value="false"/>
        <parameter key="soft_margin_loss" value="false"/>
        <parameter key="logistic_loss" value="false"/>
        <parameter key="skip_undefined_labels" value="true"/>
        <parameter key="use_example_weights" value="true"/>
        <list key="class_weights"/>
      </operator>
      <!-- Wiring: Retrieve feeds Set Role, which feeds Split Data. Partition 1
           goes to the learner's training set, partition 2 goes to Apply Model
           together with the trained model, the scored data goes to
           Performance, and the performance vector is the process result. -->
      <connect from_op="Retrieve Mode_of_Labor_Data" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Gradient Boosted Trees" to_port="training set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Gradient Boosted Trees" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
      <connect from_op="Performance" from_port="performance" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
...