BERT model does not learn a new task

I am trying to fine-tune a pretrained BERT model on an Amazon review dataset. To do so, I extended the run_classifier file with the following processor:

class AmazonProcessor(DataProcessor):
  """Processor for the Amazon data set."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1", "2"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      # header
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[13])
      label = tokenization.convert_to_unicode(line[7])
      # collapse the original 5-star ratings into 3 classes
      if int(label) <= 2:
        label = "0"
      elif int(label) == 3:
        label = "1"
      else:
        label = "2"
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
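
A quick sanity check makes sure the processor really yields one example per TSV row and that all three classes occur. This is just a debugging sketch, not part of run_classifier; it assumes data_dir points at the folder holding the TSV files:

from collections import Counter

processor = AmazonProcessor()
examples = processor.get_train_examples(data_dir)
print(len(examples))                         # should match the number of TSV rows
print(Counter(ex.label for ex in examples))  # all of "0", "1", "2" should appear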

I am training in a Colab notebook on a GPU, so I also adapted the main method to my needs:

processors = {
  "cola": run_classifier.ColaProcessor,
  "mnli": run_classifier.MnliProcessor,
  "mrpc": run_classifier.MrpcProcessor,
  "xnli": run_classifier.XnliProcessor,
  "amazon": run_classifier.AmazonProcessor,
}

bert_config_file = os.path.join(BERT_FOLDER, "bert_config.json")
max_seq_length = 128
output_dir = "drive/My Drive/model"
task_name = "amazon"
vocab_file = os.path.join(BERT_FOLDER, "vocab.txt")
do_lower_case = False
master = None
tpu_cluster_resolver = None
save_checkpoints_steps = 1000
iterations_per_loop = 1000
use_tpu = False
data_dir  = "drive/My Drive/csv_dataset"
learning_rate = 5e-5
warmup_proportion = 0.1
train_batch_size = 16
eval_batch_size = 1
predict_batch_size = 1
num_train_epochs = 10.0
num_train_steps = 10000
num_tpu_cores = 8
#init_checkpoint = os.path.join(BERT_FOLDER, "bert_model.ckpt")
init_checkpoint = "drive/My Drive/model2/model.ckpt-41000"

do_train = True
do_eval = True

tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)


bert_config = modeling.BertConfig.from_json_file(bert_config_file)
print(bert_config)

task_name = task_name.lower()

processor = processors[task_name]()

label_list = processor.get_labels()

tokenizer = tokenization.FullTokenizer(
  vocab_file=vocab_file, do_lower_case=do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  cluster=tpu_cluster_resolver,
  master=master,
  model_dir=output_dir,
  save_checkpoints_steps=save_checkpoints_steps,
  tpu_config=tf.contrib.tpu.TPUConfig(
      iterations_per_loop=iterations_per_loop,
      num_shards=num_tpu_cores,
      per_host_input_for_training=is_per_host))

train_examples = None
num_train_steps = None
num_warmup_steps = None
if do_train:
  train_examples = processor.get_train_examples(data_dir)
  num_train_steps = int(
      len(train_examples) / train_batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)
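  # Illustrative arithmetic (hypothetical dataset size): with 16,000 training
  # examples this gives int(16000 / 16 * 10.0) = 10000 steps, of which the
  # first int(10000 * 0.1) = 1000 linearly warm the learning rate up to 5e-5.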

model_fn = run_classifier.model_fn_builder(
  bert_config=bert_config,
  num_labels=len(label_list),
  init_checkpoint=init_checkpoint,
  learning_rate=learning_rate,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=use_tpu,
  use_one_hot_embeddings=use_tpu)

estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=use_tpu,
  model_fn=model_fn,
  config=run_config,
  train_batch_size=train_batch_size,
  eval_batch_size=eval_batch_size,
  predict_batch_size=predict_batch_size)

if do_train:
  train_file = os.path.join(output_dir, "train.tf_record")
  run_classifier.file_based_convert_examples_to_features(
      train_examples, label_list, max_seq_length, tokenizer, train_file)
  tf.logging.info("***** Running training *****")
  tf.logging.info("  Num examples = %d", len(train_examples))
  tf.logging.info("  Batch size = %d", train_batch_size)
  tf.logging.info("  Num steps = %d", num_train_steps)
  train_input_fn = run_classifier.file_based_input_fn_builder(
      input_file=train_file,
      seq_length=max_seq_length,
      is_training=True,
      drop_remainder=True)
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

if do_eval:
  eval_examples = processor.get_test_examples(data_dir)
  num_actual_eval_examples = len(eval_examples)
  if use_tpu:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    while len(eval_examples) % eval_batch_size != 0:
      eval_examples.append(PaddingInputExample())

  eval_file = os.path.join(output_dir, "eval.tf_record")
  run_classifier.file_based_convert_examples_to_features(
      eval_examples, label_list, max_seq_length, tokenizer, eval_file)

  tf.logging.info("***** Running evaluation *****")
  tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                  len(eval_examples), num_actual_eval_examples,
                  len(eval_examples) - num_actual_eval_examples)
  tf.logging.info("  Batch size = %d", eval_batch_size)

  # This tells the estimator to run through the entire set.
  eval_steps = None
  # However, if running eval on the TPU, you will need to specify the
  # number of steps.
  if use_tpu:
    assert len(eval_examples) % eval_batch_size == 0
    eval_steps = int(len(eval_examples) // eval_batch_size)

  eval_drop_remainder = True if use_tpu else False
  eval_input_fn = run_classifier.file_based_input_fn_builder(
      input_file=eval_file,
      seq_length=max_seq_length,
      is_training=False,
      drop_remainder=eval_drop_remainder)

  result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

  output_eval_file = os.path.join(output_dir, "eval_results.txt")
  with tf.gfile.GFile(output_eval_file, "w") as writer:
    tf.logging.info("***** Eval results *****")
    for key in sorted(result.keys()):
      tf.logging.info("  %s = %s", key, str(result[key]))
      writer.write("%s = %s\n" % (key, str(result[key])))

I know this is a lot of code, but since I cannot pinpoint the error, I wanted to show all of it.

Note that most of the logging output looks perfectly reasonable:

For instance, here is a converted input example:

INFO:tensorflow:tokens: [CLS] Ich habe schon viele Klavier ##kon ##zer ##te gehört , aber was Frau Martha Ar ##geri ##ch hier spielt lässt einem ge ##wis ##ser ##ma ##ßen den At ##em stock ##en . So geni ##al habe ich diese 2 Klavier ##kon ##zer ##te von Ra ##ch ##mani ##no ##ff und T ##sch ##aik ##ov ##sky noch nie gehört . Sie ent ##fes ##selt einen regel ##rechte ##n Feuer ##stu ##rm an Vir ##tu ##osi ##tät . [SEP]
INFO:tensorflow:input_ids: 101 21023 21404 16363 18602 48021 17423 14210 10216 16706 117 11566 10134 16783 26904 18484 68462 10269 13329 28508 25758 10745 46503 83648 12754 10369 20284 10140 11699 10451 20511 10136 119 12882 107282 10415 21404 12979 12750 123 48021 17423 14210 10216 10166 38571 10269 31124 10343 13820 10130 157 12044 106333 11024 16116 11230 11058 16706 119 11583 61047 58058 26063 10897 46578 55663 10115 68686 19987 19341 10151 106433 10991 20316 24308 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: 2 (id = 2)
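
For reference, the feature arrays above can be reproduced outside of run_classifier. The following is a minimal sketch, assuming the tokenization module from the google-research/bert repository and a multilingual checkpoint; the vocab path is hypothetical:

import tokenization  # from the google-research/bert repository

tokenizer = tokenization.FullTokenizer(
    vocab_file="multi_cased_L-12_H-768_A-12/vocab.txt",  # hypothetical path
    do_lower_case=False)

max_seq_length = 128
text = "Ich habe schon viele Klavierkonzerte gehört, aber ..."

# [CLS] sentence [SEP], truncated to fit the sequence length
tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_seq_length - 2] + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)    # 1 marks real tokens
segment_ids = [0] * len(input_ids)   # all 0 for a single-sentence task

# zero-pad up to max_seq_length, exactly as in the log above
while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)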

Or the model being loaded from the checkpoint file:

INFO:tensorflow:  name = output_weights:0, shape = (3, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = output_bias:0, shape = (3,), *INIT_FROM_CKPT*

But in the end eval_accuracy always stays the same. With three classes, roughly 0.33 is exactly chance level, so the model does not seem to learn anything at all:

I0625 15:46:41.328946   eval_accuracy = 0.3338616
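
One way to narrow this down is to run the estimator in predict mode and check whether the network has collapsed onto a single class. A rough sketch (predict_file is a hypothetical name; the "probabilities" key is what run_classifier's model_fn returns in predict mode):

import numpy as np

predict_examples = processor.get_test_examples(data_dir)
predict_file = os.path.join(output_dir, "predict.tf_record")
run_classifier.file_based_convert_examples_to_features(
    predict_examples, label_list, max_seq_length, tokenizer, predict_file)

predict_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=predict_file,
    seq_length=max_seq_length,
    is_training=False,
    drop_remainder=False)

# count how often each class is predicted; one dominant bucket means
# the model is ignoring its input
preds = [np.argmax(p["probabilities"])
         for p in estimator.predict(input_fn=predict_input_fn)]
print(np.bincount(preds))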

The full repository can be found here: https://github.com/joroGER/bert/

And the gist with the notebook is here: https://colab.research.google.com/gist/joroGER/75c1c9c6383f0199bb54ce7b63d412d0/untitled4.ipynb

...