Я отправляю задание на обучение через REST API. Обучение запускается, но когда процесс доходит до этапа сохранения модели, он завершается с ошибкой: The replica master 0 exited with a non-zero status of 1.
Я проверил разрешения IAM для учётной записи службы; у неё есть следующие роли:
- Logs Writer
- ML Engine Admin
- Storage Admin
- Storage Object Admin
Вот полная трассировка стека (traceback) этой ошибки:
Traceback (most recent call last):
File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main "__main__", mod_spec)
File "/usr/lib/python3.5/runpy.py", line 85, in _run_code exec(code, run_globals)
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 223, in <module> dispatch(**parse_args.__dict__)
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 133, in dispatch callbacks=callbacks)
File "/root/.local/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 88, in wrapper return func(*args, **kwargs)
File "/root/.local/lib/python3.5/site-packages/keras/models.py", line 1110, in fit_generator initial_epoch=initial_epoch)
File "/root/.local/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 88, in wrapper return func(*args, **kwargs)
File "/root/.local/lib/python3.5/site-packages/keras/engine/training.py", line 1849, in fit_generator callbacks.on_epoch_begin(epoch)
File "/root/.local/lib/python3.5/site-packages/keras/callbacks.py", line 63, in on_epoch_begin callback.on_epoch_begin(epoch, logs)
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 74, in on_epoch_begin copy_file_to_gcs(self.job_dir, checkpoints[-1])
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 150, in copy_file_to_gcs output_f.write(input_f.read())
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/lib/io/file_io.py", line 126, in read pywrap_tensorflow.ReadFromStream(self._read_buf, length, status))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/lib/io/file_io.py", line 94, in _prepare_value return compat.as_str_any(val)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 106, in as_str_any return as_str(value)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 84, in as_text return bytes_or_text.decode(encoding)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
Я не совсем понимаю, почему это происходит. Код взят из примеров проектов в Git-репозитории Google; я ничего в нём не менял. Вот мой REST-запрос:
{
"jobId": "training_20",
"trainingInput": {
"scaleTier": "BASIC",
"packageUris": ["gs://MY_BUCKET/census.tar.gz"],
"pythonModule": "trainer.task",
"args": [
"--train-files",
"gs://MY_BUCKET/adult.data.csv",
"--eval-files",
"gs://MY_BUCKET/adult.test.csv",
"--job-dir",
"gs://MY_BUCKET/models",
"--train-steps",
"100",
"--eval-steps",
"10"],
"region": "europe-west1",
"jobDir": "gs://MY_BUCKET/models",
"runtimeVersion": "1.4",
"pythonVersion": "3.5"
}
}
Вот часть кода, отвечающая за сохранение модели:
# h5py cannot write to GCS, so for a gs:// job_dir the model is first saved
# to the local filesystem and then copied over; any other job_dir is written
# to directly.
targets_gcs = job_dir.startswith("gs://")
if not targets_gcs:
    census_model.save(os.path.join(job_dir, CENSUS_MODEL))
else:
    census_model.save(CENSUS_MODEL)
    copy_file_to_gcs(job_dir, CENSUS_MODEL)

# Export the Keras model as a TensorFlow SavedModel under <job_dir>/export.
export_path = os.path.join(job_dir, 'export')
model.to_savedmodel(census_model, export_path)
# h5py workaround: copy local models over to GCS if the job_dir is GCS.
def copy_file_to_gcs(job_dir, file_path):
    """Copy a local file to ``os.path.join(job_dir, file_path)``.

    The whole file is read through ``file_io.FileIO`` and written back out in
    a single call, so the destination may be any path ``file_io`` can open
    (the callers use it for ``gs://`` job directories).

    Args:
        job_dir: Destination directory; ``file_path`` is joined onto it.
        file_path: Path of the local file to copy. It is also used as the
            relative path of the copy inside ``job_dir``.
    """
    # Both ends are opened in binary mode. The files copied here are saved
    # models/checkpoints, which are binary (the first byte is 0x89). In text
    # mode ``file_io`` decodes what it reads as UTF-8, which on Python 3
    # raises ``UnicodeDecodeError: ... can't decode byte 0x89 in position 0``.
    with file_io.FileIO(file_path, mode='rb') as input_f:
        with file_io.FileIO(os.path.join(job_dir, file_path), mode='wb+') as output_f:
            output_f.write(input_f.read())