Для модели Keras (базовая CNN-LSTM) я использую генератор данных, который загружает разные файлы .npz, чтобы обрабатывать большой объём данных по частям, умещающимся в моей оперативной памяти. Через несколько эпох я получаю сообщение об ошибке, что генератор данных не инициализирован. Мне это кажется странным, поскольку первые несколько эпох прошли нормально. К тому же мне не удаётся найти подробную документацию по генераторам данных Keras.
Я попытался переустановить CUDA, перезапустить IDE и перезагрузить ПК.
Я использую Python 3.8.2, Keras 2.4.2, TF 2.2.0, CUDA 10.1, cuDNN 7.6.5.32 на моём GPU Quadro M1000M.
Генератор данных:
class My_Custom_Generator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves one pre-saved ``.npz`` file per batch.

    Every file in ``folder`` is treated as one batch and must contain the
    arrays ``'input'`` and ``'output'``. The input array is reshaped to
    ``(samples, n_seq, n_back // n_seq, n_features)`` using the module-level
    ``n_seq``, ``n_back`` and ``n_features`` values.
    """

    def __init__(self, folder):
        """Remember ``folder`` and take a sorted snapshot of its file names.

        Args:
            folder: Directory holding the batch files. A trailing path
                separator is accepted but not required.
        """
        super().__init__()
        self.folder = folder
        # os.listdir() returns entries in arbitrary order; sorting once and
        # caching keeps the idx -> file mapping stable for the lifetime of
        # the generator and avoids re-scanning the directory on every batch.
        self._files = sorted(os.listdir(folder))

    def __len__(self):
        """Return the number of batches, i.e. the number of files found."""
        return len(self._files)

    def __getitem__(self, idx):
        """Load batch number ``idx`` and return it as ``(input, output)``.

        Args:
            idx: Position of the batch file in the sorted file list.

        Returns:
            Tuple ``(input, output)`` of NumPy arrays; ``input`` is reshaped
            for the TimeDistributed CNN front-end, ``output`` is returned as
            stored.
        """
        path = os.path.join(self.folder, self._files[idx])
        # np.load() on an .npz returns a lazy NpzFile that keeps the file
        # open; the context manager closes it once both arrays are read.
        with np.load(path) as data:
            inputs = data['input']
            outputs = data['output']

        # Reshape (CNN): split the n_back time steps into n_seq sub-sequences.
        inputs = inputs.reshape(
            (inputs.shape[0], n_seq, int(n_back / n_seq), n_features))

        # NaN check: only warns, the batch is still returned unchanged.
        if np.isnan(inputs).any() or np.isnan(outputs).any():
            print('NAN IN THE DATA')
        return (inputs, outputs)
И модель keras:
# Data generators: one per folder of pre-batched files.
trainingGenerator = My_Custom_Generator('Data/Preprocessed/Batches/Training/')
validationGenerator = My_Custom_Generator('Data/Preprocessed/Batches/Validation/')

# CNN-LSTM model
t = time.time()
print('Model Training...')


def _dense_regularizers():
    """Return fresh regularizer kwargs shared by both Dense layers."""
    return dict(
        activity_regularizer=regularizers.l2(1e-5),
        bias_regularizer=regularizers.l2(1e-4),
        kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
    )


model = Sequential([
    # Per-sub-sequence convolutional feature extractor.
    TimeDistributed(
        convolutional.Conv1D(filters=64, kernel_size=1, activation='tanh'),
        input_shape=(None, int(n_back / n_seq), n_features),
    ),
    TimeDistributed(convolutional.MaxPooling1D(pool_size=2)),
    TimeDistributed(Flatten()),
    # Recurrent layer over the sequence of flattened CNN features.
    LSTM(
        100,
        activation='tanh',
        input_shape=(n_back, n_features),
        recurrent_regularizer=regularizers.l1_l2(l1=1e-4, l2=1e-4),
        activity_regularizer=regularizers.l2(1e-4),
        bias_regularizer=regularizers.l2(1e-4),
    ),
    # Regularized dense head producing n_fore outputs.
    Dense(100, **_dense_regularizers()),
    Dense(n_fore, **_dense_regularizers()),
])

# Adam with gradient-norm clipping; mean squared error as the loss.
optimizer = optimizers.Adam(clipnorm=1, learning_rate=1e-5)
model.compile(optimizer=optimizer, loss='mse')
print('Model Parameters: %i' % model.count_params())

model.fit(
    trainingGenerator,
    epochs=epochs,
    verbose=1,
    shuffle=True,
    validation_data=validationGenerator,
)
Затем, наконец, полный результат работы:
2020-08-05 18:38:41.348800: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
Using TensorFlow backend.
2020-08-05 18:38:44.777127: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library nvcuda.dll
2020-08-05 18:38:45.149915: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: Quadro M1000M computeCapability: 5.0
coreClock: 1.0715GHz coreCount: 4 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 74.65GiB/s
2020-08-05 18:38:45.150516: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
2020-08-05 18:38:45.156257: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_10.dll
2020-08-05 18:38:45.161938: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cufft64_10.dll
2020-08-05 18:38:45.163993: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library curand64_10.dll
2020-08-05 18:38:45.170353: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusolver64_10.dll
2020-08-05 18:38:45.173377: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusparse64_10.dll
2020-08-05 18:38:45.188075: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2020-08-05 18:38:45.188439: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1703] Adding visible gpu devices: 0
Model Training...
2020-08-05 18:38:45.201535: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2020-08-05 18:38:45.215336: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x26d21162670 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-08-05 18:38:45.215751: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2020-08-05 18:38:45.216221: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: Quadro M1000M computeCapability: 5.0
coreClock: 1.0715GHz coreCount: 4 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 74.65GiB/s
2020-08-05 18:38:45.216857: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
2020-08-05 18:38:45.217199: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_10.dll
2020-08-05 18:38:45.217876: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cufft64_10.dll
2020-08-05 18:38:45.218273: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library curand64_10.dll
2020-08-05 18:38:45.218659: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusolver64_10.dll
2020-08-05 18:38:45.219058: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cusparse64_10.dll
2020-08-05 18:38:45.219446: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2020-08-05 18:38:45.219935: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1703] Adding visible gpu devices: 0
2020-08-05 18:38:46.711630: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1102] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-08-05 18:38:46.712037: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1108] 0
2020-08-05 18:38:46.712296: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1121] 0: N
2020-08-05 18:38:46.712655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1247] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3034 MB memory) -> physical GPU (device: 0, name: Quadro M1000M, pci bus id: 0000:01:00.0, compute capability: 5.0)
2020-08-05 18:38:46.716637: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x26d3f46dec0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-08-05 18:38:46.716992: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Quadro M1000M, Compute Capability 5.0
Model Parameters: 130970
Epoch 1/100
2020-08-05 18:38:50.798930: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_10.dll
2020-08-05 18:38:51.522975: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2020-08-05 18:38:52.503580: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: Invoking GPU asm compilation is supported on Cuda non-Windows platforms only
Relying on driver to perform ptx compilation.
Modify $PATH to customize ptxas location.
This message will be only logged once.
81/81 [==============================] - 153s 2s/step - loss: 0.3625 - val_loss: 0.3157
Epoch 2/100
81/81 [==============================] - 176s 2s/step - loss: 0.3441 - val_loss: 0.3041
Epoch 3/100
81/81 [==============================] - 178s 2s/step - loss: 0.3280 - val_loss: 0.2929
Epoch 4/100
81/81 [==============================] - 179s 2s/step - loss: 0.3135 - val_loss: 0.2829
Epoch 5/100
81/81 [==============================] - 188s 2s/step - loss: 0.3004 - val_loss: 0.2745
Epoch 6/100
81/81 [==============================] - 189s 2s/step - loss: 0.2885 - val_loss: 0.2663
Epoch 7/100
81/81 [==============================] - 201s 2s/step - loss: 0.2774 - val_loss: 0.2588
Epoch 8/100
2020-08-05 19:00:19.435663: E tensorflow/stream_executor/cuda/cuda_driver.cc:939] could not synchronize on CUDA context: CUDA_ERROR_LAUNCH_FAILED: unspecified launch failure :: 0x00007FFC90958805 tensorflow::CurrentStackTrace
0x00007FFC90689E3E tensorflow::ConfigProto::HasBitSetters::graph_options
0x00007FFC9069056E stream_executor::StreamExecutor::EnablePeerAccessTo
0x00007FFC7D47B8C8 tensorflow::StepStats::internal_default_instance
0x00007FFC7D48C9F4 google::protobuf::RepeatedPtrField<tensorflow::InterconnectLink>::Add
0x00007FFC7D1C70A2 std::vector<tensorflow::DtypeAndPartialTensorShape,std::allocator<tensorflow::DtypeAndPartialTensorShape> >::operator=
0x00007FFC77FA9E71 tensorflow::RunOptions::output_partition_graphs
0x00007FFC77FB43A1 TFE_TensorHandleResolve
0x00007FFC77D1D583 TFE_Py_TensorShapeSlice
0x00007FFC77D1B30A std::vector<tensorflow::monitoring::Point::Label,std::allocator<tensorflow::monitoring::Point::Label> >::reserve
0x00007FFCF16E2FE6 PyNumber_InPlaceLshift
0x00007FFCF16FCCE0 Py_CheckFunctionResult
0x00007FFCF16FEA32 PyEval_EvalFrameDefault
0x00007FFCF16FCFF0 Py_CheckFunctionResult
0x00007FFCF16FEA32 PyEval_EvalFrameDefault
0x00007FFCF16FCFF0 Py_CheckFunctionResult
0x00007FFCF16FEA32 PyEval_EvalFrameDefault
0x00007FFCF16FA258 PyEval_EvalCodeWithName
0x00007FFCF16FB1BF PyFunction_Vectorcall
0x00007FFCF173B2F2 PyVectorcall_Call
0x00007FFCF173B183 PySequence_GetItem
0x00007FFCF16FFB35 PyEval_EvalFrameDefault
0x00007FFCF16FA258 PyEval_EvalCodeWithName
0x00007FFCF16FD2E5 Py_CheckFunctionResult
0x00007FFCF16FE89F PyEval_EvalFrameDefault
0x00007FFCF16FA258 PyEval_EvalCodeWithName
0x00007FFCF16FD2E5 Py_CheckFunctionResult
0x00007FFCF16FECBB PyEval_EvalFrameDefault
0x00007FFCF16FCFF0 Py_CheckFunctionResult
0x00007FFCF16FECBB PyEval_EvalFrameDefault
0x00007FFCF16FCFF0 Py_CheckFunctionResult
0x00007FFCF16FEA32 PyEval_EvalFrameDefault
0x00007FFCF16FA258 PyEval_EvalCodeWithName
0x00007FFCF16FD2E5 Py_CheckFunctionResult
0x00007FFCF16FEA32 PyEval_EvalFrameDefault
0x00007FFCF16FA258 PyEval_EvalCodeWithName
0x00007FFCF16FB1BF PyFunction_Vectorcall
0x00007FFCF173B2F2 PyVectorcall_Call
0x00007FFCF173B183 PySequence_GetItem
0x00007FFCF16FFB35 PyEval_EvalFrameDefault
0x00007FFCF16FA258 PyEval_EvalCodeWithName
0x00007FFCF16F9E05 PyObject_CallFunctionObjArgs
0x00007FFCF16FCCE0 Py_CheckFunctionResult
0x00007FFCF16FF0F2 PyEval_EvalFrameDefault
0x00007FFCF16FA258 PyEval_EvalCodeWithName
0x00007FFCF172242B PyEval_EvalCodeEx
0x00007FFCF1722389 PyEval_EvalCode
0x00007FFCF17220B6 PyArena_New
0x00007FFCF1722045 PyArena_New
0x00007FFCF17D4CDC PyRun_FileExFlags
0x00007FFCF17D4A7F PyRun_SimpleFileExFlags
0x00007FFCF1892FA3 PyRun_AnyFileExFlags
0x00007FFCF1826357 Py_FatalError
0x00007FFCF177F29C Py_RunMain
0x00007FFCF177F125 Py_RunMain
0x00007FFCF177F881 Py_Main
0x00007FF7C9F81258 (unknown)
0x00007FFD2AAB7BD4 BaseThreadInitThunk
0x00007FFD2C72CE51 RtlUserThreadStart
Traceback (most recent call last):
File "C:/Users/Niels/Documents/Python/UpWork/predictive-maintenance-poc/LSTM_db.py", line 81, in <module>
model.fit(trainingGenerator, epochs=epochs, verbose=1, shuffle=True, validation_data=validationGenerator)
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\training.py", line 855, in fit
callbacks.on_train_batch_end(step, logs)
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\callbacks.py", line 389, in on_train_batch_end
logs = self._process_logs(logs)
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\callbacks.py", line 265, in _process_logs
return tf_utils.to_numpy_or_python_type(logs)
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\utils\tf_utils.py", line 523, in to_numpy_or_python_type
return nest.map_structure(_to_single_numpy_or_python_type, tensors)
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py", line 617, in map_structure
structure[0], [func(*x) for x in entries],
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\util\nest.py", line 617, in <listcomp>
structure[0], [func(*x) for x in entries],
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\utils\tf_utils.py", line 519, in _to_single_numpy_or_python_type
x = t.numpy()
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\framework\ops.py", line 961, in numpy
maybe_arr = self._numpy() # pylint: disable=protected-access
File "C:\Users\Niels\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\framework\ops.py", line 929, in _numpy
six.raise_from(core._status_to_exception(e.code, e.message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InternalError: GPU sync failed
2020-08-05 19:00:19.610882: W tensorflow/core/kernels/data/generator_dataset_op.cc:103] Error occurred when finalizing GeneratorDataset iterator: Failed precondition: Python interpreter state is not initialized. The process may be terminated.
[[{{node PyFunc}}]]
Process finished with exit code 1