Что я делаю
Выполнение в основном сводится к запуску этого графа TF (через sess.run(self.minimizer, feed_dict=...)):
#
# Create the dataset and iterator
#
train_data_shape = (
    (None, data.get_feature_amount()),
    (None, data.get_target_amount()),
)
# Training pipeline: generator -> repeat forever -> prefetch one generator
# batch -> unbatch it into single examples -> shuffle -> re-batch to the
# configured batch size (keeping a possibly smaller final batch).
train_data = (
    tf.data.Dataset.from_generator(
        data.generate_train_data,
        output_types=(config.get_dtype(), config.get_dtype()),
        output_shapes=train_data_shape,
    )
    .repeat()
    .prefetch(1)
    .apply(tf.data.experimental.unbatch())
    .shuffle(self.config.get_batch_size())
    .batch(self.config.get_batch_size(), drop_remainder=False)
)
# The test set arrives as one fixed-size batch, repeated forever.
test_data_shape = (
    (data.get_test_data_amount(), data.get_feature_amount()),
    (data.get_test_data_amount(), data.get_target_amount()),
)
test_data = tf.data.Dataset.from_generator(
    data.generate_test_data,
    output_types=(config.get_dtype(), config.get_dtype()),
    output_shapes=test_data_shape,
).repeat()
# Feedable-iterator pattern: a string-handle placeholder decides at run time
# whether get_next() pulls from the train or the test iterator.
self.train_iterator = train_data.make_initializable_iterator()
self.test_iterator = test_data.make_initializable_iterator()
self.iterator_handle = tf.placeholder(tf.string, shape=[])
data_iterator = tf.data.Iterator.from_string_handle(
    self.iterator_handle, train_data.output_types, train_data.output_shapes
)
#
# Create the structure of the DNN (hidden layers and stuff)
#
previous_layer_size = data.get_feature_amount()
target_amount = data.get_target_amount()
# One (features, targets) pair from whichever iterator the handle selects.
(self.features, self.targets) = data_iterator.get_next(name="layer_in")
# Activations are kept column-major: each column is one sample.
x = tf.cast(tf.transpose(self.features), config.get_dtype())
x_is_sparse = True  # sparsity hint is only valid for the raw input features
y = self.targets
self.layers = [(None, None, self.features)]
for layer_index, layer_size in enumerate(config.get_hidden_layer_structure()):
    weight = tf.Variable(
        tf.random_normal(
            (previous_layer_size, layer_size), 0, 0.05,
            dtype=config.get_dtype(), seed=config.get_seed(),
        ),
        name="weights_" + str(layer_index),
    )
    bias = tf.Variable(
        tf.random_normal(
            (layer_size, 1), 0, 0.05,
            dtype=config.get_dtype(), seed=config.get_seed(),
        ),
        name="bias_" + str(layer_index),
    )
    # W^T @ x (transpose_a=True); b_is_sparse hints that x is sparse, which
    # only holds for the first layer's raw feature input.
    weighted = tf.matmul(weight, x, True, b_is_sparse=x_is_sparse)
    x_is_sparse = False
    x = tf.nn.relu(tf.add(weighted, bias), "layer_" + str(layer_index))
    self.layers.append((weight, bias, x))
    previous_layer_size = layer_size
# Output layer: linear projection, no ReLU.
out_weight = tf.Variable(
    tf.random_normal(
        (previous_layer_size, target_amount), 0, 0.05,
        dtype=config.get_dtype(), seed=config.get_seed(),
    ),
    name="weights_end",
)
out_bias = tf.Variable(
    tf.random_normal(
        (target_amount, 1), 0, 0.05,
        dtype=config.get_dtype(), seed=config.get_seed(),
    ),
    name="bias_end",
)
logit = tf.add(tf.matmul(out_weight, x, True), out_bias, "logit")
# The first output row goes through a sigmoid (presumably a binary target —
# see the sigmoid_cross_entropy below); the rest stay linear. Transposed back
# so samples are rows again.
predictions = tf.transpose(
    tf.concat((tf.nn.sigmoid(logit[:1, :]), logit[1:, :]), axis=0),
    name="predictions",
)
self.layers.append((out_weight, out_bias, predictions))
#
# Create the evaluation of the DNN (loss/error and stuff)
#
# Element-wise residual; predictions and y appear to be (batch, targets) —
# NOTE(review): confirm shapes match, y comes straight from the iterator.
error = tf.subtract(predictions, y, "error")
# Per-target mean absolute error, averaged over the batch axis.
self.mae = tf.reduce_mean(tf.abs(error), axis=0, name="MAE")
#
# Create the optimization of the DNN (gradient and stuff)
#
# L2 weight decay over every weight matrix (biases are not regularized).
regularization = tf.constant(0, dtype=config.get_dtype())
for w, _, _ in self.layers:
    if w is not None:
        regularization += tf.nn.l2_loss(w)
regularization *= config.get_lambda()
# Loss = weighted sigmoid cross-entropy on the first (binary) target
#      + summed MAE of the remaining targets
#      + L2 regularization.
loss = tf.losses.sigmoid_cross_entropy(y[:, 0], logit[0, :]) * 500 + tf.reduce_sum(self.mae[1:]) + regularization
# grads_and_vars is a list of tuples (gradient, variable)
parameters = [w for w, _, _ in self.layers if w is not None] + [b for _, b, _ in self.layers if b is not None]
# BUGFIX: fetch the optimizer exactly once. The original called
# config.get_optimizer() twice (once for compute_gradients, once for
# apply_gradients); if that getter constructs a new optimizer per call, the
# gradients would be applied by a different optimizer instance (with its own
# slot variables) than the one that computed them.
optimizer = config.get_optimizer()
grads_and_vars = optimizer.compute_gradients(loss, var_list=parameters)
# Clip each gradient element-wise to [-5, 5] to dampen exploding gradients.
capped_grads_and_vars = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in grads_and_vars]
# apply the capped gradients.
self.minimizer = optimizer.apply_gradients(capped_grads_and_vars)
Что происходит
Иногда (примерно 1 раз на 200000 вызовов) sess.run
не возвращается — он застревает, как будто замороженный. Моя программа на Python, разумеется, при этом сама не завершается. И самое странное: внезапно потребляется огромное количество ОЗУ (например, 10 ГБ), которое я никак не могу освободить, кроме как перезагрузкой компьютера.
Взгляните на это, что смешно:
# ps --sort -%mem aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
me 13671 37.1 11.2 19321984 2764652 pts/2 Sl+ 13:02 186:10 python3 ./main.py -sp 32ki auto
me 7204 5.1 4.0 6869916 1005076 ? Sl 12:07 28:36 /snap/pycharm-professional/109/jre64/bin/java -classp
[...]
# free
total used free shared buff/cache available
Mem: 24636156 16067024 6846816 313488 1722316 7896620
Swap: 8368124 442624 7925500
# kill -15 13671
# ps --sort -%mem aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
me 7204 5.1 4.0 6869916 1005076 ? Sl 12:07 28:39 /snap/pycharm-professional/109/jre64/bin/java -classpath /snap/pycharm-professional/109/lib/bootstrap.jar:/snap/pycharm-professional/109/lib/extensions.jar:/snap/pycharm-pro
[...]
# free
total used free shared buff/cache available
Mem: 24636156 16034456 6849972 336212 1751728 7906252
Swap: 8368124 437760 7930364
/var/log/syslog
говорит об этом временном интервале:
Jan 9 15:05:38 MPC-6 kernel: [10906.994646] BUG: unable to handle kernel paging request at ffffdd4803208b60
Jan 9 15:05:38 MPC-6 kernel: [10906.994656] IP: kfree+0x53/0x180
Jan 9 15:05:38 MPC-6 kernel: [10906.994657] PGD 0 P4D 0
Jan 9 15:05:38 MPC-6 kernel: [10906.994661] Oops: 0000 [#1] SMP NOPTI
Jan 9 15:05:38 MPC-6 kernel: [10906.994663] Modules linked in: ipt_MASQUERADE nf_nat_masquerade_ipv4 nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype xt_conntrack nf_nat nf_conntrack libcrc32c br_netfilter bridge stp llc aufs ip6table_filter ip6_tables iptable_filter pci_stub vboxpci(OE) vboxnetadp(OE) vboxnetflt(OE) vboxdrv(OE) cmac rfcomm bnep binfmt_misc edac_mce_amd kvm_amd kvm irqbypass btusb btrtl crct10dif_pclmul snd_usb_audio btbcm crc32_pclmul btintel ghash_clmulni_intel snd_usbmidi_lib pcbc snd_seq_midi bluetooth snd_seq_midi_event snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec_hdmi joydev aesni_intel aes_x86_64 input_leds crypto_simd snd_hda_intel glue_helper cryptd ecdh_generic snd_rawmidi snd_hda_codec snd_hda_core snd_seq snd_hwdep snd_pcm
Jan 9 15:05:38 MPC-6 kernel: [10906.994697] snd_seq_device k10temp fam15h_power snd_timer shpchp snd soundcore mac_hid sch_fq_codel cuse lm92 nct6775 hwmon_vid parport_pc ppdev lp parport ip_tables x_tables autofs4 uas usb_storage amdgpu(OE) amdchash(OE) amdttm(OE) amd_sched(OE) hid_generic mxm_wmi amdkcl(OE) i2c_algo_bit amd_iommu_v2 drm_kms_helper pata_acpi syscopyarea sysfillrect r8169 sysimgblt fb_sys_fops mii usbhid nvme i2c_piix4 drm hid ahci pata_atiixp libahci nvme_core wmi
Jan 9 15:05:38 MPC-6 kernel: [10906.994721] CPU: 7 PID: 23629 Comm: kworker/u16:2 Tainted: G OE 4.15.0-43-generic #46-Ubuntu
Jan 9 15:05:38 MPC-6 kernel: [10906.994723] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./970A-G/3.1, BIOS P1.20 01/12/2016
Jan 9 15:05:38 MPC-6 kernel: [10906.994812] Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
Jan 9 15:05:38 MPC-6 kernel: [10906.994815] RIP: 0010:kfree+0x53/0x180
Jan 9 15:05:38 MPC-6 kernel: [10906.994816] RSP: 0018:ffffb837083c3a70 EFLAGS: 00010282
Jan 9 15:05:38 MPC-6 kernel: [10906.994818] RAX: ffff89dbe11420d0 RBX: ffffb8370822d000 RCX: 0000000000008600
Jan 9 15:05:38 MPC-6 kernel: [10906.994819] RDX: 00000000ffffffe4 RSI: ffff89dc56a78fd0 RDI: 0000762940000000
Jan 9 15:05:38 MPC-6 kernel: [10906.994821] RBP: ffffb837083c3a88 R08: 000000000000ec00 R09: ffff89d928a2ec70
Jan 9 15:05:38 MPC-6 kernel: [10906.994822] R10: ffffdd4803208b40 R11: 0000000000000d00 R12: ffffb8370822cf60
Jan 9 15:05:38 MPC-6 kernel: [10906.994823] R13: ffffffffc03cece4 R14: 0000000000000083 R15: 0000000000000200
Jan 9 15:05:38 MPC-6 kernel: [10906.994825] FS: 0000000000000000(0000) GS:ffff89dc7edc0000(0000) knlGS:0000000000000000
Jan 9 15:05:38 MPC-6 kernel: [10906.994826] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jan 9 15:05:38 MPC-6 kernel: [10906.994828] CR2: ffffdd4803208b60 CR3: 00000004ec276000 CR4: 00000000000406e0
Jan 9 15:05:38 MPC-6 kernel: [10906.994829] Call Trace:
Jan 9 15:05:38 MPC-6 kernel: [10906.994875] amdgpu_vram_mgr_new+0x314/0x370 [amdgpu]
Jan 9 15:05:38 MPC-6 kernel: [10906.994882] amdttm_bo_mem_space+0x2ea/0x470 [amdttm]
Jan 9 15:05:38 MPC-6 kernel: [10906.994887] amdttm_bo_validate+0xbf/0x140 [amdttm]
Jan 9 15:05:38 MPC-6 kernel: [10906.994931] ? amdgpu_vm_bo_update+0x3fc/0x7e0 [amdgpu]
Jan 9 15:05:38 MPC-6 kernel: [10906.994982] amdgpu_amdkfd_bo_validate+0x82/0x160 [amdgpu]
Jan 9 15:05:38 MPC-6 kernel: [10906.995034] amdgpu_amdkfd_gpuvm_restore_process_bos+0x273/0x4c0 [amdgpu]
Jan 9 15:05:38 MPC-6 kernel: [10906.995038] ? irq_work_queue+0x99/0xa0
Jan 9 15:05:38 MPC-6 kernel: [10906.995040] ? console_unlock+0x2e5/0x4e0
Jan 9 15:05:38 MPC-6 kernel: [10906.995090] restore_process_worker+0x52/0x1f0 [amdgpu]
Jan 9 15:05:38 MPC-6 kernel: [10906.995092] process_one_work+0x1de/0x410
Jan 9 15:05:38 MPC-6 kernel: [10906.995094] worker_thread+0x32/0x410
Jan 9 15:05:38 MPC-6 kernel: [10906.995097] kthread+0x121/0x140
Jan 9 15:05:38 MPC-6 kernel: [10906.995098] ? process_one_work+0x410/0x410
Jan 9 15:05:38 MPC-6 kernel: [10906.995100] ? kthread_create_worker_on_cpu+0x70/0x70
Jan 9 15:05:38 MPC-6 kernel: [10906.995103] ? do_syscall_64+0x73/0x130
Jan 9 15:05:38 MPC-6 kernel: [10906.995105] ? SyS_exit+0x17/0x20
Jan 9 15:05:38 MPC-6 kernel: [10906.995108] ret_from_fork+0x22/0x40
Jan 9 15:05:38 MPC-6 kernel: [10906.995109] Code: 00 80 49 01 da 0f 82 39 01 00 00 48 c7 c7 00 00 00 80 48 2b 3d 37 46 20 01 49 01 fa 49 c1 ea 0c 49 c1 e2 06 4c 03 15 15 46 20 01 8b 42 20 48 8d 50 ff a8 01 4c 0f 45 d2 49 8b 52 20 48 8d 42
Jan 9 15:05:38 MPC-6 kernel: [10906.995133] RIP: kfree+0x53/0x180 RSP: ffffb837083c3a70
Jan 9 15:05:38 MPC-6 kernel: [10906.995134] CR2: ffffdd4803208b60
Jan 9 15:05:38 MPC-6 kernel: [10906.995136] ---[ end trace 49a520b043b99039 ]---
Мои характеристики
- ОС - linux
- Графический процессор AMD Radeon RX Vega 64
- ЦП AMD FX-8320
- ОЗУ 2 раза по 4 ГБ и 2 раза по 8 ГБ 1333 DDR-3
- Материнская плата ASRock 970A-G 3.1
- Я получаю Tensorflow при установке через
pip3 install tensorflow-rocm
(версия 1.12.0)
Я не хочу сообщать об ошибке на GitHub, пока не буду на 100% уверен, что это ошибка TF, а не моя собственная.