I am trying to train Google's released VGGish model as an autoencoder. Here is the structure of the autoencoder:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as lays

# EMBEDDING_SIZE is defined elsewhere in my code.
# net is the input tensor holding the 96x64 single-channel patches,
# e.g. a placeholder of shape [None, 96, 64, 1].
print('Input {0}'.format(net))
with tf.variable_scope('Encoder'):
    with tf.variable_scope('Stage1'):
        net = lays.conv2d(net, 64, [3, 3], stride=2, padding='SAME')
        print('Stage1 {0}'.format(net))
    with tf.variable_scope('Stage2'):
        net = lays.conv2d(net, 128, [3, 3], stride=2, padding='SAME')
        print('Stage2 {0}'.format(net))
    with tf.variable_scope('Stage3'):
        net = lays.conv2d(net, 256, [3, 3], stride=1, padding='SAME')
        print('Stage3 {0}'.format(net))
        net = lays.conv2d(net, 256, [3, 3], stride=2, padding='SAME')
        print('Stage3 {0}'.format(net))
    with tf.variable_scope('Stage4'):
        net = lays.conv2d(net, 512, [3, 3], stride=1, padding='SAME')
        print('Stage4 {0}'.format(net))
        net = lays.conv2d(net, 512, [3, 3], stride=2, padding='SAME')
        print('Stage4 {0}'.format(net))
    with tf.variable_scope('Stage5'):
        net = slim.flatten(net)
        print('Stage5 {0}'.format(net))
        net = lays.fully_connected(net, 4096, scope='fc1_1')
        print('Stage5 {0}'.format(net))
        net = lays.fully_connected(net, 4096, scope='fc1_2')
        print('Stage5 {0}'.format(net))
with tf.variable_scope('EMBEDDING'):
    net = lays.fully_connected(net, EMBEDDING_SIZE, scope='fc2')
    print('EMBEDDING {0}'.format(net))
with tf.variable_scope('Decoder'):
    with tf.variable_scope('Stage5d'):
        net = lays.fully_connected(net, 4096, scope='fc1_2d')
        print('Stage5d {0}'.format(net))
        net = lays.fully_connected(net, 12288, scope='fc1_1d')
        print('Stage5d {0}'.format(net))
        net = tf.reshape(net, [-1, 6, 4, 512])
        print('Stage5d {0}'.format(net))
    with tf.variable_scope('Stage4d'):
        net = lays.conv2d_transpose(net, 512, [3, 3], stride=1, padding='SAME')
        print('Stage4d {0}'.format(net))
        net = lays.conv2d_transpose(net, 256, [3, 3], stride=2, padding='SAME')
        print('Stage4d {0}'.format(net))
    with tf.variable_scope('Stage3d'):
        net = lays.conv2d_transpose(net, 256, [3, 3], stride=1, padding='SAME')
        print('Stage3d {0}'.format(net))
        net = lays.conv2d_transpose(net, 128, [3, 3], stride=2, padding='SAME')
        print('Stage3d {0}'.format(net))
    with tf.variable_scope('Stage2d'):
        net = lays.conv2d_transpose(net, 64, [3, 3], stride=2, padding='SAME')
        print('Stage2d {0}'.format(net))
    with tf.variable_scope('Stage1d'):
        net = lays.conv2d_transpose(net, 1, [3, 3], stride=2, padding='SAME',
                                    activation_fn=tf.nn.tanh)
        print('Stage1d {0}'.format(net))
I am using an urban sound dataset and generate 96 × 64 MFCC patches as the input data.
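Roughly, each patch is computed along these lines (a simplified sketch; the file path, hop length, and normalization below are placeholders, not my exact preprocessing):

import numpy as np
import librosa

def mfcc_patch(wav_path, sr=16000, n_mfcc=64, n_frames=96, hop_length=160):
    # Load audio and compute 64 MFCC coefficients per frame.
    y, sr = librosa.load(wav_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
    # Keep the first 96 frames, zero-padding short clips.
    mfcc = mfcc[:, :n_frames]
    if mfcc.shape[1] < n_frames:
        mfcc = np.pad(mfcc, ((0, 0), (0, n_frames - mfcc.shape[1])), mode='constant')
    # Scale to [-1, 1] so it matches the tanh output of the decoder.
    mfcc = mfcc / (np.max(np.abs(mfcc)) + 1e-8)
    # Final shape (96, 64, 1): frames x coefficients x channel.
    return mfcc.T[:, :, np.newaxis].astype(np.float32)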
Here is how the cost evolves during training. It looks like it is not converging.
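For context, the objective is a plain reconstruction loss; a minimal sketch of such a setup (names like inputs, next_batch, BATCH_SIZE and the learning rate are placeholders, not my exact training code):

# Reconstruction loss: mean squared error between the input patch and the decoder output.
loss = tf.losses.mean_squared_error(labels=inputs, predictions=net)
train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(NUM_STEPS):
        batch = next_batch(BATCH_SIZE)  # array of shape (BATCH_SIZE, 96, 64, 1)
        _, cost = sess.run([train_op, loss], feed_dict={inputs: batch})
        if step % 100 == 0:
            print('step {0}: cost {1}'.format(step, cost))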
My question is: is it a good idea to first train the VGGish model as an autoencoder in order to generate the embedding bottleneck?