У меня есть список токенов в качестве входных данных.Я использовал одну горячую кодировку для преобразования текстового списка в двоичную кодированную матрицу.Эта матрица затем подается в простую архитектуру автоэнкодера.Архитектура состоит из 2 полностью связанных слоев, за которыми следует первая часть этой ссылки .
Чтобы сравнить / понять результат этой архитектуры, нам нужно инвертировать одно горячее кодированиепреобразование.Этот шаг блокируется этой ошибкой:
ValueError: y содержит новые метки: [121]
def getTokens(xml_string):
firstTagMatches = re.findall('(\<\w+\>)', xml_string, re.DOTALL)
closedTagMatches = re.findall('(\<\/\w+\>)', xml_string, re.DOTALL)
betweenTagMatches = re.findall(r'>(.*?)<', xml_string)
xmlTokens = firstTagMatches + betweenTagMatches + closedTagMatches
return xmlTokens
def oneHotEncoding(data):
values = array(data)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
return onehot_encoded
def invertOneHotEncoding(data,decoded_imgs):
values = array(data)
print('values', values)
print('type', type(values))
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print('integer_encoded', integer_encoded)
print('decoded images 1', decoded_imgs)
# invert
#final_decoded = np.zeros(shape= decoded_imgs.shape)
#for (x,y), value in np.ndenumerate(decoded_imgs):
# if value > 0:
# final_decoded[x,y] = 1
#print('decoded images 2', final_decoded)
inverted = [label_encoder.inverse_transform([argmax(decoded_imgs[i, :])]) for i in range(len(decoded_imgs))]
#inverted = label_encoder.inverse_transform([argmax(final_decoded[0, :])])
return inverted
def getEncodedTrainingData(directoryPath):
#path = '/some/path/to/file'
encodedXML = np.zeros(shape=(0,166)) #because we know that the first file will give us a numpy array of shape=(166,166)
for filename in os.listdir(directoryPath):
print('filename', filename)
pretty_xml_as_string = ''
trainingTokens = []
xmlObject = xml.dom.minidom.parse(directoryPath+'/'+filename)
pretty_xml_as_string = xmlObject.toprettyxml()
trainingTokens = getTokens(pretty_xml_as_string)
transformedMatrice = oneHotEncoding(trainingTokens)
dimensions = transformedMatrice.shape[0] * transformedMatrice.shape[1]
onehotEncodedArray = np.resize(transformedMatrice, (int(dimensions/166),166)) #it may loose some information !! need a better solution
print('onehotEncodedArray shape', onehotEncodedArray.shape)
encodedXML = np.concatenate((encodedXML, onehotEncodedArray), axis=0)
print('before deleting shape', encodedXML.shape)
#encodedXML = np.delete(encodedXML,np.s_[0:166], axis=0) #remove the first initialized line
return encodedXML
encodedXML = getEncodedTrainingData('./trainingData/')
encodedXML_test = getEncodedTrainingData('./testingData/')
Часть машинного обучения:
#make the AutoEncoder Model
# this is the size of our encoded representations
encoding_dim = 32
# this is our input placeholder
input_vector = Input(shape=(166,))
# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu',
activity_regularizer=regularizers.l1(10e-5))(input_vector)
# "decoded" is the lossy reconstruction of the input
decoded = Dense(166, activation='relu')(encoded)
# this model maps an input to its reconstruction
autoencoder = Model(input_vector, decoded)
encoder = Model(input_vector, encoded)
encoded_input = Input(shape=(encoding_dim,))
# retrieve the last layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# create the decoder model
decoder = Model(encoded_input, decoder_layer(encoded_input))
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['binary_accuracy', 'categorical_accuracy'])
x_train = encodedXML
x_test = encodedXML_test
autoencoder.fit(x_train, x_train,
epochs=50,
batch_size=256,
shuffle=False,
validation_data=(x_test, x_test))
# encode and decode some digits
# note that we take them from the *test* set
encoded_imgs = encoder.predict(x_test)
decoded_imgs = decoder.predict(encoded_imgs)
invertedSentence = invertOneHotEncoding(testDataTokens,decoded_imgs)