I want to build a similarity matrix based on the Brown corpus from the NLTK library. The problem is that the loss
tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
labels = y, num_sampled = num_sampled, num_classes = num_words))
decreases from 4.2 to 2.0 and then starts to go up and down. My question is: how can I improve the accuracy of my model?
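To clarify what I mean by "goes up and down": the value I print comes straight from the sampled softmax, so some noise from the negative sampling itself is expected. A way to separate that noise from real divergence would be to score a fixed batch against the full softmax; this is just a rough sketch of what I have in mind, not part of the code below (full_softmax_loss is a made-up helper, it reuses the variables defined further down):

def full_softmax_loss(X, y):
    # same lookup as in training(), but scored against all num_words classes
    embed = tf.nn.embedding_lookup(embeddings_weight, X)
    logits = tf.matmul(embed, tf.transpose(softmax_weight)) + softmax_bias
    labels = tf.reshape(y, [-1])  # y arrives with shape (batch, 1)
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))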
Here is my full code:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Layer
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy.random import choice
import random
from itertools import repeat
import tensorflow as tf
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import brown
import string
nltk.download('brown')
nltk.download('stopwords')
#Dataset loading and preparation:
dataset = brown.sents()
punct = list(string.punctuation)
punct.append("``")
punct.append("''")
punct.append("--")
stops = set(stopwords.words("english"))
dataset = [[word.lower() for word in sentence if word not in punct and word.lower() not in stops] for sentence in dataset]
#tokenization
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(dataset)
word2index = tokenizer.word_index
index_word = tokenizer.index_word
total_words = 5000
data_prep = tokenizer.texts_to_sequences(dataset)
data_prep = [sentence for sentence in data_prep if len(sentence) >2]
#word2vec
def word2vec_preparation(data, window_size, num_skips):
    grams = []
    context = []
    target = []
    assert window_size >= 1, 'window_size argument is < 1!'
    assert num_skips >= 1, 'num_skips argument is < 1!'
    for sentence in data:
        if len(sentence) - window_size > 1:
            #print(sentence)
            for i in range(len(sentence)):
                if i - window_size < 0:
                    # centre word near the start of the sentence: context comes only from the right
                    gram = sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(list(set(gram)), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(list(set(gram)), num_skips))
                        target.extend(repeat(sentence[i], num_skips))
                elif i + window_size > len(sentence) - 1:
                    # centre word near the end of the sentence: context comes only from the left
                    gram = sentence[i-window_size:i]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(list(set(gram)), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(list(set(gram)), num_skips))
                        target.extend(repeat(sentence[i], num_skips))
                else:
                    # full window on both sides of the centre word
                    gram = sentence[i-window_size:i] + sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(list(set(gram)), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(list(set(gram)), num_skips))
                        target.extend(repeat(sentence[i], num_skips))
        #print('----------------------')
    return grams, context, target
grams,context,target = word2vec_preparation(data_prep,window_size = 2,num_skips = 3)
target = np.array(target,dtype= np.int64)
context = np.array(context,dtype= np.int64)
context = context.reshape(len(context),1)
dataset_train = tf.data.Dataset.from_tensor_slices((target, context))
dataset_train = dataset_train.shuffle(buffer_size=1024).batch(64)
#Parameters:
num_words = 5000
embed_size = 300
num_sampled = 64
initializer_softmax = tf.keras.initializers.GlorotUniform()
#Variables:
embeddings_weight = tf.Variable(tf.random.uniform([num_words,embed_size],-1.0,1.0))
softmax_weight = tf.Variable(initializer_softmax([num_words,embed_size]))
softmax_bias = tf.Variable(initializer_softmax([num_words]))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
@tf.function
def training(X, y):
    with tf.GradientTape() as tape:
        # embeddings_weight is the trainable embedding table; X is a batch of word indices to look up
        embed = tf.nn.embedding_lookup(embeddings_weight, X)
        loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
                                                         labels = y, num_sampled = num_sampled, num_classes = num_words))
    variables = [embeddings_weight, softmax_weight, softmax_bias]
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss
    #tf.print('Loss:',loss)
EPOCHS = 100
for epoch in range(EPOCHS):
    for step, (X, y) in enumerate(dataset_train):
        loss = training(X, y)
    tf.print('Epoch:', epoch + 1, 'loss:', loss)
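Once training behaves, this is roughly how I plan to get the similarity matrix out of the trained embeddings (just a sketch, cosine similarity over embeddings_weight; the example word is arbitrary and assumes it survived the 5000-word cut):

# cosine similarity between all embedding vectors: (5000, 5000) matrix
norm_embeddings = tf.math.l2_normalize(embeddings_weight, axis=1)
similarity_matrix = tf.matmul(norm_embeddings, norm_embeddings, transpose_b=True)

# e.g. nearest neighbours of one word
word_id = word2index['government']
top = tf.argsort(similarity_matrix[word_id], direction='DESCENDING')[:10]
print([index_word.get(int(i), '?') for i in top.numpy()])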