Когда вы тренируете модель w2v
с помощью gensim, она сохраняет vocab
и index
каждого слова.
gensim
использует эту информацию для сопоставления слова с его вектором.
Есливы собираетесь точно настроить уже существующую w2v
модель, вам необходимо убедиться, что ваш словарь соответствует.
См. прикрепленный фрагмент кода.
import os
import pickle
import numpy as np
import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
import operator
os.mkdir("model_dir")
# class EpochSaver(CallbackAny2Vec):
# '''Callback to save model after each epoch.'''
# def __init__(self, path_prefix):
# self.path_prefix = path_prefix
# self.epoch = 0
# def on_epoch_end(self, model):
# list_of_existing_files = os.listdir(".")
# output_path = 'model_dir/{}_epoch{}.model'.format(self.path_prefix, self.epoch)
# try:
# model.save(output_path)
# except:
# model.wv.save_word2vec_format('model_dir/model_{}.bin'.format(self.epoch), binary=True)
# print("number of epochs completed = {}".format(self.epoch))
# self.epoch += 1
# list_of_total_files = os.listdir(".")
# saver = EpochSaver("my_finetuned")
# function to load vectors from existing model.
# I am loading glove vectors from a text file, benefit of doing this is that I get complete vocab of glove as well.
# If you are using a previous word2vec model I would recommed save that in txt format.
# In case you decide not to do it, you can tweak the function to get vectors for words in your vocab only.
def load_vectors(token2id, path, limit=None):
embed_shape = (len(token2id), 300)
freqs = np.zeros((len(token2id)), dtype='f')
vectors = np.zeros(embed_shape, dtype='f')
i = 0
with open(path, encoding="utf8", errors='ignore') as f:
for o in f:
token, *vector = o.split(' ')
token = str.lower(token)
if len(o) <= 100:
continue
if limit is not None and i > limit:
break
vectors[token2id[token]] = np.array(vector, 'f')
i += 1
return vectors
embedding_name = "glove.840B.300d.txt"
data = "<training data(new line separated tect file)>"
# Dictionary to store a unique id for each token in vocab( in my case vocab contains both my vocab and glove vocab)
token2id = {}
# This dictionary will contain all the words and their frequencies.
vocab_freq_dict = {}
# Populating vocab_freq_dict and token2id from my data.
id_ = 0
training_examples = []
file = open("{}".format(data),'r', encoding="utf-8")
for line in file.readlines():
words = line.strip().split(" ")
training_examples.append(words)
for word in words:
if word not in vocab_freq_dict:
vocab_freq_dict.update({word:0})
vocab_freq_dict[word] += 1
if word not in token2id:
token2id.update({word:id_})
id_ += 1
# Populating vocab_freq_dict and token2id from glove vocab.
max_id = max(token2id.items(), key=operator.itemgetter(1))[0]
max_token_id = token2id[max_id]
with open(embedding_name, encoding="utf8", errors='ignore') as f:
for o in f:
token, *vector = o.split(' ')
token = str.lower(token)
if len(o) <= 100:
continue
if token not in token2id:
max_token_id += 1
token2id.update({token:max_token_id})
vocab_freq_dict.update({token:1})
with open("vocab_freq_dict","wb") as vocab_file:
pickle.dump(vocab_freq_dict, vocab_file)
with open("token2id", "wb") as token2id_file:
pickle.dump(token2id, token2id_file)
# converting vectors to keyedvectors format for gensim
vectors = load_vectors(token2id, embedding_name)
vec = KeyedVectors(300)
vec.add(list(token2id.keys()), vectors, replace=True)
# setting vectors(numpy_array) to None to release memory
vectors = None
params = dict(min_count=1,workers=14,iter=6,size=300)
model = Word2Vec(**params)
# using build from vocab to build the vocab
model.build_vocab_from_freq(vocab_freq_dict)
# using token2id to create idxmap
idxmap = np.array([token2id[w] for w in model.wv.index2entity])
# Setting hidden weights(syn0 = between input layer and hidden layer) = your vectors arranged accoring to ids
model.wv.vectors[:] = vec.vectors[idxmap]
# Setting hidden weights(syn0 = between hidden layer and output layer) = your vectors arranged accoring to ids
model.trainables.syn1neg[:] = vec.vectors[idxmap]
model.train(training_examples, total_examples=len(training_examples), epochs=model.epochs)
output_path = 'model_dir/final_model.model'
model.save(output_path)
Прокомментируйте, если у вас есть какие-либо сомнения.