I am trying to aggregate BERT embeddings at the token level. For each token in the corpus vocabulary I would like to collect all of its contextual embeddings and average them, so that I end up with a single representation per vocabulary token.
The code is included below.
Question: how can I map the output tensors (see the token_vecs_sum object in the last line of the code below) back to specific tokens?
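To make the goal concrete, here is a small illustrative sketch (assuming the same bert-base-uncased WordPiece tokenizer used below; demo_tokenizer is just a throwaway name) of what I mean by a "token in the vocabulary": the same vocabulary token can occur in many sentences, and each occurrence gets its own contextual embedding that I want to average.
from transformers import BertTokenizer
# Illustration only: show the WordPiece tokens for one sentence.
demo_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print(demo_tokenizer.tokenize('Some sentences are really hard.'))
# -> something like ['some', 'sentences', 'are', 'really', 'hard', '.']
# Every occurrence of e.g. 'sentences' across the corpus produces its own
# contextual vector; I want one averaged vector per such vocabulary token.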
Preprocessing the data
!pip install transformers
import torch
from transformers import BertTokenizer
from nltk import tokenize
import nltk
nltk.download('punkt')
import re
MAX_LEN = 64
sentences = ['Some sentences. Some sentences are.', 'Some sentences are really.', 'Some sentences are really hard.']
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
def preprocessing_for_bert(data):
    input_ids = []
    attention_masks = []
    for row in data:
        # Split each input string into sentences and encode them one by one.
        sents = tokenize.sent_tokenize(row)
        print(sents)
        for sent in sents:
            encoded_sent = tokenizer.encode_plus(text=sent,
                                                 add_special_tokens=True,
                                                 max_length=MAX_LEN,
                                                 padding='max_length',
                                                 return_attention_mask=True,
                                                 truncation=True)
            input_ids.append(encoded_sent.get('input_ids'))
            attention_masks.append(encoded_sent.get('attention_mask'))
    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    return input_ids, attention_masks

# Note: the second tensor returned here is the attention mask, not segment ids,
# despite the variable name used below.
tokens_tensor, segments_tensor = preprocessing_for_bert(sentences)
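As a sanity check (and because this is the mapping I need later), the WordPiece strings behind each row of input_ids can be recovered with tokenizer.convert_ids_to_tokens; a minimal sketch using the tensors returned above:
# Sanity check: recover the token strings for each encoded sequence.
for ids in tokens_tensor:
    print(tokenizer.convert_ids_to_tokens(ids.tolist()))
# Each row starts with [CLS], ends with [SEP], and is padded with [PAD] up to MAX_LEN.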
Load the pre-trained model
import torch
from transformers import BertTokenizer, BertModel
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
# Put the model in "evaluation" mode: dropout is disabled, so the forward pass is deterministic.
model.eval()
Run BERT
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers.
with torch.no_grad():
    # The second positional argument of BertModel.forward is attention_mask,
    # which is exactly what `segments_tensor` holds (see preprocessing above).
    outputs = model(tokens_tensor, segments_tensor)
# Evaluating the model will return a different number of objects based on
# how it's configured in the `from_pretrained` call earlier. In this case,
# because we set `output_hidden_states = True`, the third item will be the
# hidden states from all layers.
hidden_states = outputs[2]
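Side note: with current versions of transformers the model returns a ModelOutput object, so the same tuple of 13 tensors can also be accessed by name instead of by index:
# Equivalent to outputs[2] when output_hidden_states=True:
hidden_states = outputs.hidden_states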
Output of the model
print ("Number of layers:", len(hidden_states), " (initial embeddings + 12 BERT layers)")
layer_i = 0
print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0
print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0
print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
Number of layers: 13 (initial embeddings + 12 BERT layers)
Number of batches: 4
Number of tokens: 64
Number of hidden units: 768
Aggregation
# Stack the 13 layers into one tensor: [layers, batch, tokens, hidden] = [13, 4, 64, 768]
token_embeddings = torch.stack(hidden_states, dim=0)
# Average over the 4 sequences in the batch -> [13, 64, 768]
token_embeddings = token_embeddings.mean(1)
# Reorder to [tokens, layers, hidden]
token_embeddings = token_embeddings.permute(1, 0, 2)
token_embeddings.size()
## -> torch.Size([64, 13, 768])
Preparing the token embedding matrix
token_vecs_sum = []
# For each token position in the (batch-averaged) sequence...
for token in token_embeddings:
    # `token` is a [13 x 768] tensor
    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)
print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))
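To show what I am ultimately after, here is a minimal sketch of the mapping and aggregation I have in mind. It is not what my code above does: it skips the batch averaging, keeps the hidden states per sequence, uses tokenizer.convert_ids_to_tokens plus the attention mask to pair every position with its token string, and then averages all contextual vectors collected for the same vocabulary token. The names token_to_vecs / token_to_embedding are just illustrative, and I am not sure this is the idiomatic way to do it, hence the question.
from collections import defaultdict

# hidden_states[-4:] are the last four layers, each of shape [batch, MAX_LEN, 768].
# Sum them to get one contextual vector per position, per sequence.
last_four = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)   # [batch, MAX_LEN, 768]

token_to_vecs = defaultdict(list)
special = set(tokenizer.all_special_tokens)                     # [CLS], [SEP], [PAD], ...

# `segments_tensor` is really the attention mask (see preprocessing above),
# so it is used here to skip padded positions.
for seq_ids, seq_mask, seq_vecs in zip(tokens_tensor, segments_tensor, last_four):
    tokens = tokenizer.convert_ids_to_tokens(seq_ids.tolist())
    for token, mask, vec in zip(tokens, seq_mask.tolist(), seq_vecs):
        if mask == 0 or token in special:
            continue
        token_to_vecs[token].append(vec)

# One averaged embedding per vocabulary token seen in the corpus.
token_to_embedding = {t: torch.stack(vecs).mean(dim=0)
                      for t, vecs in token_to_vecs.items()}
print({t: emb.shape for t, emb in list(token_to_embedding.items())[:5]})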