Word embeddings with BERT and mapping tensors back to words
1 vote
/ August 4, 2020

I am trying to aggregate BERT embeddings at the token level. For each token in the corpus vocabulary, I would like to collect a list of all of its contextual embeddings and average them, so that I end up with a single representation per vocabulary token.

The code is pasted below.

Question: how do I map the output tensors (see the `token_vecs_sum` object in the last line of the code below) back to specific tokens?
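
To make the goal concrete, here is a rough sketch of the structure I would like to end up with (the names, e.g. `contextual_vecs`, are just illustrative, and the filling step is exactly the part I am missing):

from collections import defaultdict
import torch

# token string -> list of its contextual embeddings ([768] tensors) across the corpus
contextual_vecs = defaultdict(list)
# ... fill this from the BERT outputs below (the step I am asking about) ...

# one averaged [768] vector per vocabulary token
avg_embeddings = {tok: torch.stack(vecs).mean(dim=0)
                  for tok, vecs in contextual_vecs.items() if vecs}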

Preprocess the data

!pip install transformers
import torch
from transformers import BertTokenizer
from nltk import tokenize
import nltk
nltk.download('punkt')
import re

MAX_LEN = 64

sentences = ['Some sentences. Some sentences are.', 'Some sentences are really.', 'Some sentences are really hard.']

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def preprocessing_for_bert(data):
  input_ids = []
  attention_masks = []
  for row in data:
    sents = tokenize.sent_tokenize(row)
    print(sents)
    for sent in sents:
      encoded_sent = tokenizer.encode_plus(text=sent,
                                          add_special_tokens=True,
                                          max_length=MAX_LEN,
                                          pad_to_max_length=True,
                                          return_attention_mask=True,
                                          truncation=True)
      input_ids.append(encoded_sent.get('input_ids'))
      attention_masks.append(encoded_sent.get('attention_mask'))
  # Convert lists to tensors
  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)

  return input_ids, attention_masks


tokens_tensor, segments_tensor = preprocessing_for_bert(sentences)
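
For reference, I can map each row of ids back to token strings with `tokenizer.convert_ids_to_tokens`, but I do not see how to line these strings up with the aggregated tensors further down:

# Token strings for each (padded) sentence, including [CLS], [SEP] and [PAD]
tokens_per_sentence = [tokenizer.convert_ids_to_tokens(ids) for ids in tokens_tensor.tolist()]
print(tokens_per_sentence[0][:8])
# roughly: ['[CLS]', 'some', 'sentences', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]']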

Load the pre-trained model

import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

# Put the model in "evaluation" mode (disables dropout and other training-only behaviour).
model.eval()
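
(Not essential for this toy example, but in case it matters: I sometimes move the model and the input tensors to a GPU first; everything below works the same either way.)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
tokens_tensor = tokens_tensor.to(device)
segments_tensor = segments_tensor.to(device)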

Run BERT

# Run the text through BERT and collect the hidden states produced by the
# embedding layer and all 12 encoder layers. Note that `segments_tensor`
# holds the attention masks and is passed as the second positional argument
# (`attention_mask`) of the model.
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensor)

    # The model returns a different number of objects depending on how it was
    # configured in the `from_pretrained` call earlier. Because we set
    # `output_hidden_states = True`, the third item is the hidden states from
    # all layers.
    hidden_states = outputs[2]
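
As a side note, my understanding is that more recent versions of transformers let you fetch the same tensors by name with `return_dict=True`; the tuple indexing above is what works in my installed version:

with torch.no_grad():
    outputs = model(tokens_tensor, attention_mask=segments_tensor, return_dict=True)
    hidden_states = outputs.hidden_states  # same tuple of 13 [4, 64, 768] tensors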

Inspect the output

print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")

layer_i = 0
print ("Number of batches:", len(hidden_states[layer_i]))

batch_i = 0
print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))

token_i = 0
print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 4
Number of tokens: 64
Number of hidden units: 768

Aggregation

# Stack the 13 layers into one tensor: [layers, batch, tokens, hidden] = [13, 4, 64, 768]
token_embeddings = torch.stack(hidden_states, dim=0)
# Average over the batch (sentence) dimension -> [13, 64, 768]
token_embeddings = token_embeddings.mean(1)
# Swap to [tokens, layers, hidden]
token_embeddings = token_embeddings.permute(1, 0, 2)
token_embeddings.size()
## -> torch.Size([64, 13, 768])
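
Just to be explicit about how I read the dimensions at this point (a sanity check, nothing new):

# Each entry of hidden_states is [4 sentences, 64 positions, 768 units];
# after averaging over the batch, each of the 64 rows of token_embeddings
# is a per-*position* vector (one [13, 768] slice per padded position).
for layer in hidden_states:
    assert layer.shape == (4, 64, 768)
assert token_embeddings.shape == (64, 13, 768)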

Prepare the token embedding matrix

token_vecs_sum = []
# For each of the 64 token positions (batch-averaged above)...
for token in token_embeddings:
  # `token` is a [13 x 768] tensor: one 768-dim vector per layer for this position.
  # Sum the vectors from the last four layers.
  sum_vec = torch.sum(token[-4:], dim=0)  
  # Use `sum_vec` to represent `token`.
  token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))
...