I am trying to embed documents with BERT. The code I am using is a combination of two sources: the BERT Document Classification Tutorial and the BERT Word Embeddings Tutorial. In the code below, I feed the first 510 tokens of each document into the BERT model. Finally, I apply K-means clustering to these embeddings (that step is sketched after the code below), but the members of each cluster are COMPLETELY irrelevant. I am wondering how that is possible; maybe something is wrong with my code. I would appreciate it if you could take a look at it and tell me what is wrong. I am running this code on Google Colab.
# text_to_embedding function
import torch
from keras.preprocessing.sequence import pad_sequences
def text_to_embedding(tokenizer, model, in_text):
    '''
    Uses the provided BERT 'model' and 'tokenizer' to generate a vector
    representation of the input string, 'in_text'.

    Returns the vector stored as a numpy ndarray.
    '''

    # ===========================
    #   STEP 1: Tokenization
    # ===========================

    MAX_LEN = 510

    # 'encode' will:
    #  (1) Tokenize the sentence
    #  (2) Prepend the '[CLS]' token to the start.
    #  (3) Append the '[SEP]' token to the end.
    #  (4) Map tokens to their IDs.
    input_ids = tokenizer.encode(
        in_text,                    # sentence to encode.
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        max_length = MAX_LEN,       # Truncate all sentences.
        #return_tensors = 'pt'      # Return pytorch tensors.
    )

    # Pad our input tokens. Truncation was handled above by the 'encode'
    # function, which also makes sure that the '[SEP]' token is placed at the
    # end *after* truncating.
    # Note: 'pad_sequences' expects a list of lists, but we only have one
    # piece of text, so we surround 'input_ids' with an extra set of brackets.
    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
                            value=0, truncating="post", padding="post")

    # Remove the outer list.
    input_ids = results[0]

    # Create attention masks.
    attn_mask = [int(i > 0) for i in input_ids]

    # Cast to tensors.
    input_ids = torch.tensor(input_ids)
    attn_mask = torch.tensor(attn_mask)

    # Add an extra dimension for the "batch" (even though there is only one
    # input in this batch).
    input_ids = input_ids.unsqueeze(0)
    attn_mask = attn_mask.unsqueeze(0)

    # ===========================
    #   STEP 2: Run the model
    # ===========================

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Copy the inputs to the GPU.
    input_ids = input_ids.to(device)
    attn_mask = attn_mask.to(device)

    # Telling the model not to build the backward graph will make this
    # a little quicker.
    with torch.no_grad():

        # Forward pass, returns hidden states and predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        outputs = model(
            input_ids = input_ids,
            token_type_ids = None,
            attention_mask = attn_mask)

        hidden_states = outputs[2]

        # Sentence Vectors
        # To get a single vector for our entire sentence we have multiple
        # application-dependent strategies, but a simple approach is to
        # average the second-to-last hidden layer of each token, producing
        # a single 768-length vector.
        # `hidden_states` has shape [13 x 1 x ? x 768]
        # `token_vecs` is a tensor with shape [? x 768]
        token_vecs = hidden_states[-2][0]

        # Calculate the average of all ? token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)

    # Move to the CPU and convert to numpy ndarray.
    sentence_embedding = sentence_embedding.detach().cpu().numpy()

    return sentence_embedding
from transformers import BertTokenizer, BertModel

# 'device' is used inside text_to_embedding to move the inputs onto the same
# device as the model (Colab with a GPU runtime; falls back to CPU otherwise).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained model (weights).
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states = True, # Whether the model returns all hidden-states.
)
model.to(device)

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
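For completeness, this is roughly how I then cluster the embeddings (a minimal sketch, assuming scikit-learn's KMeans; 'documents' and the number of clusters are placeholders for my actual data):

from sklearn.cluster import KMeans
import numpy as np

documents = ["first document text ...", "second document text ..."]  # placeholder corpus

# Embed each document with the function above (first 510 tokens only).
embeddings = np.vstack([text_to_embedding(tokenizer, model, doc)
                        for doc in documents])

# Cluster the 768-dimensional document vectors with K-means.
kmeans = KMeans(n_clusters=2, random_state=0).fit(embeddings)
print(kmeans.labels_)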