I want to perform author classification on the Reuters 50 50 dataset, where the longest document is 1600+ tokens and there are 50 classes/authors.
With max_length=1700 and batch_size=1, I get RuntimeError: CUDA out of memory. The error can be avoided by setting max_length=512, but that has the unwanted side effect of truncating the texts.
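For context, a rough back-of-the-envelope sketch of the self-attention memory at that length. It assumes bert-base-uncased (12 layers, 12 heads) and fp32, counts only the attention score/probability tensors, and ignores all other activations, gradients and optimizer state, so it is a lower bound rather than an exact figure:

SEQ_LEN = 1700
HEADS = 12     # bert-base-uncased
LAYERS = 12    # bert-base-uncased
BYTES = 4      # fp32

# attention scores + softmax probabilities per layer: 2 * heads * seq_len * seq_len floats
per_layer = 2 * HEADS * SEQ_LEN * SEQ_LEN * BYTES
total = per_layer * LAYERS
print(f"~{per_layer / 2**20:.0f} MiB per layer, ~{total / 2**30:.1f} GiB across {LAYERS} layers")
# roughly 265 MiB per layer and ~3.1 GiB in total, before the remaining
# activations that are kept for the backward pass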
Tokenization and encoding:
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 1700
# tokenizer is a BertTokenizer created earlier, e.g. BertTokenizer.from_pretrained('bert-base-uncased')

def get_encodings(texts):
    # Convert each text to BERT token ids, truncated to MAX_LEN
    token_ids = []
    for text in texts:
        token_id = tokenizer.encode(text, add_special_tokens=True, max_length=MAX_LEN)
        token_ids.append(token_id)
    return token_ids

def pad_encodings(encodings):
    # Pad (or truncate) every sequence to exactly MAX_LEN token ids
    return pad_sequences(encodings, maxlen=MAX_LEN, dtype="long",
                         value=0, truncating="post", padding="post")

def get_attention_masks(padded_encodings):
    # 1 for real tokens, 0 for padding
    attention_masks = []
    for encoding in padded_encodings:
        attention_mask = [int(token_id > 0) for token_id in encoding]
        attention_masks.append(attention_mask)
    return attention_masks
train_encodings = get_encodings(train_df.text.values)
train_encodings = pad_encodings(train_encodings)
train_attention_masks = get_attention_masks(train_encodings)
test_encodings = get_encodings(test_df.text.values)
test_encodings = pad_encodings(test_encodings)
test_attention_masks = get_attention_masks(test_encodings)
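As a sanity check on the 1600+ tokens figure, a small sketch (reusing the get_encodings helper above, which already truncates at MAX_LEN) that looks at the length distribution before padding:

lengths = sorted(len(ids) for ids in get_encodings(train_df.text.values))
print("max:", lengths[-1], "median:", lengths[len(lengths) // 2])
print(sum(l > 512 for l in lengths), "of", len(lengths), "texts are longer than 512 tokens")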
Packing into a Dataset and DataLoader:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

X_train = torch.tensor(train_encodings)
y_train = torch.tensor(train_df.author_id.values)
train_masks = torch.tensor(train_attention_masks)

X_test = torch.tensor(test_encodings)
y_test = torch.tensor(test_df.author_id.values)
test_masks = torch.tensor(test_attention_masks)

batch_size = 1

# Create the DataLoader for our training set.
train_data = TensorDataset(X_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(X_test, test_masks, y_test)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
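A quick shape check on a single batch confirms what the model will receive; this is just a sketch run on the CPU, nothing here touches the GPU:

b_texts, b_masks, b_authors = next(iter(train_dataloader))
print(b_texts.shape, b_masks.shape, b_authors.shape)   # expect [1, 1700], [1, 1700], [1]
print(b_texts.dtype, b_masks.dtype, b_authors.dtype)   # all integer tensors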
Model setup:
from transformers import BertConfig, BertForSequenceClassification, AdamW

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

config = BertConfig.from_pretrained(
    'bert-base-uncased',
    num_labels=50,
    output_attentions=False,
    output_hidden_states=False,
    max_position_embeddings=MAX_LEN
)

model = BertForSequenceClassification(config)
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)
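For reference, a sketch that counts the model's parameters and estimates the static fp32 footprint of weights, gradients and the two AdamW moment buffers; activations come on top of this and dominate at sequence length 1700:

n_params = sum(p.numel() for p in model.parameters())
static_bytes = n_params * 4 * 4   # 4 bytes each for weights, grads, exp_avg, exp_avg_sq
print(f"{n_params / 1e6:.1f}M parameters, ~{static_bytes / 2**30:.2f} GiB before any activations")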
Training:
epochs = 4  # placeholder value

for epoch_i in range(0, epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_texts = batch[0].to(device)
        b_attention_masks = batch[1].to(device)
        b_authors = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_texts,
                        token_type_ids=None,
                        attention_mask=b_attention_masks,
                        labels=b_authors)  # <------- ERROR HERE
The error:
RuntimeError: CUDA out of memory. Tried to allocate 6.00 GiB (GPU 0; 7.93 GiB total capacity; 1.96 GiB already allocated; 5.43 GiB free; 536.50 KiB cached)
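To see where the memory actually goes, one option is a small logging helper (hypothetical, not part of the code above) called around the forward pass; torch.cuda.memory_allocated and torch.cuda.max_memory_allocated report tensor memory only, not the whole cached pool:

def log_cuda_memory(tag):
    # GiB of tensors currently allocated and the peak since the start of the program
    alloc = torch.cuda.memory_allocated() / 2**30
    peak = torch.cuda.max_memory_allocated() / 2**30
    print(f"{tag}: {alloc:.2f} GiB allocated, {peak:.2f} GiB peak")

# usage inside the training loop, e.g.:
#   log_cuda_memory("before forward")
#   outputs = model(...)
#   log_cuda_memory("after forward")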