Cannot install apex for distributed and fp16 training of a BERT model
I tried installing it by cloning apex from GitHub and then installing the package with pip.
I cloned Apex from GitHub with the following command:
git clone https://github.com/NVIDIA/apex.git
then ran cd apex to go into the apex directory and tried to install the package with the following pip command:
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
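Before running the build, it may help to confirm which CUDA toolkit version the installed PyTorch was compiled against, since the --cuda_ext build is known to fail when it does not match the system CUDA (a minimal check, nothing apex-specific assumed):

import torch
print(torch.__version__)          # installed PyTorch version
print(torch.version.cuda)         # CUDA version PyTorch was built against
print(torch.cuda.is_available())  # whether a GPU is visible at all

If the extension build keeps failing, the apex README of that era also documented a Python-only fallback, pip install -v --no-cache-dir ./, which skips the C++/CUDA kernels (so apex.optimizers.FusedAdam used in the code below would be unavailable).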
Full code:
import os
import random
import logging

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

# NerProcessor, Ner and convert_examples_to_features are defined elsewhere in this project
logger = logging.getLogger(__name__)

def main(server_ip, server_port, local_rank, no_cuda, fp16, train_batch_size,
         gradient_accumulation_steps, seed, do_train, do_eval, output_dir,
         task_name, data_dir, do_lower_case, bert_model, num_train_epochs,
         cache_dir, learning_rate, warmup_proportion, loss_scale, max_seq_length):
    if server_ip and server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(server_ip, server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}
    print(processors)

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(local_rank != -1), fp16))
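    # NOTE: local_rank != -1 implies the script was launched via
    # `python -m torch.distributed.launch`, which sets the MASTER_ADDR, MASTER_PORT,
    # RANK and WORLD_SIZE environment variables that init_process_group's default
    # env:// rendezvous reads.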
    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            gradient_accumulation_steps))
    train_batch_size = train_batch_size // gradient_accumulation_steps

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(output_dir) and os.listdir(output_dir) and do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    task_name = task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps) * num_train_epochs
        if local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
    # Prepare model
    cache_dir = cache_dir if cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(local_rank))
    model = Ner.from_pretrained(bert_model,
                                cache_dir=cache_dir,
                                num_labels=num_labels)
    if fp16:
        model.half()
    # model.cuda()
    model.to(device)
    if local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
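    # bias and LayerNorm parameters are conventionally excluded from weight decay
    # when fine-tuning BERT, hence the two parameter groups above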
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()
        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                if fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16:
                        # modify learning rate with special warm up BERT uses
                        # if fp16 is False, BertAdam is used, which handles this automatically
                        lr_this_step = learning_rate * warmup_linear(global_step / num_train_optimization_steps, warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
main('', '', -1, True, True, 8, 1, 42, True, True, 'JPT', 'ner', 'data/', True, 'bert-base-cased', 5, 'cache_dir', 5e-5, 0.4, 0, 128)
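As a quick sanity check after installation (a minimal sketch; it assumes an older apex revision that still ships FusedAdam and FP16_Optimizer, since newer apex versions replaced them with the amp API), the imports the script depends on can be tested on their own:

from apex.parallel import DistributedDataParallel  # works with any successful apex install
from apex.optimizers import FusedAdam              # requires the --cuda_ext build
from apex.optimizers import FP16_Optimizer         # old-style fp16 loss-scaling wrapper
print("apex imports OK")

If FusedAdam raises an ImportError while apex itself imports fine, the C++/CUDA extensions most likely did not build, and the pip command above needs to be rerun from inside the apex checkout.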