Я использую PyTorch (0.4.0) в Google Colaboratory (NVIDIA-SMI 396.44, версия драйвера: 396.44).
При запуске моего кода вне какой-либо функции я могу отправлять тензоры и модели PyTorch на GPU:
...
model.cuda()
data_tensor = data_tensor.cuda()
...
И моя модель CNN успешно обучена с точностью 98%.
Но когда я добавляю в функцию тот же код,
def main(...):
....
model.cuda()
data_tensor= data_tensor.cuda()
...
if __name__ == "__main__":
main('...)
я получаю следующую ошибку:
cuda runtime error (77) : an illegal memory access was encountered at /pytorch/aten/src/THC/generic/THCTensorCopy.c:20
ОБНОВЛЕНИЕ (18/11/21):
Оказалось, что не имеет значения, находится ли код внутри функции или нет. Обычно сначала возникает ошибка CUDNN_STATUS_EXECUTION_FAILED, а затем, при второй попытке, — ошибка cuda runtime error (77), как показано ниже. Но иногда код успешно отрабатывает несколько раз, прежде чем происходит сбой.
CUDNN_STATUS_EXECUTION_FAILED (первая попытка):
RuntimeError Traceback (most recent call last)
<ipython-input-27-53476e08e017> in <module>()
1 main('mnist', 'to', 'ndd', Xd=16, epo=5, bs=100, tXn=-1, vXn=300,
----> 2 lr=0.05, suf="s1", n_class=10, cuda=True)
<ipython-input-23-918584456207> in main(ds, framework, format, Xd, epo, bs, tXn, vXn, lr, suf, n_class, cuda)
12 opt = torch.optim.SGD(net.parameters(), lr)
13
---> 14 train(net, opt, Xd, epo, bs, cuda, tXn, tX, tT, vX, vT,lr)
15
<ipython-input-26-6b574a9e8af6> in train(model, optimizer, Xd, epo, bs, cuda, Xn, tX, tT, vX, vT, lr)
26 #t = t.cuda()
27 optimizer.zero_grad()
---> 28 z = model(x)
29 bat_loss = criterion(z, t)
30 bat_loss.backward()
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
489 result = self._slow_forward(*input, **kwargs)
490 else:
--> 491 result = self.forward(*input, **kwargs)
492 for hook in self._forward_hooks.values():
493 hook_result = hook(self, input, result)
<ipython-input-22-b4bc2e0b39b8> in forward(self, X)
10 H0 = torch.zeros(self.n_H, X.size(0), self.Wh)
11 C0 = torch.zeros(self.n_H, X.size(0), self.Wh)
---> 12 O, (Hn, Cn), = self.lstm1(X, (H0, C0))
13 O = self.linear1(O[:, -1, :])
14 return O
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
489 result = self._slow_forward(*input, **kwargs)
490 else:
--> 491 result = self.forward(*input, **kwargs)
492 for hook in self._forward_hooks.values():
493 hook_result = hook(self, input, result)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
190 flat_weight=flat_weight
191 )
--> 192 output, hidden = func(input, self.all_weights, hx, batch_sizes)
193 if is_packed:
194 output = PackedSequence(output, batch_sizes)
/usr/local/lib/python3.6/dist-packages/torch/nn/_functions/rnn.py in forward(input, *fargs, **fkwargs)
321 func = decorator(func)
322
--> 323 return func(input, *fargs, **fkwargs)
324
325 return forward
/usr/local/lib/python3.6/dist-packages/torch/nn/_functions/rnn.py in forward(input, weight, hx, batch_sizes)
285 batch_first, dropout, train, bool(bidirectional),
286 list(batch_sizes.data) if variable_length else (),
--> 287 dropout_ts)
288
289 if cx is not None:
RuntimeError: CUDNN_STATUS_EXECUTION_FAILED
cuda runtime error (77) (другие попытки):
RuntimeError Traceback (most recent call last)
<ipython-input-28-53476e08e017> in <module>()
1 main('mnist', 'to', 'ndd', Xd=16, epo=5, bs=100, tXn=-1, vXn=300,
----> 2 lr=0.05, suf="s1", n_class=10, cuda=True)
<ipython-input-23-918584456207> in main(ds, framework, format, Xd, epo, bs, tXn, vXn, lr, suf, n_class, cuda)
12 opt = torch.optim.SGD(net.parameters(), lr)
13
---> 14 train(net, opt, Xd, epo, bs, cuda, tXn, tX, tT, vX, vT,lr)
15
<ipython-input-26-6b574a9e8af6> in train(model, optimizer, Xd, epo, bs, cuda, Xn, tX, tT, vX, vT, lr)
4 if cuda and torch.cuda.is_available():
5 print("tX type (before):", tX.type())
----> 6 model.cuda()
7 tX = tX.cuda()
8 tT = tT.cuda()
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in cuda(self, device)
247 Module: self
248 """
--> 249 return self._apply(lambda t: t.cuda(device))
250
251 def cpu(self):
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
174 def _apply(self, fn):
175 for module in self.children():
--> 176 module._apply(fn)
177
178 for param in self._parameters.values():
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in _apply(self, fn)
109
110 def _apply(self, fn):
--> 111 ret = super(RNNBase, self)._apply(fn)
112 self.flatten_parameters()
113 return ret
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
180 # Tensors stored in modules are graph leaves, and we don't
181 # want to create copy nodes, so we have to unpack the data.
--> 182 param.data = fn(param.data)
183 if param._grad is not None:
184 param._grad.data = fn(param._grad.data)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in <lambda>(t)
247 Module: self
248 """
--> 249 return self._apply(lambda t: t.cuda(device))
250
251 def cpu(self):
RuntimeError: cuda runtime error (77) : an illegal memory access was encountered at /pytorch/aten/src/THC/generic/THCTensorCopy.c:20