
I have a notebook on Google Colab that fails with the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     93         exception = e
---> 94         raise e
     95     finally: cb_handler.on_train_end(exception)

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     83                 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 84                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     85                 if cb_handler.on_batch_end(loss): break

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     24     if opt is not None:
---> 25         loss = cb_handler.on_backward_begin(loss)
     26         loss.backward()

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_backward_begin(self, loss)
    223         for cb in self.callbacks:
--> 224             a = cb.on_backward_begin(**self.state_dict)
    225             if a is not None: self.state_dict['last_loss'] = a

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in on_backward_begin(self, smooth_loss, **kwargs)
    266         if self.pbar is not None and hasattr(self.pbar,'child'):
--> 267             self.pbar.child.comment = f'{smooth_loss:.4f}'
    268 

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in __format__(self, format_spec)
    377         if self.dim() == 0:
--> 378             return self.item().__format__(format_spec)
    379         return object.__format__(self, format_spec)

RuntimeError: CUDA error: device-side assert triggered

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-33-dd390b1c8108> in <module>()
----> 1 lr_find(learn)
      2 learn.recorder.plot()

/usr/local/lib/python3.6/dist-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
     26     cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
     27     a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 28     learn.fit(a, start_lr, callbacks=[cb], **kwargs)
     29 
     30 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    160         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    161         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 162             callbacks=self.callbacks+callbacks)
    163 
    164     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     93         exception = e
     94         raise e
---> 95     finally: cb_handler.on_train_end(exception)
     96 
     97 loss_func_name2activ = {'cross_entropy_loss': partial(F.softmax, dim=1), 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_train_end(self, exception)
    254     def on_train_end(self, exception:Union[bool,Exception])->None:
    255         "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 256         self('train_end', exception=exception)
    257 
    258 class AverageMetric(Callback):

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
    185         "Call through to all of the `CallbakHandler` functions."
    186         if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187         return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
    188 
    189     def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in <listcomp>(.0)
    185         "Call through to all of the `CallbakHandler` functions."
    186         if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187         return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
    188 
    189     def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:

/usr/local/lib/python3.6/dist-packages/fastai/callbacks/lr_finder.py in on_train_end(self, **kwargs)
     45         # restore the valid_dl we turned of on `__init__`
     46         self.data.valid_dl = self.valid_dl
---> 47         self.learn.load('tmp')
     48         if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
     49         print('LR Finder complete, type {learner_name}.recorder.plot() to see the graph.')

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in load(self, name, device)
    202         "Load model `name` from `self.model_dir` using `device`, defaulting to `self.data.device`."
    203         if device is None: device = self.data.device
--> 204         self.model.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device))
    205         return self
    206 

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in load(f, map_location, pickle_module)
    356         f = open(f, 'rb')
    357     try:
--> 358         return _load(f, map_location, pickle_module)
    359     finally:
    360         if new_fd:

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _load(f, map_location, pickle_module)
    527     unpickler = pickle_module.Unpickler(f)
    528     unpickler.persistent_load = persistent_load
--> 529     result = unpickler.load()
    530 
    531     deserialized_storage_keys = pickle_module.load(f)

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in persistent_load(saved_id)
    493             if root_key not in deserialized_objects:
    494                 deserialized_objects[root_key] = restore_location(
--> 495                     data_type(size), location)
    496             storage = deserialized_objects[root_key]
    497             if view_metadata is not None:

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in restore_location(storage, location)
    376     elif isinstance(map_location, torch.device):
    377         def restore_location(storage, location):
--> 378             return default_restore_location(storage, str(map_location))
    379     else:
    380         def restore_location(storage, location):

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in default_restore_location(storage, location)
    102 def default_restore_location(storage, location):
    103     for _, _, fn in _package_registry:
--> 104         result = fn(storage, location)
    105         if result is not None:
    106             return result

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _cuda_deserialize(obj, location)
     84                                'to an existing device.'.format(
     85                                    device, torch.cuda.device_count()))
---> 86         return obj.cuda(device)
     87 
     88 

/usr/local/lib/python3.6/dist-packages/torch/_utils.py in _cuda(self, device, non_blocking, **kwargs)
     74         else:
     75             new_type = getattr(torch.cuda, self.__class__.__name__)
---> 76             return new_type(self.size()).copy_(self, non_blocking)
     77 
     78 

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20

There is no information about the real cause, so I tried to get a full stack trace by forcing CUDA to run synchronously (as suggested here) using a cell like this:

!export CUDA_LAUNCH_BLOCKING=1

But this does not seem to work; I still get the same error with no additional information.

Is there another way that works with Google Colab?

4 Answers


Be sure that your target values run from zero to the number of classes minus one. For example: if you have 100 classes, your targets should be in the range 0 to 99.
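
An out-of-range target is a common cause of this exact assert: losses such as cross_entropy index into the class dimension, and an index outside 0 .. num_classes - 1 only surfaces on the GPU as "device-side assert triggered". A minimal sketch (the label values below are made up for illustration) of remapping raw labels to contiguous zero-based indices:

import torch

# Hypothetical raw labels that do not start at 0
raw_targets = torch.tensor([1, 3, 5, 3, 1])

# Map each distinct label to an index in 0 .. num_classes - 1
classes = sorted(set(raw_targets.tolist()))
class_to_idx = {c: i for i, c in enumerate(classes)}
targets = torch.tensor([class_to_idx[c] for c in raw_targets.tolist()])

print(targets)                         # tensor([0, 1, 2, 1, 0])
print(targets.min(), targets.max())    # should span 0 .. num_classes - 1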



!export FOO=blah is usually not useful to run in a notebook, because ! means "run the following command in a sub-shell", so the effect of the statement is gone by the time the ! returns. You might have more success by storing your Python code in a file and then executing that file in a sub-shell:

In one cell:

%%writefile foo.py
[...your code...]

In the next cell:

!export CUDA_LAUNCH_BLOCKING=1; python3 foo.py

(or s/python3/python2/ if you're writing py2)



Switch the hardware accelerator type to "None" under Runtime -> Change runtime type. Running on the CPU should give you a more meaningful error message.
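
On the CPU the same kind of mistake raises an ordinary Python exception that names the offending value, instead of the opaque device-side assert. A minimal sketch, assuming the underlying cause is an out-of-range target:

import torch
import torch.nn.functional as F

# Hypothetical setup: 3 classes, but one target is 3 (valid range is 0..2)
logits = torch.randn(4, 3)
targets = torch.tensor([0, 1, 2, 3])

# On CPU this fails immediately with a readable error about the bad target;
# the same code on a GPU only reports "CUDA error: device-side assert triggered".
F.cross_entropy(logits, targets)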



The proper way to set environment variables in Google Colab is to use os:

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 

Using the os library lets you set whatever environment variables you need, and setting CUDA_LAUNCH_BLOCKING this way enables proper CUDA tracebacks in Google Colab.
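
One caveat (an assumption about when the CUDA runtime reads the variable, not something stated above): set it in the very first cell, before torch or fastai touch the GPU, otherwise the already-initialized CUDA context may ignore it.

import os

# Set this before anything initializes CUDA in this kernel (e.g. before
# importing fastai or moving tensors to the GPU), or it may have no effect.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch  # import only after the variable is set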

