I was training the YOLACT model with a ResNet18 backbone (I had modified the YOLACT backbone configuration to use ResNet18, as sketched below). Training ran fine for about 3-4 hours, then suddenly aborted with the error below.
Using: Ubuntu 10.04, PyTorch 1.12.1+cu113, Python 3.9.12, GPUs: 2 x NVIDIA RTX A6000
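Roughly, the backbone change looked like this. This is an illustrative sketch, not my exact diff: the entry names resnet50_backbone and yolact_resnet50_config follow the conventions in YOLACT's data/config.py, the weights path is a placeholder, and note that YOLACT's ResNetBackbone builds Bottleneck blocks by default, so [2, 2, 2, 2] only approximates a true ResNet18:

    # In data/config.py -- sketch modeled on the existing resnet50_backbone entry.
    resnet18_backbone = resnet50_backbone.copy({
        'name': 'ResNet18',
        'path': 'resnet18_pretrained.pth',  # placeholder: whatever weights file you use
        'args': ([2, 2, 2, 2],),            # ResNet18's per-stage block counts
    })

    yolact_resnet18_config = yolact_resnet50_config.copy({
        'name': 'yolact_resnet18',
        'backbone': resnet18_backbone,
    })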
[ 2] 32930 || B: 3.996 | C: 4.955 | M: 4.408 | S: 1.036 | T: 14.395 || ETA: 1 day, 4:21:46 || timer: 0.127
[ 2] 32940 || B: 4.020 | C: 5.072 | M: 4.458 | S: 1.104 | T: 14.655 || ETA: 1 day, 4:23:08 || timer: 0.132
[ 2] 32950 || B: 4.063 | C: 5.246 | M: 4.552 | S: 1.199 | T: 15.060 || ETA: 1 day, 4:23:34 || timer: 0.151
[ 2] 32960 || B: 4.057 | C: 5.435 | M: 4.608 | S: 1.271 | T: 15.371 || ETA: 1 day, 4:23:31 || timer: 0.158
[ 2] 32970 || B: 4.071 | C: 5.574 | M: 4.675 | S: 1.306 | T: 15.626 || ETA: 1 day, 4:24:23 || timer: 0.166
[ 2] 32980 || B: 4.033 | C: 5.746 | M: 4.758 | S: 1.381 | T: 15.918 || ETA: 1 day, 4:26:38 || timer: 0.140
[ 2] 32990 || B: 4.031 | C: 5.817 | M: 4.741 | S: 1.411 | T: 15.999 || ETA: 1 day, 4:25:21 || timer: 0.139
[ 2] 33000 || B: 4.055 | C: 5.763 | M: 4.799 | S: 1.412 | T: 16.028 || ETA: 1 day, 4:25:51 || timer: 0.128
Traceback (most recent call last):
  File "/home/gangwa/miniconda3/lib/python3.9/multiprocessing/queues.py", line 245, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/home/gangwa/miniconda3/lib/python3.9/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/home/gangwa/miniconda3/lib/python3.9/site-packages/torch/multiprocessing/reductions.py", line 364, in reduce_storage
    shared_cache[cache_key] = StorageWeakRef(storage)
  File "/home/gangwa/miniconda3/lib/python3.9/site-packages/torch/multiprocessing/reductions.py", line 65, in __setitem__
    self.free_dead_references()
  File "/home/gangwa/miniconda3/lib/python3.9/site-packages/torch/multiprocessing/reductions.py", line 70, in free_dead_references
    if storage_ref.expired():
  File "/home/gangwa/miniconda3/lib/python3.9/site-packages/torch/multiprocessing/reductions.py", line 35, in expired
    return torch.Storage._expired(self.cdata)  # type: ignore[attr-defined]
  File "/home/gangwa/miniconda3/lib/python3.9/site-packages/torch/storage.py", line 757, in _expired
    return eval(cls.module)._UntypedStorage._expired(*args, **kwargs)
AttributeError: module 'torch.cuda' has no attribute '_UntypedStorage'
The same traceback was printed three more times, interleaved with the training log:
[ 2] 33010 || B: 4.178 | C: 5.768 | M: 4.934 | S: 1.417 | T: 16.296 || ETA: 1 day, 4:49:09 || timer: 0.126
Does anyone have any idea why I got this after around 3-4 hours of training?
I had a similar problem previously. I could not find the root cause, but a few workarounds fixed it for me. You can try the following:
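For example, a minimal sketch of the two workarounds. These are general torch.multiprocessing mitigations rather than anything YOLACT-specific, and the TensorDataset is just a stand-in for YOLACT's real data loading:

    import torch
    import torch.multiprocessing
    from torch.utils.data import DataLoader, TensorDataset

    # Workaround 1: share tensors through the filesystem instead of file
    # descriptors, which sidesteps the StorageWeakRef bookkeeping that the
    # traceback above dies in. Call once, before any DataLoader is created.
    torch.multiprocessing.set_sharing_strategy('file_system')

    # Workaround 2: set num_workers=0 so batches are never pickled across
    # process boundaries at all (slower, but it rules the bug out entirely).
    dataset = TensorDataset(torch.randn(100, 3, 550, 550))  # stand-in dataset
    loader = DataLoader(dataset, batch_size=8, num_workers=0)

If neither helps, upgrading PyTorch may: the _UntypedStorage internals were renamed in later releases (UntypedStorage in 1.13), so this particular attribute lookup goes away.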