How do I run the run_language_modeling.py script from Hugging Face, using the pretrained roberta-base model, to fine-tune on my own data on Azure Databricks with a GPU cluster?
I am using Transformers versions 2.9.1 and 3.0, Python 3.6, torch 1.5.0, and torchvision 0.6.
This is the command I ran on Azure Databricks:
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm
This is the error I get after running the above command:
/dbfs/FileStore/tables/dev/run_language_modeling.py in <module>
279
280 if __name__ == "__main__":
--> 281 main()
/dbfs/FileStore/tables/dev/run_language_modeling.py in main()
243 else None
244 )
--> 245 trainer.train(model_path=model_path)
246 trainer.save_model()
247 # For convenience, we also re-save the tokenizer to the same directory,
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in train(self, model_path)
497 continue
498
--> 499 tr_loss += self._training_step(model, inputs, optimizer)
500
501 if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in _training_step(self, model, inputs, optimizer)
620 inputs["mems"] = self._past
621
--> 622 outputs = model(**inputs)
623 loss = outputs[0] # model outputs are always tuple in transformers (see doc)
624
/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
153 return self.module(*inputs[0], **kwargs[0])
154 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 155 outputs = self.parallel_apply(replicas, inputs, kwargs)
156 return self.gather(outputs, self.output_device)
157
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
163
164 def parallel_apply(self, replicas, inputs, kwargs):
--> 165 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
166
167 def gather(self, outputs, output_device):
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
83 output = results[i]
84 if isinstance(output, ExceptionWrapper):
---> 85 output.reraise()
86 outputs.append(output)
87 return outputs
/databricks/python/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
393 # (https://bugs.python.org/issue2651), so we work around it.
394 msg = KeyErrorMessage(msg)
--> 395 raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_roberta.py", line 239, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 762, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 439, in forward
output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 371, in forward
hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 315, in forward
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 240, in forward
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 11.17 GiB total capacity; 10.68 GiB already allocated; 95.31 MiB free; 10.77 GiB reserved in total by PyTorch)
How do I resolve this?
The out-of-memory error is likely caused by not cleaning up the session and/or not freeing up the GPU between runs.
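If an earlier run in the same notebook session still holds a model on the GPU, you can try releasing it before starting again. A minimal sketch, assuming the leftover objects are named something like `model` or `trainer` (placeholders for whatever is actually alive in your session):

```python
import gc
import torch

# Drop references from the previous run that still hold GPU tensors.
# "model" and "trainer" are placeholder names for whatever is alive in your session.
for name in ("model", "trainer"):
    if name in globals():
        del globals()[name]

gc.collect()               # let Python reclaim the dropped objects
torch.cuda.empty_cache()   # return cached, unused blocks to the GPU driver

print(f"{torch.cuda.memory_allocated() / 1024**2:.0f} MiB still allocated")
```

If that does not free enough memory, detaching and reattaching the notebook (restarting the Python process on the cluster) is the surest way to reclaim the GPU.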
From a similar GitHub issue:
It happens because the mini-batch of data does not fit into GPU memory. Just decrease the batch size. When I set batch size = 256 for the CIFAR-10 dataset I got the same error; then I set batch size = 128, and it was solved.
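Applied to your command, that means passing the batch-size flag explicitly, since the script otherwise uses the `TrainingArguments` default of 8 per GPU. A hedged sketch of the amended call; the exact values are starting points to tune, `--per_gpu_train_batch_size` is the flag in Transformers 2.9.x (3.0 also accepts `--per_device_train_batch_size`), and `--block_size` / `--gradient_accumulation_steps` are optional knobs to trade sequence length and step size for memory:

```
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--per_gpu_train_batch_size 2 \
--gradient_accumulation_steps 4 \
--block_size 128 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm
```

With gradient accumulation, the effective batch size stays at 8 (2 × 4) while only 2 sequences are resident on the GPU at any one time, so memory usage drops without changing the optimization schedule much.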