I have a PyTorch training model and I receive the following error:
Traceback (most recent call last):
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/routes.py", line 442, in run_predict
output = await app.get_blocks().process_api(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/blocks.py", line 1392, in process_api
result = await self.call_function(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/blocks.py", line 1097, in call_function
prediction = await anyio.to_thread.run_sync(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/anyio/to_thread.py", line 33, in run_sync
return await get_asynclib().run_sync_in_worker_thread(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
return await future
File "/opt/pyai-3.8/lib64/python3.8/site-packages/anyio/_backends/_asyncio.py", line 807, in run
result = context.run(func, *args)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/utils.py", line 703, in wrapper
response = f(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/utils.py", line 703, in wrapper
response = f(*args, **kwargs)
File "app.py", line 277, in generate
return self.trainer.generate(
File "/home/ramin.mardani/simple-llm-finetuner/trainer.py", line 108, in generate
assert self.model is not None
AssertionError
/opt/pyai-3.8/lib64/python3.8/site-packages/peft/utils/other.py:119: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.
warnings.warn(
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
/opt/pyai-3.8/lib64/python3.8/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
{'train_runtime': 9.6476, 'train_samples_per_second': 18.658, 'train_steps_per_second': 0.311, 'train_loss': 1.3394749959309895, 'epoch': 2.0}
Traceback (most recent call last):
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/routes.py", line 442, in run_predict
output = await app.get_blocks().process_api(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/blocks.py", line 1392, in process_api
result = await self.call_function(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/blocks.py", line 1097, in call_function
prediction = await anyio.to_thread.run_sync(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/anyio/to_thread.py", line 33, in run_sync
return await get_asynclib().run_sync_in_worker_thread(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
return await future
File "/opt/pyai-3.8/lib64/python3.8/site-packages/anyio/_backends/_asyncio.py", line 807, in run
result = context.run(func, *args)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/utils.py", line 703, in wrapper
response = f(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/gradio/utils.py", line 703, in wrapper
response = f(*args, **kwargs)
File "app.py", line 277, in generate
return self.trainer.generate(
File "/home/ramin.mardani/simple-llm-finetuner/trainer.py", line 133, in generate
output = self.model.generate(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/peft/peft_model.py", line 971, in generate
outputs = self.base_model.generate(**kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/transformers/generation/utils.py", line 1642, in generate
return self.sample(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/transformers/generation/utils.py", line 2724, in sample
outputs = self(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1076, in forward
transformer_outputs = self.transformer(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 900, in forward
outputs = block(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 390, in forward
attn_outputs = self.attn(
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 312, in forward
query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/peft/tuners/lora.py", line 1078, in forward
self.lora_A[self.active_adapter](self.lora_dropout[self.active_adapter](x))
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/pyai-3.8/lib64/python3.8/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
RuntimeError: expected scalar type Float but found Half
I tried changing the training file, but it's not working. Here is my trainer.py file:
import os
import gc
import torch
import transformers
import peft
import datasets
from contextlib import nullcontext
from config import (
    HAS_CUDA,
    MODEL,
    DEVICE_MAP,
    TRAINING_PARAMS,
    LORA_TRAINING_PARAMS,
    GENERATION_PARAMS
)

class Trainer():
    def __init__(self):
        self.model = None
        self.model_name = None
        self.lora_name = None
        self.loras = {}
        self.tokenizer = None
        self.trainer = None
        self.should_abort = False

    def unload_model(self):
        del self.model
        del self.tokenizer

        self.model = None
        self.model_name = None
        self.tokenizer = None

        if (HAS_CUDA):
            with torch.no_grad():
                torch.cuda.empty_cache()

        gc.collect()

    def load_model(self, model_name, force=False, **kwargs):
        assert model_name is not None

        if (model_name == self.model_name and not force):
            return

        if (self.model is not None):
            self.unload_model()

        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=DEVICE_MAP,
            load_in_8bit=True,
            torch_dtype=torch.float16,
        )

        # Clear the collection that tracks which adapters are loaded, as they are associated with self.model
        self.loras = {}

        if model_name.startswith('decapoda-research/llama'):
            self.tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

        self.tokenizer.pad_token_id = 0
        self.model_name = model_name

    def load_lora(self, lora_name, replace_model=True):
        assert self.model is not None
        assert lora_name is not None

        if (lora_name == self.lora_name):
            return

        if lora_name in self.loras:
            self.lora_name = lora_name
            self.model.set_adapter(lora_name)
            return

        peft_config = peft.PeftConfig.from_pretrained(lora_name)
        if not replace_model:
            assert peft_config.base_model_name_or_path == self.model_name

        if peft_config.base_model_name_or_path != self.model_name:
            self.load_model(peft_config.base_model_name_or_path)

        assert self.model_name is not None
        assert self.model is not None

        if hasattr(self.model, 'load_adapter'):
            self.model.load_adapter(lora_name, adapter_name=lora_name)
        else:
            self.model = peft.PeftModel.from_pretrained(self.model, lora_name, adapter_name=lora_name)

        self.model.set_adapter(lora_name)
        if (self.model_name.startswith('cerebras')):
            self.model.half()

        self.lora_name = lora_name
        self.loras[lora_name] = True

    def unload_lora(self):
        self.lora_name = None

    def generate(self, prompt, **kwargs):
        assert self.model is not None
        assert self.model_name is not None
        assert self.tokenizer is not None

        kwargs = { **GENERATION_PARAMS, **kwargs }

        inputs = self.tokenizer(str(prompt), return_tensors="pt")
        input_ids = inputs["input_ids"].to(self.model.device)

        if self.model.config.pad_token_id is None:
            kwargs['pad_token_id'] = self.model.config.eos_token_id

        if (kwargs['do_sample']):
            del kwargs['num_beams']

        generation_config = transformers.GenerationConfig(
            use_cache=False,
            **kwargs
        )

        disable_lora = nullcontext()
        if self.lora_name is None and hasattr(self.model, 'disable_adapter'):
            disable_lora = self.model.disable_adapter()

        with torch.no_grad(), disable_lora:
            output = self.model.generate(
                input_ids=input_ids,
                attention_mask=torch.ones_like(input_ids),
                generation_config=generation_config
            )[0].to(self.model.device)

        return self.tokenizer.decode(output, skip_special_tokens=True).strip()

    def tokenize_sample(self, item, max_seq_length, add_eos_token=True):
        assert self.tokenizer is not None

        result = self.tokenizer(
            item["text"],
            truncation=True,
            max_length=max_seq_length,
            padding="max_length",
        )

        result = {
            "input_ids": result["input_ids"][:-1],
            "attention_mask": result["attention_mask"][:-1],
        }

        if (
            result["input_ids"][-1] != self.tokenizer.eos_token_id
            and len(result["input_ids"]) < max_seq_length
            and add_eos_token
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        return result

    def tokenize_training_text(self, training_text, max_seq_length, separator="\n\n\n", **kwargs):
        samples = training_text.split(separator)
        samples = [x.strip() for x in samples]

        def to_dict(text):
            return { 'text': text }

        samples = [to_dict(x) for x in samples]
        training_dataset = datasets.Dataset.from_list(samples)
        training_dataset = training_dataset.shuffle().map(
            lambda x: self.tokenize_sample(x, max_seq_length),
            batched=False
        )

        return training_dataset

    def train(self, training_text=None, new_peft_model_name=None, **kwargs):
        assert self.should_abort is False
        assert self.model is not None
        assert self.model_name is not None
        assert self.tokenizer is not None

        kwargs = { **TRAINING_PARAMS, **LORA_TRAINING_PARAMS, **kwargs }

        self.lora_name = None
        self.loras = {}

        train_dataset = self.tokenize_training_text(training_text, **kwargs)

        if hasattr(self.model, 'disable_adapter'):
            self.load_model(self.model_name, force=True)

        self.model = peft.prepare_model_for_int8_training(self.model)
        self.model = peft.get_peft_model(self.model, peft.LoraConfig(
            r=kwargs['lora_r'],
            lora_alpha=kwargs['lora_alpha'],
            lora_dropout=kwargs['lora_dropout'],
            bias="none",
            task_type="CAUSAL_LM",
        ))

        if not os.path.exists('lora'):
            os.makedirs('lora')

        sanitized_model_name = self.model_name.replace('/', '_').replace('.', '_')
        output_dir = f"lora/{sanitized_model_name}_{new_peft_model_name}"

        training_args = transformers.TrainingArguments(
            per_device_train_batch_size=kwargs['micro_batch_size'],
            gradient_accumulation_steps=kwargs['gradient_accumulation_steps'],
            num_train_epochs=kwargs['epochs'],
            learning_rate=kwargs['learning_rate'],
            fp16=True,
            optim='adamw_torch',
            logging_steps=20,
            save_total_limit=3,
            output_dir=output_dir,
        )

        # _trainer = self
        # class LoggingCallback(transformers.TrainerCallback):
        #     def on_log(self, args, state, control, logs=None, **kwargs):
        #         _trainer.log += json.dumps(logs) + '\n'

        def should_abort():
            return self.should_abort

        def reset_abort():
            self.should_abort = False

        class AbortCallback(transformers.TrainerCallback):
            def on_step_end(self, args, state, control, **kwargs):
                if should_abort():
                    print("Stopping training...")
                    control.should_training_stop = True

            def on_train_end(self, args, state, control, **kwargs):
                if should_abort():
                    control.should_save = False

        # class CustomTrainer(transformers.Trainer):
        #     def __init__(self, *args, **kwargs):
        #         super().__init__(*args, **kwargs)
        #         self.abort_training = False
        #     def stop_training(self):
        #         print("Stopping training...")
        #         self.abort_training = True
        #     def training_step(self, model, inputs):
        #         if self.abort_training:
        #             raise RuntimeError("Training aborted.")
        #         return super().training_step(model, inputs)

        self.trainer = transformers.Trainer(
            model=self.model,
            train_dataset=train_dataset,
            args=training_args,
            data_collator=transformers.DataCollatorForLanguageModeling(
                self.tokenizer,
                mlm=False,
            ),
            callbacks=[AbortCallback()]
        )

        self.model.config.use_cache = False
        result = self.trainer.train(resume_from_checkpoint=False)

        if not should_abort():
            self.model.save_pretrained(output_dir)

        reset_abort()
        return result

    def abort_training(self):
        self.should_abort = True

if __name__ == '__main__':
    t = Trainer()
    t.load_model(MODEL)

    prompt = "Human: How is cheese made?\n\nAssistant:"
    print(t.generate(prompt))

    t.load_lora('lora/melon-mango-orange')
    print(t.generate(prompt))

    t.unload_lora()
    print(t.generate(prompt))
Here is the repo that I got this file from: https://github.com/lxe/simple-llm-finetuner/blob/master/trainer.py
It seems like others have had this issue before: https://github.com/lxe/simple-llm-finetuner/issues/52
Try dtype=torch.float for the input and output.
You have quite a lot of code, which makes it hard to pinpoint the issue. If you can't make a minimal example, it will be hard to help.
Nonetheless, you enforce torch.half / torch.float16 in a few places throughout the code, but your inputs are float32 (as the MatMul8bitLt warning in your log also shows).
e.g.
self.model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=DEVICE_MAP,
    load_in_8bit=True,
    torch_dtype=torch.float16,
)
and
if (self.model_name.startswith('cerebras')):
    self.model.half()
I'm not familiar with how your model is supposed to work, nor where you got it from, but I would try not overriding those datatypes, or try explicitly setting them to torch.float32.
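For example, a minimal sketch of the float32 route could look like this. The model name and device map below are placeholders standing in for the MODEL and DEVICE_MAP values from your config.py, and load_in_8bit is dropped because the 8-bit path performs its matmuls in float16 (which is exactly what the MatMul8bitLt warning in your log says):

import torch
import transformers

model_name = "gpt2"  # placeholder; substitute the model from your config
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # stands in for DEVICE_MAP from config.py
    torch_dtype=torch.float32,  # keep the weights in float32 instead of forcing float16
)

Whether this fits in memory without 8-bit quantization depends on your GPU, so treat it as an experiment to confirm the dtype mismatch, not as the final configuration.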
If you need mixed-precision training to fit within your compute limits, take a look at PyTorch's Automatic Mixed Precision (torch.cuda.amp).
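For reference, a bare-bones AMP training step looks roughly like the following. This is a self-contained toy example (tiny linear model, random data), not a drop-in change to the transformers.Trainer used in your trainer.py:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(16, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))  # loss scaling for fp16

for _ in range(3):
    x = torch.randn(8, 16, device=device)
    y = torch.randn(8, 1, device=device)
    optimizer.zero_grad()
    # autocast runs eligible ops in float16 and keeps numerically sensitive ops in float32
    with torch.cuda.amp.autocast(enabled=(device == "cuda")):
        loss = torch.nn.functional.mse_loss(model(x), y)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)         # unscales gradients, then steps the optimizer
    scaler.update()

(With the transformers.Trainer in your code, fp16=True in TrainingArguments already enables this mechanism under the hood.)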