In [1]:
from datasets import Dataset, load_dataset
# TODO: settings such as max_seq_length are shared across cells as notebook-level globals; no cleaner alternative for now
In [2]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass
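Before loading the model it can be worth confirming what the install step actually detected. This is an optional sanity check added here for illustration, not part of the original notebook; it uses only standard PyTorch calls.

import torch
print("torch:", torch.__version__)
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    print("compute capability:", torch.cuda.get_device_capability())
    print("bfloat16 supported:", torch.cuda.is_bf16_supported())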
In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 25536 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-2-13b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.381 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: unsloth/llama-2-13b can only handle sequence lengths of at most 4096. But with kaiokendev's RoPE scaling of 6.234, it can be magically be extended to 25536!
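The RoPE scaling factor Unsloth reports is just the requested context length divided by the model's native 4096-token limit. A quick, illustrative check of the arithmetic (not part of the original notebook), assuming the linear scaling the message describes:

native_context = 4096            # llama-2-13b's max_position_embeddings
requested_context = 25536        # the max_seq_length chosen above
print(requested_context / native_context)   # 6.234375, matching the reported 6.234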
In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.
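The trainable-parameter count implied by r = 16 over 40 patched layers can be verified directly. An optional check, not part of the original notebook; `print_trainable_parameters` is PEFT's own helper.

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable LoRA parameters: {trainable:,}")   # expected: 62,586,880 (see the trainer banner below)
model.print_trainable_parameters()                   # PEFT's built-in summary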
In [5]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass
from datasets import load_dataset
dataset = load_dataset("Yukang/LongAlpaca-12k", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
Repo card metadata block was not found. Setting CardData to empty.
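A quick way to confirm the template and the trailing EOS token were applied is to look at one mapped example. This is an optional spot check added for illustration, not part of the original notebook.

sample = dataset[0]["text"]
print(sample[:300])      # should start with the Alpaca instruction header
print("...")
print(sample[-80:])      # should end with tokenizer.eos_token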
In [6]:
print(model)
PeftModelForCausalLM( (base_model): LoraModel( (model): LlamaForCausalLM( (model): LlamaModel( (embed_tokens): Embedding(32000, 5120) (layers): ModuleList( (0-39): 40 x LlamaDecoderLayer( (self_attn): LlamaAttention( (q_proj): lora.Linear( (base_layer): Linear(in_features=5120, out_features=5120, bias=False) (lora_dropout): ModuleDict( (default): Identity() ) (lora_A): ModuleDict( (default): Linear(in_features=5120, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=5120, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (k_proj): lora.Linear( (base_layer): Linear(in_features=5120, out_features=5120, bias=False) (lora_dropout): ModuleDict( (default): Identity() ) (lora_A): ModuleDict( (default): Linear(in_features=5120, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=5120, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (v_proj): lora.Linear( (base_layer): Linear(in_features=5120, out_features=5120, bias=False) (lora_dropout): ModuleDict( (default): Identity() ) (lora_A): ModuleDict( (default): Linear(in_features=5120, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=5120, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (o_proj): lora.Linear( (base_layer): Linear(in_features=5120, out_features=5120, bias=False) (lora_dropout): ModuleDict( (default): Identity() ) (lora_A): ModuleDict( (default): Linear(in_features=5120, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=5120, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (rotary_emb): LlamaLinearScalingRotaryEmbedding() ) (mlp): LlamaMLP( (gate_proj): lora.Linear( (base_layer): Linear(in_features=5120, out_features=13824, bias=False) (lora_dropout): ModuleDict( (default): Identity() ) (lora_A): ModuleDict( (default): Linear(in_features=5120, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=13824, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (up_proj): lora.Linear( (base_layer): Linear(in_features=5120, out_features=13824, bias=False) (lora_dropout): ModuleDict( (default): Identity() ) (lora_A): ModuleDict( (default): Linear(in_features=5120, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=13824, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (down_proj): lora.Linear( (base_layer): Linear(in_features=13824, out_features=5120, bias=False) (lora_dropout): ModuleDict( (default): Identity() ) (lora_A): ModuleDict( (default): Linear(in_features=13824, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=5120, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (act_fn): SiLU() ) (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05) (post_attention_layernorm): LlamaRMSNorm((5120,), eps=1e-05) ) ) (norm): LlamaRMSNorm((5120,), eps=1e-05) (rotary_emb): 
LlamaRotaryEmbedding() ) (lm_head): Linear(in_features=5120, out_features=32000, bias=False) ) ) )
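The repr above shows rank-16 lora.Linear wrappers on every targeted projection. A short optional check (standard PyTorch module iteration, not part of the original notebook) that the adapters landed exactly where target_modules asked:

lora_modules = {name.rsplit(".", 2)[-2] for name, module in model.named_modules()
                if name.endswith("lora_A")}
print(sorted(lora_modules))   # expect: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj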
In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
max_seq_length = 25536
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 4,
        max_steps = 2, # Use num_train_epochs=1 instead. We use this to make training faster
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
max_steps is given, it will override any value given in num_train_epochs
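With these arguments each optimizer step covers per_device_train_batch_size × gradient_accumulation_steps = 8 sequences, and each micro-batch of 2 sequences at the full 25536-token context is what must fit in GPU memory at once. The arithmetic, as a rough sketch (not part of the original notebook):

per_device_batch = 2
grad_accum = 4
effective_batch = per_device_batch * grad_accum                          # 8 sequences per optimizer step
print("tokens per optimizer step:", effective_batch * max_seq_length)    # 204,288
print("tokens per micro-batch:   ", per_device_batch * max_seq_length)   # 51,072 held in memory at once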
In [8]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
GPU = NVIDIA A100-PCIE-40GB. Max memory = 39.381 GB.
25.162 GB of memory reserved.
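Roughly 25 GB of the 40 GB card is already reserved before the first training step, leaving little headroom for 25k-token activations. For a closer look at the allocator, PyTorch's built-in reporting can be used; an optional sketch, not part of the original notebook:

print(f"allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
print(torch.cuda.memory_summary(abbreviated=True))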
In [9]:
trainer_stats = trainer.train()
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 12,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2
 "-____-"     Number of trainable parameters = 62,586,880
---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
Cell In[9], line 1
----> 1 trainer_stats = trainer.train()

File <string>:142, in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
File <string>:363, in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3318, in Trainer.training_step(self, model, inputs)
-> 3318     loss = self.compute_loss(model, inputs)

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3363, in Trainer.compute_loss(self, model, inputs, return_outputs)
-> 3363     outputs = model(**inputs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1553     return self._call_impl(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
-> 1562     return forward_call(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:820, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
--> 820     return model_forward(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:808, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
--> 808     return convert_to_fp32(self.model_forward(*args, **kwargs))

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:787, in convert_to_fp32(tensor)
--> 787     return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:119, in recursively_apply(func, data, test_type, error_on_other_type, *args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:120, in <dictcomp>(.0)
File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:127, in recursively_apply(func, data, test_type, error_on_other_type, *args, **kwargs)
--> 127     return func(data, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:779, in convert_to_fp32.<locals>._convert_to_fp32(tensor)
--> 779     return tensor.float()

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.09 GiB. GPU 0 has a total capacity of 39.38 GiB of which 2.92 GiB is free. Process 2932026 has 36.42 GiB memory in use. Of the allocated memory 29.25 GiB is allocated by PyTorch, and 6.66 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
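The run fails while accelerate converts the model outputs back to fp32: the 6.09 GiB allocation matches the fp32 logits tensor for this batch (2 sequences × 25536 tokens × 32000 vocab × 4 bytes ≈ 6.09 GiB), and only about 2.9 GiB is free. None of the following is from the original notebook; it is one hedged sketch of how a retry might be configured on the same 40 GB card, with illustrative values (shorter context, actually loading in 4-bit, and the allocator flag the error message suggests, which must be set before CUDA is initialized, i.e. in a fresh process).

# Not part of the original run: illustrative settings for a retry that fits in 40 GB.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # must be set before CUDA init

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-2-13b-bnb-4bit",
    max_seq_length = 8192,     # illustrative: shorter than the 25536 used above
    dtype = None,
    load_in_4bit = True,       # actually use the 4-bit weights this time
)
# Lowering per_device_train_batch_size to 1 in TrainingArguments would shrink the logits tensor further.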
In [ ]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")