In [1]:
from datasets import Dataset, load_dataset
In [2]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for newer GPUs such as Ampere and Hopper (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass
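Which optional attention backends actually get installed depends on the GPU-capability branch above. A quick way to see what ended up importable; this illustrative cell is an addition and was not part of the original run:
In [ ]:
# Illustrative check (not part of the original run): report which optional
# attention backends are importable after the install step above.
import importlib.util
for pkg in ("flash_attn", "xformers"):
    print(pkg, "available:", importlib.util.find_spec(pkg) is not None)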
In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 25536 # Choose any! Unsloth auto-supports RoPE scaling internally.
dtype = None # None for auto detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = False # Set True to use 4-bit quantization and reduce memory usage.

# 4-bit pre-quantized models we support, for 4x faster downloads and no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-2-13b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.381 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: unsloth/llama-2-13b can only handle sequence lengths of at most 4096.
But with kaiokendev's RoPE scaling of 6.234, it can be magically be extended to 25536!
Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
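The RoPE scaling factor reported in the log above is simply the requested context length divided by Llama-2's native 4096-token window. A quick sanity check (an illustrative cell, not part of the original run):
In [ ]:
# Illustrative check: 25536 / 4096 ≈ 6.234, matching the RoPE scaling factor
# Unsloth reports in the log above.
native_context = 4096
print(round(max_seq_length / native_context, 3))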
In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0! Suggested values: 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.
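To see how small the LoRA update actually is, PEFT models expose print_trainable_parameters(). This illustrative cell was not part of the original run; its output should line up with the ~62.6M trainable-parameter figure the trainer prints later.
In [ ]:
# Illustrative cell: report how many parameters LoRA trains vs. the full model.
model.print_trainable_parameters()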
In [5]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("Yukang/LongAlpaca-12k", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
Repo card metadata block was not found. Setting CardData to empty.
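It is worth spot-checking one formatted example to confirm the Alpaca template and the trailing EOS token were applied as intended. This illustrative cell is an addition, not part of the original run:
In [ ]:
# Illustrative cell: inspect one formatted example and check the EOS token.
print(dataset[0]["text"][:500])
print("ends with EOS:", dataset[0]["text"].endswith(EOS_TOKEN))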
In [6]:
print(model)
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 5120)
        (layers): ModuleList(
          (0-39): 40 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (v_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (rotary_emb): LlamaLinearScalingRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=13824, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=13824, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (up_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=13824, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=13824, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (down_proj): lora.Linear(
                (base_layer): Linear(in_features=13824, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=13824, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (act_fn): SiLU()
            )
            (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
            (post_attention_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
          )
        )
        (norm): LlamaRMSNorm((5120,), eps=1e-05)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (lm_head): Linear(in_features=5120, out_features=32000, bias=False)
    )
  )
)
In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

max_seq_length = 25536

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Set True to pack short sequences; can make training up to 5x faster.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 4,
        max_steps = 2, # Demo setting to keep this run short; remove it and set num_train_epochs = 1 for a full run.
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
Map (num_proc=2):   0%|          | 0/12000 [00:00<?, ? examples/s]
max_steps is given, it will override any value given in num_train_epochs
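For context on the out-of-memory error that follows: with per_device_train_batch_size = 2 and gradient_accumulation_steps = 4, each optimizer step can see up to 8 sequences of up to 25,536 tokens. A rough back-of-the-envelope cell (illustrative, not part of the original run):
In [ ]:
# Illustrative estimate: effective batch size and worst-case tokens per step.
effective_batch = 2 * 4  # per_device_train_batch_size * gradient_accumulation_steps
print("effective batch size:", effective_batch)
print("worst-case tokens per optimizer step:", effective_batch * max_seq_length)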
In [8]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
GPU = NVIDIA A100-PCIE-40GB. Max memory = 39.381 GB.
25.162 GB of memory reserved.
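Note that "reserved" memory is what PyTorch's caching allocator holds on to, which can be noticeably larger than what tensors actually use. An illustrative comparison (not part of the original run):
In [ ]:
# Illustrative cell: allocated memory (tensors) vs. reserved memory (allocator cache).
print(round(torch.cuda.memory_allocated() / 1024**3, 3), "GB allocated")
print(round(torch.cuda.memory_reserved() / 1024**3, 3), "GB reserved")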
In [9]:
trainer_stats = trainer.train()
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 12,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2
 "-____-"     Number of trainable parameters = 62,586,880
[2/2, Epoch 0.00/1]
Step Training Loss

---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
Cell In[9], line 1
----> 1 trainer_stats = trainer.train()

File <string>:142, in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

File <string>:363, in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3318, in Trainer.training_step(self, model, inputs)
   3315     return loss_mb.reduce_mean().detach().to(self.args.device)
   3317 with self.compute_loss_context_manager():
-> 3318     loss = self.compute_loss(model, inputs)
   3320 del inputs
   3321 if (
   3322     self.args.torch_empty_cache_steps is not None
   3323     and self.state.global_step % self.args.torch_empty_cache_steps == 0
   3324 ):

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3363, in Trainer.compute_loss(self, model, inputs, return_outputs)
   3361 else:
   3362     labels = None
-> 3363 outputs = model(**inputs)
   3364 # Save past state if it exists
   3365 # TODO: this needs to be fixed and made cleaner later.
   3366 if self.args.past_index >= 0:

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
   1551     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1552 else:
-> 1553     return self._call_impl(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
   1557 # If we don't have any hooks, we want to skip the rest of the logic in
   1558 # this function, and just call forward.
   1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1560         or _global_backward_pre_hooks or _global_backward_hooks
   1561         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562     return forward_call(*args, **kwargs)
   1564 try:
   1565     result = None

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:820, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
    819 def forward(*args, **kwargs):
--> 820     return model_forward(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:808, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
    807 def __call__(self, *args, **kwargs):
--> 808     return convert_to_fp32(self.model_forward(*args, **kwargs))

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:787, in convert_to_fp32(tensor)
    781 def _is_fp16_bf16_tensor(tensor):
    782     return (is_torch_tensor(tensor) or hasattr(tensor, "dtype")) and tensor.dtype in (
    783         torch.float16,
    784         torch.bfloat16,
    785     )
--> 787 return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:119, in recursively_apply(func, data, test_type, error_on_other_type, *args, **kwargs)
    108     return honor_type(
    109         data,
    110         (
   (...)
    115         ),
    116     )
    117 elif isinstance(data, Mapping):
    118     return type(data)(
--> 119         {
    120             k: recursively_apply(
    121                 func, v, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
    122             )
    123             for k, v in data.items()
    124         }
    125     )
    126 elif test_type(data):
    127     return func(data, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:120, in <dictcomp>(.0)
    108     return honor_type(
    109         data,
    110         (
   (...)
    115         ),
    116     )
    117 elif isinstance(data, Mapping):
    118     return type(data)(
    119         {
--> 120             k: recursively_apply(
    121                 func, v, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
    122             )
    123             for k, v in data.items()
    124         }
    125     )
    126 elif test_type(data):
    127     return func(data, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:127, in recursively_apply(func, data, test_type, error_on_other_type, *args, **kwargs)
    118     return type(data)(
    119         {
    120             k: recursively_apply(
   (...)
    124         }
    125     )
    126 elif test_type(data):
--> 127     return func(data, *args, **kwargs)
    128 elif error_on_other_type:
    129     raise TypeError(
    130         f"Unsupported types ({type(data)}) passed to `{func.__name__}`. Only nested list/tuple/dicts of "
    131         f"objects that are valid for `{test_type.__name__}` should be passed."
    132     )

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:779, in convert_to_fp32.<locals>._convert_to_fp32(tensor)
    778 def _convert_to_fp32(tensor):
--> 779     return tensor.float()

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.09 GiB. GPU 0 has a total capacity of 39.38 GiB of which 2.92 GiB is free. Process 2932026 has 36.42 GiB memory in use. Of the allocated memory 29.25 GiB is allocated by PyTorch, and 6.66 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
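The run above exhausts GPU memory while Accelerate converts outputs back to fp32. The sketch below lists a few mitigations to try; the values are suggestions only, not a verified configuration from the original run:
In [ ]:
# Illustrative sketch, not a verified fix: options for fitting this run in 40 GB.
import os
# Suggested by the error message itself; note this must be set before CUDA is
# initialized, i.e. at the top of a fresh session.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Other levers (re-create the model and trainer after changing them):
#   load_in_4bit = True               # in FastLanguageModel.from_pretrained
#   max_seq_length = 8192             # smaller context for both the model and SFTTrainer
#   per_device_train_batch_size = 1   # with a larger gradient_accumulation_steps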
In [ ]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")