Author: FLOCKAH
Here's a quick cheatsheet for model sizes and their resource requirements:
Assuming we only load the weights of the model:
Model Size | Full Precision (32-bit) | Half Precision (16-bit) | Quarter Precision (8-bit) | 4-bit Precision |
---|---|---|---|---|
13B | ~52 GB VRAM (13×4) and 60 GB RAM | ~26 GB VRAM (13×2) | ~13 GB VRAM | ~6.5 GB VRAM, 30 GB RAM |
7B | ~28 GB VRAM (7×4) and 30 GB RAM | ~14 GB VRAM (7×2) | ~7 GB VRAM | ~3.5 GB VRAM, 15 GB RAM |
3B | ~12 GB VRAM (3×4) and 12 GB RAM | ~6 GB VRAM (3×2) | ~3 GB VRAM | ~1.5 GB VRAM, 6 GB RAM |
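The rule of thumb behind the table is that the weights alone take roughly parameters × bits / 8 bytes. A minimal back-of-the-envelope sketch of that estimate (it ignores activations, optimizer states and framework overhead):
def approx_weight_vram_gb(params_billions: float, bits: int) -> float:
    """Rough size of the weights alone, ignoring activations, optimizer states and overhead."""
    return params_billions * bits / 8

for bits in (32, 16, 8, 4):
    print(f"13B model at {bits}-bit: ~{approx_weight_vram_gb(13, bits):.1f} GB")  # 52, 26, 13, 6.5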
Here are widely-used fine-tuning and quantization techniques:
LongLoRA - Long-context (100k+ tokens!) training method. Helpful when we train the model for summarization and long-form writing tasks
LoRA (Low-Rank Adaptation) - Focuses on training small adapter matrices in targeted modules
QLoRA - Combines LoRA with 4-bit quantization of the frozen base weights for optimized GPU usage
GPTQ - Quantizes the model weights post-training, with no further training or tuning. It earns its place on this list because it is extremely helpful when we want to load a bigger model for inference right away, or when we choose to quantize ourselves instead of relying on QLoRA (see the sketch after this list)
GGUF - Enables running large models on CPU through quantization
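As a quick illustration of the GPTQ entry above, a checkpoint that someone has already quantized with GPTQ can be loaded directly for inference with transformers, provided optimum and auto-gptq are installed (the repository name below is only an example, not a requirement):
from transformers import AutoModelForCausalLM, AutoTokenizer

gptq_repo = "TheBloke/Llama-2-13B-GPTQ"  # example pre-quantized checkpoint
tokenizer = AutoTokenizer.from_pretrained(gptq_repo)
model = AutoModelForCausalLM.from_pretrained(gptq_repo, device_map="auto")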
An example dataset entry looks like this:
[
    {
        "input": "Need a quick Python script for checking if a word is a Palindrome",
        "output": "Sure… {code}"
    }
]
# Names of entries do not matter, they can be named "sol_1" and "gol_2"
# Can also be multiple: "input", "instruction", "output", "context", "text"
# The key is to recognize which field is 'User' (Question) and 'AI' (Answer)
Including diverse topics requires ample data to ensure effective training across all areas
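For illustration, here is one way to flatten such a record into a single training string; the build_text helper and the prompt template are an assumed example, so match the field names to your own dataset:
def build_text(example):
    # Hypothetical template; rename 'input'/'output' to whatever your dataset uses
    return {"text": f"### User:\n{example['input']}\n### Assistant:\n{example['output']}"}

# dataset = dataset.map(build_text)  # with a Hugging Face datasets.Dataset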
# Here is your training script
training_args = TrainingArguments(
    output_dir = ...,
    logging_dir = ...,
    evaluation_strategy = "steps",
    do_eval = True,
    eval_steps = 10,       # Run a validation pass every 10 steps
    save_steps = 100,      # With max_steps = 100 this only fires at the very last step
    max_steps = 100,       # ('num_steps' is not a valid argument name)
    logging_steps = 10,
    # num_train_epochs=12, # Commented out just in case; max_steps overrides it anyway
    # other arguments
)
# install necessary modules
Terminal:
pip install -q -U bitsandbytes auto-gptq sentencepiece transformers peft accelerate datasets trl einops GPUtil huggingface_hub tensorboard scipy protobuf
Notebook:
!pip install -q -U bitsandbytes auto-gptq sentencepiece transformers peft accelerate datasets trl einops GPUtil huggingface_hub tensorboard scipy protobuf
If any of the modules cannot be imported using import module_name, remove '-q' from the pip install command above to see the full installation output and any errors.
# Import necessary modules
import GPUtil
import os
import time
import torch
import threading
import sys
from transformers import (
AutoModelForCausalLM, # If other than llama
LlamaForCausalLM, # Only if llama
AutoTokenizer, # Downloads the tokenizer that matches the model
BitsAndBytesConfig, # Quantization
TrainingArguments,
Trainer, # If not quantized
LlamaTokenizer, # Only if llama
)
# Name of the model for training, Llama-2 as example
model_name = "meta-llama/Llama-2-13b-hf"
# In this example we will quantize the model to 4-bit precision:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type= "nf4", # For 4-bit quantization
bnb_4bit_compute_dtype = compute_dtype,
bnb_4bit_use_double_quant = False # "This flag is used for nested quantization where the quantization constants from the first quantization are quantized again" source: https://huggingface.co/docs/transformers/main_classes/quantization
)
# Now load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right' # 'left' may cause model to generate nonsense output as of 03.11.2023
# pad_token_id and use_cache are set on model.config right after the model is loaded below
# Finally, download and load the model:
max_retries = 3 # Number of retries before giving up
retry_delay = 5 # Delay between retries in seconds
# Retry downloading of the model on fail
for i in range(max_retries):
    try:
        # If you don't train llama then use AutoModelForCausalLM
        model = LlamaForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map={"": 0},       # Place the whole model on cuda:0 for single-GPU training
            trust_remote_code=False,  # Only set True for models that ship custom code on the Hub; FlashAttention2 (~15-20% less GPU consumption) is enabled separately, e.g. attn_implementation="flash_attention_2" on Ampere and newer GPUs
        )
        model.config.pretraining_tp = 2  # >1 uses a slower but more exact linear-layer computation that matches Llama-2's tensor-parallel pretraining
        model.config.pad_token_id = tokenizer.eos_token_id  # Set here, after loading: pad with the EOS token id
        model.config.use_cache = False  # Disable the KV cache for training; set True for inference
        break  # Exit the loop if the model is successfully downloaded
    except Exception as e:
        print(f"An exception occurred: {e}")
        if i < max_retries - 1:
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            print("Max retries reached. Exiting.")
            raise  # Re-raise the last exception to exit the script
from peft import AutoPeftModelForCausalLM, LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
lora_alpha=64,
lora_dropout=0.01,
r=16,
bias="none",
task_type="CAUSAL_LM",
target_modules= ["q_proj","v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, peft_config)
print(model)  # Prints all the modules; find the Linear4bit layers and target them (more is not always better)
model.print_trainable_parameters()  # Shows how many parameters LoRA will actually train
Where:
r is the rank (dimension) of the low-rank matrices,
lora_alpha is the scaling factor for the low-rank matrices,
lora_dropout is the dropout probability of the LoRA layers.
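For intuition, here is a minimal sketch of the math behind those numbers (illustrative only, not the peft internals): the frozen weight receives a low-rank update scaled by lora_alpha / r, while dropout is applied to the layer input during training and is omitted here.
import torch

d, k, r, lora_alpha = 4096, 4096, 16, 64      # example shapes matching the config above
W = torch.randn(d, k)                         # frozen pretrained weight (not trained)
A = torch.randn(r, k) * 0.01                  # trainable low-rank factor
B = torch.zeros(d, r)                         # trainable low-rank factor, initialized to zero
W_effective = W + (lora_alpha / r) * (B @ A)  # what the adapted layer effectively uses
print(W_effective.shape)                      # torch.Size([4096, 4096])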
# Optional helper: report GPU memory usage in the background during training
def monitor_gpu_usage():
    while True:
        gpu = GPUtil.getGPUs()[0]
        util = gpu.memoryUtil * 100
        free = gpu.memoryFree
        used = gpu.memoryUsed
        total = gpu.memoryTotal
        sys.stdout.write('\033[K')  # Clear to the end of line
        sys.stdout.write(f"\rGPU RAM Free: {free:.0f}MB | Used: {used:.0f}MB | Util {util:3.0f}% | Total {total:.0f}MB")
        sys.stdout.flush()
        time.sleep(5)
# Start GPU monitoring thread
gpu_thread = threading.Thread(target=monitor_gpu_usage)
gpu_thread.daemon = True
gpu_thread.start()
Lazy loading means data is loaded (supplied to the trainer) on the fly as needed rather than all at once at the beginning. This is beneficial when dealing with large datasets that do not fit into memory.
custom_data_collator takes a list of samples from the dataset and collates them into a single batch of tensors. This is a common requirement for batch processing in PyTorch, as samples need to be gathered together and converted to tensors of the same size.
from datasets import load_dataset
from torch.utils.data import Dataset, random_split
MAX_LENGTH = 512
def custom_data_collator(batch):
    input_ids = [item[0] for item in batch]
    attn_masks = [item[1] for item in batch]
    return {
        'input_ids': torch.stack(input_ids),
        'attention_mask': torch.stack(attn_masks),
        'labels': torch.stack(input_ids)  # For causal LM training the labels are the input ids themselves
    }

class OnTheFly(Dataset):  # Lazy loading: tokenizes samples only when the trainer requests them
    def __init__(self, txt_list, tokenizer):
        self.txt_list = txt_list
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.txt_list)

    def __getitem__(self, idx):
        txt = self.txt_list[idx]
        encodings_dict = self.tokenizer(txt, truncation=True, max_length=MAX_LENGTH, padding="max_length")
        input_ids = torch.tensor(encodings_dict['input_ids'])
        attn_masks = torch.tensor(encodings_dict['attention_mask'])
        return input_ids, attn_masks
To load the dataset we will use the standard load_dataset function from the datasets module. load_dataset pulls the dataset into memory, while OnTheFly assumes the texts are already there and only tokenizes each sample as the trainer asks for it.
# If you use a different dataset, remember to combine its fields into a single 'text' column, e.g.:
# .map(lambda example: {'text': example['instruction'] + ' ' + example['input'] + ' ' + example['output']})
training_data = load_dataset("flytech/llama-python-codes-30k", split='train')
texts = training_data['text']  # OnTheFly expects a list of strings, so pull out the 'text' column
# Initialize the dataset
dataset = OnTheFly(texts, tokenizer)
# Change these ratios as needed
train_ratio = 0.95
train_dataset, val_dataset = random_split(dataset, [int(train_ratio * len(dataset)), len(dataset) - int(train_ratio * len(dataset))])
# random_split randomly splits your dataset into training and validation sets; feel free to skip this entirely and supply the trainer with a different split or dataset, for example:
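A minimal alternative sketch, doing the split at the datasets level before wrapping in OnTheFly (the 0.05 test size and the seed are arbitrary example values):
split = training_data.train_test_split(test_size=0.05, seed=42)   # datasets' built-in splitter
train_dataset = OnTheFly(split["train"]["text"], tokenizer)
val_dataset = OnTheFly(split["test"]["text"], tokenizer)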
from transformers.integrations import TensorBoardCallback
from trl import SFTTrainer
training_arguments = TrainingArguments(
    output_dir="/content/Ruckus-13b-24",
    logging_dir="/content/Ruckus-13b-24",
    evaluation_strategy="steps",
    do_eval=True,                      # Perform evaluation (set False for no evaluation at all)
    save_total_limit=14,
    per_device_train_batch_size=16,    # a
    gradient_accumulation_steps=1,     # b
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    optim="adamw_bnb_8bit",
    save_steps=200,                    # Doesn't have to match eval and logging steps, but it's recommended
    logging_steps=200,                 # Same as eval_steps
    learning_rate=2e-4,                # Extremely important: controls how fast the model adapts to the data; a poor value can ruin the run
    eval_steps=200,                    # Evaluate on the validation dataset every 200 optimization steps
    eval_accumulation_steps=2,
    fp16=False,                        # Set True on pre-Ampere GPUs; prefer bf16 on A100 and newer for a slightly better outcome
    bf16=True,                         # Set True only on Ampere GPUs (A100) and newer
    #max_grad_norm=1.0,                # Gradient clipping
    #weight_decay=0.01,                # L2-style regularization on the weights
    #warmup_ratio=0.1,                 # Use along with a linear or cosine scheduler
    lr_scheduler_type="constant",
    save_safetensors=True,
    push_to_hub=True,
    hub_model_id="Your-AI-Name",
    hub_token="hf_herusygkj1823saddngb118",  # Fake token; create a Hugging Face account and go to Settings -> Access Tokens
    hub_strategy="checkpoint",         # Enable checkpoints on the Hub
    remove_unused_columns=True,
    dataloader_num_workers=4           # Utilize CPU cores for data loading
)
# The total batch size is always per_device_train_batch_size × gradient_accumulation_steps (a*b), here 16 × 1 = 16
tensorboard_callback = TensorBoardCallback() # <- Callback for pretty metrics
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=512,          # Max length of a dataset entry; anything longer is truncated to this length (in tokens)
    tokenizer=tokenizer,
    data_collator=custom_data_collator,
    packing=False,               # Don't pack multiple short samples into a single sequence
    callbacks=[tensorboard_callback]
)
trainer.train()
# or to resume from checkpoint:
trainer.train(
resume_from_checkpoint="/workspace/checkpoint-xxx"
)
# First let's save the model's weights:
output_dir = os.path.join("/workspace/MyAi-13b-4bit", "final_checkpoint")
trainer.model.save_pretrained(output_dir)
trainer.save_model("/workspace/MyAi-13b-4bit")
# safe_serialization will save model in safetensors format
trainer.model.save_pretrained(output_dir, safe_serialization=True)
# torch.save(model.state_dict(), "/workspace/MyAi-13b-4bit/MyAi-13b-4bit.pth") # Save the model weights in .pth if you like to
# Free the GPU memory held by the training model before reloading the merged one
del model
torch.cuda.empty_cache()
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# Sanity check: the model should not contain meta tensors before merging
if any(p.is_meta for p in model.parameters()):
    raise RuntimeError("Model still has parameters on the meta device; reload it onto a real device before merging")
# Merge and unload the model
model = model.merge_and_unload()
# Save the merged model
model.save_pretrained(output_dir)
hub_name = 'MyAi-13b-4bit'
model.push_to_hub(hub_name)
tokenizer.push_to_hub(hub_name)
!pip install flask_ngrok
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
import torch
import GPUtil
import time
import threading
import sys
from flask import Flask, request
from flask_ngrok import run_with_ngrok
# Function to monitor GPU usage
def monitor_gpu_usage():
    while True:
        gpu = GPUtil.getGPUs()[0]
        util = gpu.memoryUtil * 100
        free = gpu.memoryFree
        used = gpu.memoryUsed
        total = gpu.memoryTotal
        sys.stdout.write('\033[K')  # Clear to the end of line
        sys.stdout.write(f"\rGPU RAM Free: {free:.0f}MB | Used: {used:.0f}MB | Util {util:3.0f}% | Total {total:.0f}MB")
        sys.stdout.flush()
        time.sleep(2)
# Start the GPU monitoring thread
gpu_thread = threading.Thread(target=monitor_gpu_usage)
gpu_thread.daemon = True
gpu_thread.start()
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("flytech/Ruckus-PyAssi-13b")
device = "cuda:0"
model = AutoModelForCausalLM.from_pretrained("flytech/Ruckus-PyAssi-13b", load_in_4bit=True, device_map="auto", bnb_4bit_compute_dtype=torch.float16)
model.config.use_cache = True
# Define the generation configuration
generation_config = GenerationConfig(
temperature=0.95,
top_p=0.92,
# num_beams=4,
# no_repeat_ngram_size=4,
decoder_start_token_id=tokenizer.bos_token_id,
do_sample=True,
max_new_tokens=1024,
num_return_sequences=1,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
)
# Create Flask app and apply ngrok
app = Flask(__name__)
run_with_ngrok(app)
# Define route for generation
@app.route("/generate", methods=["POST"])
def generate():
    data = request.get_json()
    prompt = data.get("prompt", "")
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    outputs = model.generate(**inputs, generation_config=generation_config)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": generated_text}
if __name__ == '__main__':
app.run()
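Once the server is running, you can hit the endpoint from any HTTP client. A minimal client sketch with requests (the URL below assumes a local run; swap in the public ngrok URL that flask_ngrok prints at startup):
import requests

# Hypothetical local URL; replace with the ngrok URL printed when the app starts
resp = requests.post(
    "http://127.0.0.1:5000/generate",
    json={"prompt": "Write a Python function that checks if a word is a palindrome."},
)
print(resp.json()["generated_text"])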
import gc, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import GPUtil
import time
import threading
import sys
# Function to monitor GPU usage statistics
def monitor_gpu_usage():
    while True:
        gpu = GPUtil.getGPUs()[0]       # Get the first GPU
        util = gpu.memoryUtil * 100     # Calculate utilization percentage
        free = gpu.memoryFree           # Free memory in MB
        used = gpu.memoryUsed           # Used memory in MB
        total = gpu.memoryTotal         # Total memory in MB
        sys.stdout.write('\033[K')      # ANSI escape sequence to clear the line
        sys.stdout.write(f"\rGPU RAM Free: {free:.0f}MB | Used: {used:.0f}MB | Util {util:3.0f}% | Total {total:.0f}MB")
        sys.stdout.flush()              # Flush the standard output
        time.sleep(1)                   # Wait for 1 second before the next check
# Create and start the GPU monitoring thread
gpu_thread = threading.Thread(target=monitor_gpu_usage)
gpu_thread.daemon = True # Set the thread as a daemon
gpu_thread.start()
# Load the tokenizer and model (bloom-7b1 here as an example; substitute your own HF repository)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-7b1")
device = "cuda:0"  # Load on a single GPU
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-7b1",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,                    # Load the model in 4-bit
        bnb_4bit_compute_dtype=torch.float16  # Set compute data type to float16
    ),
    device_map="auto",        # Automatic GPU placement
    trust_remote_code=False,  # Not needed for bloom; only set True for repos that ship custom model code
)
model.config.use_cache = True  # Enable for inference (it was False during training)
# Infinite loop to continuously process input prompts
while True:
    text = input("Enter your prompt: ")
    inputs = tokenizer(text, return_tensors="pt")                         # Tokenize the input text
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}   # Move tensors to the defined device
    outputs = model.generate(
        **inputs,
        num_beams=4,                                    # Example beam search with 4 beams
        no_repeat_ngram_size=4,                         # Avoid repeating ngrams of size 4
        decoder_start_token_id=tokenizer.bos_token_id,  # Start token for decoding
        do_sample=False,                                # Disable random sampling to always get the same output for a given prompt
        max_new_tokens=160,                             # Maximum number of new tokens to generate
        num_return_sequences=1,                         # Return only one sequence
        eos_token_id=tokenizer.eos_token_id,            # End of sequence token ID
        pad_token_id=tokenizer.pad_token_id,            # Padding token ID
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)  # Decode the output tokens to a string
    print(generated_text)
    del inputs, outputs       # Delete variables to free memory
    torch.cuda.empty_cache()  # Empty the CUDA cache
    gc.collect()              # Run garbage collection